Updating branches/google/testing to r289206

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/testing@289396 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d99626..f209338 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -414,6 +414,9 @@
 set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING
   "Enable abi-breaking checks.  Can be WITH_ASSERTS, FORCE_ON or FORCE_OFF.")
 
+option(LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
+  "Disable abi-breaking checks mismatch detection at link-tim." OFF)
+
 option(LLVM_FORCE_USE_OLD_HOST_TOOLCHAIN
        "Set to ON to force using an old, unsupported host toolchain." OFF)
 
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 3e37e58..71c1af5 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -78,6 +78,15 @@
 
 check_include_file(mach/mach.h HAVE_MACH_MACH_H)
 check_include_file(histedit.h HAVE_HISTEDIT_H)
+check_include_file(CrashReporterClient.h HAVE_CRASHREPORTERCLIENT_H)
+if(APPLE)
+  include(CheckCSourceCompiles)
+  CHECK_C_SOURCE_COMPILES("
+     static const char *__crashreporter_info__ = 0;
+     asm(\".desc ___crashreporter_info__, 0x10\");
+     int main() { return 0; }"
+    HAVE_CRASHREPORTER_INFO)
+endif()
 
 # library checks
 if( NOT PURE_WINDOWS )
@@ -164,6 +173,9 @@
 if( HAVE_SYS_UIO_H )
   check_symbol_exists(writev sys/uio.h HAVE_WRITEV)
 endif()
+set(CMAKE_REQUIRED_DEFINITIONS "-D_LARGEFILE64_SOURCE")
+check_symbol_exists(lseek64 "sys/types.h;unistd.h" HAVE_LSEEK64)
+set(CMAKE_REQUIRED_DEFINITIONS "")
 check_symbol_exists(mallctl malloc_np.h HAVE_MALLCTL)
 check_symbol_exists(mallinfo malloc.h HAVE_MALLINFO)
 check_symbol_exists(malloc_zone_statistics malloc/malloc.h
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 0d2fe37..3f494b8 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -1290,14 +1290,34 @@
 
 function(add_llvm_tool_symlink link_name target)
   cmake_parse_arguments(ARG "ALWAYS_GENERATE" "OUTPUT_DIR" "" ${ARGN})
+  # This got a bit gross... For multi-configuration generators the target
+  # properties return the resolved value of the string, not the build system
+  # expression. To reconstruct the platform-agnostic path we have to do some
+  # magic. First we grab one of the types, and a type-specific path. Then from
+  # the type-specific path we find the last occurrence of the type in the path,
+  # and replace it with CMAKE_CFG_INTDIR. This allows the build step to be type
+  # agnostic again. 
   if(NOT ARG_OUTPUT_DIR)
+    if(CMAKE_CONFIGURATION_TYPES)
+      list(GET CMAKE_CONFIGURATION_TYPES 0 first_type)
+      string(TOUPPER ${first_type} first_type_upper)
+      set(first_type_suffix _${first_type_upper})
+    endif()
     get_target_property(target_type ${target} TYPE)
     if(${target_type} STREQUAL "STATIC_LIBRARY")
-      get_target_property(ARG_OUTPUT_DIR ${target} ARCHIVE_OUTPUT_DIRECTORY)
+      get_target_property(ARG_OUTPUT_DIR ${target} ARCHIVE_OUTPUT_DIRECTORY${first_type_suffix})
     elseif(UNIX AND ${target_type} STREQUAL "SHARED_LIBRARY")
-      get_target_property(ARG_OUTPUT_DIR ${target} LIBRARY_OUTPUT_DIRECTORY)
+      get_target_property(ARG_OUTPUT_DIR ${target} LIBRARY_OUTPUT_DIRECTORY${first_type_suffix})
     else()
-      get_target_property(ARG_OUTPUT_DIR ${target} RUNTIME_OUTPUT_DIRECTORY)
+      get_target_property(ARG_OUTPUT_DIR ${target} RUNTIME_OUTPUT_DIRECTORY${first_type_suffix})
+    endif()
+    if(CMAKE_CONFIGURATION_TYPES)
+      string(FIND "${ARG_OUTPUT_DIR}" "/${first_type}/" type_start REVERSE)
+      string(SUBSTRING "${ARG_OUTPUT_DIR}" 0 ${type_start} path_prefix)
+      string(SUBSTRING "${ARG_OUTPUT_DIR}" ${type_start} -1 path_suffix)
+      string(REPLACE "/${first_type}/" "/${CMAKE_CFG_INTDIR}/"
+             path_suffix ${path_suffix})
+      set(ARG_OUTPUT_DIR ${path_prefix}${path_suffix})
     endif()
   endif()
 
diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index a366299..9682002 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake
@@ -4,17 +4,15 @@
 
 include(LLVMExternalProjectUtils)
 
+if(LLVM_MAIN_INCLUDE_DIR)
+  set(LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_INCLUDE_DIR})
+endif()
+
 function(tablegen project ofn)
   # Validate calling context.
-  foreach(v
-      ${project}_TABLEGEN_EXE
-      LLVM_MAIN_SRC_DIR
-      LLVM_MAIN_INCLUDE_DIR
-      )
-    if(NOT ${v})
-      message(FATAL_ERROR "${v} not set")
-    endif()
-  endforeach()
+  if(NOT ${project}_TABLEGEN_EXE)
+    message(FATAL_ERROR "${project}_TABLEGEN_EXE not set")
+  endif()
 
   file(GLOB local_tds "*.td")
   file(GLOB_RECURSE global_tds "${LLVM_MAIN_INCLUDE_DIR}/llvm/*.td")
@@ -28,7 +26,7 @@
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp
     # Generate tablegen output in a temporary file.
     COMMAND ${${project}_TABLEGEN_EXE} ${ARGN} -I ${CMAKE_CURRENT_SOURCE_DIR}
-    -I ${LLVM_MAIN_SRC_DIR}/lib/Target -I ${LLVM_MAIN_INCLUDE_DIR}
+    ${LLVM_TABLEGEN_FLAGS} 
     ${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
     -o ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp
     # The file in LLVM_TARGET_DEFINITIONS may be not in the current
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 33cf6ad..4dd7157 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -4589,6 +4589,25 @@
     !2 = !{ i8 0, i8 2, i8 3, i8 6 }
     !3 = !{ i8 -2, i8 0, i8 3, i8 6 }
 
+'``absolute_symbol``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``absolute_symbol`` metadata may be attached to a global variable
+declaration. It marks the declaration as a reference to an absolute symbol,
+which causes the backend to use absolute relocations for the symbol even
+in position-independent code, and expresses the possible ranges of the
+global variable's *address* (not its value), in the same format as
+``range`` metadata.
+
+Example:
+
+.. code-block:: llvm
+
+    @a = external global i8, !absolute_symbol !0 ; Absolute symbol in range [0,256)
+
+    ...
+    !0 = !{ i64 0, i64 256 }
+
 '``unpredictable``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/examples/HowToUseJIT/CMakeLists.txt b/examples/HowToUseJIT/CMakeLists.txt
index a344ad0..e86626d 100644
--- a/examples/HowToUseJIT/CMakeLists.txt
+++ b/examples/HowToUseJIT/CMakeLists.txt
@@ -2,7 +2,6 @@
   Core
   ExecutionEngine
   Interpreter
-  MC
   Support
   nativecodegen
   )
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index 6614371..cf3756d 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -21,6 +21,7 @@
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
+#include <utility>
 
 namespace llvm {
 
@@ -45,14 +46,13 @@
     BitWord *WordRef;
     unsigned BitPos;
 
-    reference();  // Undefined
-
   public:
     reference(BitVector &b, unsigned Idx) {
       WordRef = &b.Bits[Idx / BITWORD_SIZE];
       BitPos = Idx % BITWORD_SIZE;
     }
 
+    reference() = delete;
     reference(const reference&) = default;
 
     reference &operator=(reference t) {
diff --git a/include/llvm/ADT/ImmutableList.h b/include/llvm/ADT/ImmutableList.h
index a1d26bd..e5f51ba 100644
--- a/include/llvm/ADT/ImmutableList.h
+++ b/include/llvm/ADT/ImmutableList.h
@@ -16,8 +16,9 @@
 
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/DataTypes.h"
 #include <cassert>
+#include <cstdint>
+#include <new>
 
 namespace llvm {
 
@@ -25,18 +26,18 @@
 
 template <typename T>
 class ImmutableListImpl : public FoldingSetNode {
+  friend class ImmutableListFactory<T>;
+
   T Head;
   const ImmutableListImpl* Tail;
 
   ImmutableListImpl(const T& head, const ImmutableListImpl* tail = nullptr)
     : Head(head), Tail(tail) {}
 
-  friend class ImmutableListFactory<T>;
-
-  void operator=(const ImmutableListImpl&) = delete;
-  ImmutableListImpl(const ImmutableListImpl&) = delete;
-
 public:
+  ImmutableListImpl(const ImmutableListImpl &) = delete;
+  ImmutableListImpl &operator=(const ImmutableListImpl &) = delete;
+
   const T& getHead() const { return Head; }
   const ImmutableListImpl* getTail() const { return Tail; }
 
@@ -79,15 +80,17 @@
   }
 
   class iterator {
-    const ImmutableListImpl<T>* L;
+    const ImmutableListImpl<T>* L = nullptr;
+
   public:
-    iterator() : L(nullptr) {}
+    iterator() = default;
     iterator(ImmutableList l) : L(l.getInternalPointer()) {}
 
     iterator& operator++() { L = L->getTail(); return *this; }
     bool operator==(const iterator& I) const { return L == I.L; }
     bool operator!=(const iterator& I) const { return L != I.L; }
     const value_type& operator*() const { return L->getHead(); }
+
     ImmutableList getList() const { return L; }
   };
 
@@ -121,7 +124,7 @@
 
   /// getHead - Returns the head of the list.
   const T& getHead() {
-    assert (!isEmpty() && "Cannot get the head of an empty list.");
+    assert(!isEmpty() && "Cannot get the head of an empty list.");
     return X->getHead();
   }
 
@@ -145,7 +148,7 @@
   uintptr_t Allocator;
 
   bool ownsAllocator() const {
-    return Allocator & 0x1 ? false : true;
+    return (Allocator & 0x1) == 0;
   }
 
   BumpPtrAllocator& getAllocator() const {
@@ -203,18 +206,21 @@
 //===----------------------------------------------------------------------===//
 
 template<typename T> struct DenseMapInfo;
-template<typename T> struct DenseMapInfo<ImmutableList<T> > {
+template<typename T> struct DenseMapInfo<ImmutableList<T>> {
   static inline ImmutableList<T> getEmptyKey() {
     return reinterpret_cast<ImmutableListImpl<T>*>(-1);
   }
+
   static inline ImmutableList<T> getTombstoneKey() {
     return reinterpret_cast<ImmutableListImpl<T>*>(-2);
   }
+
   static unsigned getHashValue(ImmutableList<T> X) {
     uintptr_t PtrVal = reinterpret_cast<uintptr_t>(X.getInternalPointer());
     return (unsigned((uintptr_t)PtrVal) >> 4) ^
            (unsigned((uintptr_t)PtrVal) >> 9);
   }
+
   static bool isEqual(ImmutableList<T> X1, ImmutableList<T> X2) {
     return X1 == X2;
   }
@@ -222,8 +228,8 @@
 
 template <typename T> struct isPodLike;
 template <typename T>
-struct isPodLike<ImmutableList<T> > { static const bool value = true; };
+struct isPodLike<ImmutableList<T>> { static const bool value = true; };
 
-} // end llvm namespace
+} // end namespace llvm
 
 #endif // LLVM_ADT_IMMUTABLELIST_H
diff --git a/include/llvm/ADT/ImmutableMap.h b/include/llvm/ADT/ImmutableMap.h
index 7480cd7..f197d40 100644
--- a/include/llvm/ADT/ImmutableMap.h
+++ b/include/llvm/ADT/ImmutableMap.h
@@ -14,7 +14,10 @@
 #ifndef LLVM_ADT_IMMUTABLEMAP_H
 #define LLVM_ADT_IMMUTABLEMAP_H
 
+#include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/ImmutableSet.h"
+#include "llvm/Support/Allocator.h"
+#include <utility>
 
 namespace llvm {
 
@@ -56,7 +59,7 @@
 };
 
 template <typename KeyT, typename ValT,
-          typename ValInfo = ImutKeyValueInfo<KeyT,ValT> >
+          typename ValInfo = ImutKeyValueInfo<KeyT,ValT>>
 class ImmutableMap {
 public:
   typedef typename ValInfo::value_type      value_type;
@@ -106,6 +109,9 @@
     Factory(BumpPtrAllocator &Alloc, bool canonicalize = true)
         : F(Alloc), Canonicalize(canonicalize) {}
 
+    Factory(const Factory &) = delete;
+    Factory &operator=(const Factory &) = delete;
+
     ImmutableMap getEmptyMap() { return ImmutableMap(F.getEmptyTree()); }
 
     ImmutableMap add(ImmutableMap Old, key_type_ref K, data_type_ref D) {
@@ -121,10 +127,6 @@
     typename TreeTy::Factory *getTreeFactory() const {
       return const_cast<typename TreeTy::Factory *>(&F);
     }
-
-  private:
-    Factory(const Factory& RHS) = delete;
-    void operator=(const Factory& RHS) = delete;
   };
 
   bool contains(key_type_ref K) const {
@@ -203,9 +205,10 @@
   //===--------------------------------------------------===//
 
   class iterator : public ImutAVLValueIterator<ImmutableMap> {
+    friend class ImmutableMap;
+
     iterator() = default;
     explicit iterator(TreeTy *Tree) : iterator::ImutAVLValueIterator(Tree) {}
-    friend class ImmutableMap;
 
   public:
     key_type_ref getKey() const { return (*this)->first; }
@@ -248,7 +251,7 @@
 
 // NOTE: This will possibly become the new implementation of ImmutableMap some day.
 template <typename KeyT, typename ValT,
-typename ValInfo = ImutKeyValueInfo<KeyT,ValT> >
+typename ValInfo = ImutKeyValueInfo<KeyT,ValT>>
 class ImmutableMapRef {
 public:
   typedef typename ValInfo::value_type      value_type;
@@ -362,9 +365,10 @@
   //===--------------------------------------------------===//
 
   class iterator : public ImutAVLValueIterator<ImmutableMapRef> {
+    friend class ImmutableMapRef;
+
     iterator() = default;
     explicit iterator(TreeTy *Tree) : iterator::ImutAVLValueIterator(Tree) {}
-    friend class ImmutableMapRef;
 
   public:
     key_type_ref getKey() const { return (*this)->first; }
diff --git a/include/llvm/ADT/PackedVector.h b/include/llvm/ADT/PackedVector.h
index 0926717..8f925f1 100644
--- a/include/llvm/ADT/PackedVector.h
+++ b/include/llvm/ADT/PackedVector.h
@@ -15,6 +15,7 @@
 #define LLVM_ADT_PACKEDVECTOR_H
 
 #include "llvm/ADT/BitVector.h"
+#include <cassert>
 #include <limits>
 
 namespace llvm {
@@ -83,14 +84,15 @@
     PackedVector &Vec;
     const unsigned Idx;
 
-    reference(); // Undefined
   public:
+    reference() = delete;
     reference(PackedVector &vec, unsigned idx) : Vec(vec), Idx(idx) {}
 
     reference &operator=(T val) {
       Vec.setValue(Vec.Bits, Idx, val);
       return *this;
     }
+
     operator T() const {
       return Vec.getValue(Vec.Bits, Idx);
     }
@@ -144,6 +146,6 @@
 // Leave BitNum=0 undefined.
 template <typename T> class PackedVector<T, 0>;
 
-} // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_ADT_PACKEDVECTOR_H
diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h
index 4af3d6d..ad805b0 100644
--- a/include/llvm/ADT/ScopedHashTable.h
+++ b/include/llvm/ADT/ScopedHashTable.h
@@ -32,7 +32,10 @@
 #define LLVM_ADT_SCOPEDHASHTABLE_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/Support/Allocator.h"
+#include <cassert>
+#include <new>
 
 namespace llvm {
 
@@ -46,6 +49,7 @@
   ScopedHashTableVal *NextForKey;
   K Key;
   V Val;
+
   ScopedHashTableVal(const K &key, const V &val) : Key(key), Val(val) {}
 
 public:
@@ -89,11 +93,11 @@
   /// LastValInScope - This is the last value that was inserted for this scope
   /// or null if none have been inserted yet.
   ScopedHashTableVal<K, V> *LastValInScope;
-  void operator=(ScopedHashTableScope &) = delete;
-  ScopedHashTableScope(ScopedHashTableScope &) = delete;
 
 public:
   ScopedHashTableScope(ScopedHashTable<K, V, KInfo, AllocatorTy> &HT);
+  ScopedHashTableScope(ScopedHashTableScope &) = delete;
+  ScopedHashTableScope &operator=(ScopedHashTableScope &) = delete;
   ~ScopedHashTableScope();
 
   ScopedHashTableScope *getParentScope() { return PrevScope; }
@@ -101,6 +105,7 @@
 
 private:
   friend class ScopedHashTable<K, V, KInfo, AllocatorTy>;
+
   ScopedHashTableVal<K, V> *getLastValInScope() {
     return LastValInScope;
   }
@@ -150,19 +155,20 @@
   typedef unsigned size_type;
 
 private:
+  friend class ScopedHashTableScope<K, V, KInfo, AllocatorTy>;
+
   typedef ScopedHashTableVal<K, V> ValTy;
   DenseMap<K, ValTy*, KInfo> TopLevelMap;
-  ScopeTy *CurScope;
+  ScopeTy *CurScope = nullptr;
 
   AllocatorTy Allocator;
 
-  ScopedHashTable(const ScopedHashTable &); // NOT YET IMPLEMENTED
-  void operator=(const ScopedHashTable &);  // NOT YET IMPLEMENTED
-  friend class ScopedHashTableScope<K, V, KInfo, AllocatorTy>;
-
 public:
-  ScopedHashTable() : CurScope(nullptr) {}
+  ScopedHashTable() = default;
+  ScopedHashTable(AllocatorTy A) : CurScope(nullptr), Allocator(A) {}
+  ScopedHashTable(const ScopedHashTable &) = delete;
+  ScopedHashTable &operator=(const ScopedHashTable &) = delete;
+
   ~ScopedHashTable() {
     assert(!CurScope && TopLevelMap.empty() && "Scope imbalance!");
   }
@@ -253,4 +259,4 @@
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_ADT_SCOPEDHASHTABLE_H
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 88826ab..16e5ed8 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -16,7 +16,6 @@
 #define LLVM_ADT_SMALLPTRSET_H
 
 #include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/PointerLikeTypeTraits.h"
 #include <cassert>
 #include <cstddef>
@@ -28,8 +27,6 @@
 
 namespace llvm {
 
-class SmallPtrSetIteratorImpl;
-
 /// SmallPtrSetImplBase - This is the common code shared among all the
 /// SmallPtrSet<>'s, which is almost everything.  SmallPtrSet has two modes, one
 /// for small and one for large sets.
@@ -72,12 +69,14 @@
                       const SmallPtrSetImplBase &that);
   SmallPtrSetImplBase(const void **SmallStorage, unsigned SmallSize,
                       SmallPtrSetImplBase &&that);
+
   explicit SmallPtrSetImplBase(const void **SmallStorage, unsigned SmallSize)
       : SmallArray(SmallStorage), CurArray(SmallStorage),
         CurArraySize(SmallSize), NumNonEmpty(0), NumTombstones(0) {
     assert(SmallSize && (SmallSize & (SmallSize-1)) == 0 &&
            "Initial size must be a power of two!");
   }
+
   ~SmallPtrSetImplBase() {
     if (!isSmall())
       free(CurArray);
@@ -85,6 +84,9 @@
 
 public:
   typedef unsigned size_type;
+
+  SmallPtrSetImplBase &operator=(const SmallPtrSetImplBase &) = delete;
+
   LLVM_NODISCARD bool empty() const { return size() == 0; }
   size_type size() const { return NumNonEmpty - NumTombstones; }
 
@@ -104,6 +106,7 @@
 
 protected:
   static void *getTombstoneMarker() { return reinterpret_cast<void*>(-2); }
+
   static void *getEmptyMarker() {
     // Note that -1 is chosen to make clear() efficiently implementable with
     // memset and because it's not a valid pointer value.
@@ -178,8 +181,6 @@
   /// Grow - Allocate a larger backing store for the buckets and move it over.
   void Grow(unsigned NewSize);
 
-  void operator=(const SmallPtrSetImplBase &RHS) = delete;
-
 protected:
   /// swap - Swaps the elements of two sets.
   /// Note: This method assumes that both sets have the same small size.
@@ -295,8 +296,6 @@
 class SmallPtrSetImpl : public SmallPtrSetImplBase {
   typedef PointerLikeTypeTraits<PtrType> PtrTraits;
 
-  SmallPtrSetImpl(const SmallPtrSetImpl &) = delete;
-
 protected:
   // Constructors that forward to the base.
   SmallPtrSetImpl(const void **SmallStorage, const SmallPtrSetImpl &that)
@@ -311,6 +310,8 @@
   typedef SmallPtrSetIterator<PtrType> iterator;
   typedef SmallPtrSetIterator<PtrType> const_iterator;
 
+  SmallPtrSetImpl(const SmallPtrSetImpl &) = delete;
+
   /// Inserts Ptr if and only if there is no element in the container equal to
   /// Ptr. The bool component of the returned pair is true if and only if the
   /// insertion takes place, and the iterator component of the pair points to
@@ -391,7 +392,7 @@
     return *this;
   }
 
-  SmallPtrSet<PtrType, SmallSize>&
+  SmallPtrSet<PtrType, SmallSize> &
   operator=(SmallPtrSet<PtrType, SmallSize> &&RHS) {
     if (&RHS != this)
       this->MoveFrom(SmallSizePowTwo, std::move(RHS));
@@ -410,14 +411,17 @@
     SmallPtrSetImplBase::swap(RHS);
   }
 };
-}
+
+} // end namespace llvm
 
 namespace std {
+
   /// Implement std::swap in terms of SmallPtrSet swap.
   template<class T, unsigned N>
   inline void swap(llvm::SmallPtrSet<T, N> &LHS, llvm::SmallPtrSet<T, N> &RHS) {
     LHS.swap(RHS);
   }
-}
 
-#endif
+} // end namespace std
+
+#endif // LLVM_ADT_SMALLPTRSET_H
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 996f56f..b958821 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -27,6 +27,9 @@
 #include <initializer_list>
 #include <iterator>
 #include <memory>
+#include <new>
+#include <type_traits>
+#include <utility>
 
 namespace llvm {
 
@@ -57,8 +60,6 @@
   LLVM_NODISCARD bool empty() const { return BeginX == EndX; }
 };
 
-template <typename T, unsigned N> struct SmallVectorStorage;
-
 /// This is the part of SmallVectorTemplateBase which does not depend on whether
 /// the type T is a POD. The extra dummy template argument is used by ArrayRef
 /// to avoid unnecessarily requiring T to be complete.
@@ -70,7 +71,7 @@
   // Allocate raw space for N elements of type T.  If T has a ctor or dtor, we
   // don't want it to be automatically run, so we need to represent the space as
   // something else.  Use an array of char of sufficient alignment.
-  typedef llvm::AlignedCharArrayUnion<T> U;
+  typedef AlignedCharArrayUnion<T> U;
   U FirstEl;
   // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
 
@@ -93,6 +94,7 @@
   }
 
   void setEnd(T *P) { this->EndX = P; }
+
 public:
   typedef size_t size_type;
   typedef ptrdiff_t difference_type;
@@ -117,11 +119,12 @@
   iterator end() { return (iterator)this->EndX; }
   LLVM_ATTRIBUTE_ALWAYS_INLINE
   const_iterator end() const { return (const_iterator)this->EndX; }
+
 protected:
   iterator capacity_ptr() { return (iterator)this->CapacityX; }
   const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;}
-public:
 
+public:
   // reverse iterator creation methods.
   reverse_iterator rbegin()            { return reverse_iterator(end()); }
   const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
@@ -298,6 +301,7 @@
   void grow(size_t MinSize = 0) {
     this->grow_pod(MinSize*sizeof(T), sizeof(T));
   }
+
 public:
   void push_back(const T &Elt) {
     if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
@@ -311,14 +315,12 @@
   }
 };
 
-
 /// This class consists of common code factored out of the SmallVector class to
 /// reduce code duplication based on the SmallVector 'N' template parameter.
 template <typename T>
 class SmallVectorImpl : public SmallVectorTemplateBase<T, isPodLike<T>::value> {
   typedef SmallVectorTemplateBase<T, isPodLike<T>::value > SuperClass;
 
-  SmallVectorImpl(const SmallVectorImpl&) = delete;
 public:
   typedef typename SuperClass::iterator iterator;
   typedef typename SuperClass::const_iterator const_iterator;
@@ -331,6 +333,8 @@
   }
 
 public:
+  SmallVectorImpl(const SmallVectorImpl &) = delete;
+
   ~SmallVectorImpl() {
     // Destroy the constructed elements in the vector.
     this->destroy_range(this->begin(), this->end());
@@ -340,7 +344,6 @@
       free(this->begin());
   }
 
-
   void clear() {
     this->destroy_range(this->begin(), this->end());
     this->EndX = this->BeginX;
@@ -668,7 +671,6 @@
   }
 };
 
-
 template <typename T>
 void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
   if (this == &RHS) return;
@@ -841,6 +843,7 @@
 class SmallVector : public SmallVectorImpl<T> {
   /// Inline space for elements which aren't stored in the base class.
   SmallVectorStorage<T, N> Storage;
+
 public:
   SmallVector() : SmallVectorImpl<T>(N) {
   }
@@ -856,7 +859,7 @@
   }
 
   template <typename RangeTy>
-  explicit SmallVector(const llvm::iterator_range<RangeTy> &R)
+  explicit SmallVector(const iterator_range<RangeTy> &R)
       : SmallVectorImpl<T>(N) {
     this->append(R.begin(), R.end());
   }
@@ -906,9 +909,10 @@
   return X.capacity_in_bytes();
 }
 
-} // End llvm namespace
+} // end namespace llvm
 
 namespace std {
+
   /// Implement std::swap in terms of SmallVector swap.
   template<typename T>
   inline void
@@ -922,6 +926,7 @@
   swap(llvm::SmallVector<T, N> &LHS, llvm::SmallVector<T, N> &RHS) {
     LHS.swap(RHS);
   }
-}
 
-#endif
+} // end namespace std
+
+#endif // LLVM_ADT_SMALLVECTOR_H
diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h
index e3aa258..08da4b6 100644
--- a/include/llvm/ADT/SparseMultiSet.h
+++ b/include/llvm/ADT/SparseMultiSet.h
@@ -21,7 +21,15 @@
 #ifndef LLVM_ADT_SPARSEMULTISET_H
 #define LLVM_ADT_SPARSEMULTISET_H
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <utility>
 
 namespace llvm {
 
@@ -73,7 +81,7 @@
 /// @tparam SparseT     An unsigned integer type. See above.
 ///
 template<typename ValueT,
-         typename KeyFunctorT = llvm::identity<unsigned>,
+         typename KeyFunctorT = identity<unsigned>,
          typename SparseT = uint8_t>
 class SparseMultiSet {
   static_assert(std::numeric_limits<SparseT>::is_integer &&
@@ -113,16 +121,16 @@
   typedef typename KeyFunctorT::argument_type KeyT;
   typedef SmallVector<SMSNode, 8> DenseT;
   DenseT Dense;
-  SparseT *Sparse;
-  unsigned Universe;
+  SparseT *Sparse = nullptr;
+  unsigned Universe = 0;
   KeyFunctorT KeyIndexOf;
   SparseSetValFunctor<KeyT, ValueT, KeyFunctorT> ValIndexOf;
 
   /// We have a built-in recycler for reusing tombstone slots. This recycler
   /// puts a singly-linked free list into tombstone slots, allowing us quick
   /// erasure, iterator preservation, and dense size.
-  unsigned FreelistIdx;
-  unsigned NumFree;
+  unsigned FreelistIdx = SMSNode::INVALID;
+  unsigned NumFree = 0;
 
   unsigned sparseIndex(const ValueT &Val) const {
     assert(ValIndexOf(Val) < Universe &&
@@ -131,11 +139,6 @@
   }
   unsigned sparseIndex(const SMSNode &N) const { return sparseIndex(N.Data); }
 
-  // Disable copy construction and assignment.
-  // This data structure is not meant to be used that way.
-  SparseMultiSet(const SparseMultiSet&) = delete;
-  SparseMultiSet &operator=(const SparseMultiSet&) = delete;
-
   /// Whether the given entry is the head of the list. List heads's previous
   /// pointers are to the tail of the list, allowing for efficient access to the
   /// list tail. D must be a valid entry node.
@@ -187,9 +190,9 @@
   typedef const ValueT *const_pointer;
   typedef unsigned size_type;
 
-  SparseMultiSet()
-    : Sparse(nullptr), Universe(0), FreelistIdx(SMSNode::INVALID), NumFree(0) {}
-
+  SparseMultiSet() = default;
+  SparseMultiSet(const SparseMultiSet &) = delete;
+  SparseMultiSet &operator=(const SparseMultiSet &) = delete;
   ~SparseMultiSet() { free(Sparse); }
 
   /// Set the universe size which determines the largest key the set can hold.
@@ -218,6 +221,7 @@
   class iterator_base : public std::iterator<std::bidirectional_iterator_tag,
                                              ValueT> {
     friend class SparseMultiSet;
+
     SMSPtrTy SMS;
     unsigned Idx;
     unsigned SparseIdx;
@@ -515,4 +519,4 @@
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_ADT_SPARSEMULTISET_H
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 5b6494d..00c18c7 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -22,8 +22,11 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/DataTypes.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
 #include <limits>
+#include <utility>
 
 namespace llvm {
 
@@ -115,7 +118,7 @@
 /// @tparam SparseT     An unsigned integer type. See above.
 ///
 template<typename ValueT,
-         typename KeyFunctorT = llvm::identity<unsigned>,
+         typename KeyFunctorT = identity<unsigned>,
          typename SparseT = uint8_t>
 class SparseSet {
   static_assert(std::numeric_limits<SparseT>::is_integer &&
@@ -126,16 +129,11 @@
   typedef SmallVector<ValueT, 8> DenseT;
   typedef unsigned size_type;
   DenseT Dense;
-  SparseT *Sparse;
-  unsigned Universe;
+  SparseT *Sparse = nullptr;
+  unsigned Universe = 0;
   KeyFunctorT KeyIndexOf;
   SparseSetValFunctor<KeyT, ValueT, KeyFunctorT> ValIndexOf;
 
-  // Disable copy construction and assignment.
-  // This data structure is not meant to be used that way.
-  SparseSet(const SparseSet&) = delete;
-  SparseSet &operator=(const SparseSet&) = delete;
-
 public:
   typedef ValueT value_type;
   typedef ValueT &reference;
@@ -143,7 +141,9 @@
   typedef ValueT *pointer;
   typedef const ValueT *const_pointer;
 
-  SparseSet() : Sparse(nullptr), Universe(0) {}
+  SparseSet() = default;
+  SparseSet(const SparseSet &) = delete;
+  SparseSet &operator=(const SparseSet &) = delete;
   ~SparseSet() { free(Sparse); }
 
   /// setUniverse - Set the universe size which determines the largest key the
@@ -308,9 +308,8 @@
     erase(I);
     return true;
   }
-
 };
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_ADT_SPARSESET_H
diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h
index 81b1a6d..47caf46 100644
--- a/include/llvm/ADT/Twine.h
+++ b/include/llvm/ADT/Twine.h
@@ -12,12 +12,13 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <cassert>
+#include <cstdint>
 #include <string>
 
 namespace llvm {
+
   class raw_ostream;
 
   /// Twine - A lightweight data structure for efficiently representing the
@@ -146,7 +147,6 @@
       const uint64_t *uHex;
     };
 
-  private:
     /// LHS - The prefix in the concatenation, which may be uninitialized for
     /// Null or Empty kinds.
     Child LHS;
@@ -158,7 +158,6 @@
     /// RHSKind - The NodeKind of the right hand side, \see getRHSKind().
     NodeKind RHSKind;
 
-  private:
     /// Construct a nullary twine; the kind must be NullKind or EmptyKind.
     explicit Twine(NodeKind Kind)
       : LHSKind(Kind), RHSKind(EmptyKind) {
@@ -179,10 +178,6 @@
       assert(isValid() && "Invalid twine!");
     }
 
-    /// Since the intended use of twines is as temporary objects, assignments
-    /// when concatenating might cause undefined behavior or stack corruptions
-    Twine &operator=(const Twine &Other) = delete;
-
     /// Check for the null twine.
     bool isNull() const {
       return getLHSKind() == NullKind;
@@ -370,6 +365,10 @@
       assert(isValid() && "Invalid twine!");
     }
 
+    /// Since the intended use of twines is as temporary objects, assignments
+    /// when concatenating might cause undefined behavior or stack corruption.
+    Twine &operator=(const Twine &) = delete;
+
     /// Create a 'null' string, which is an empty string that always
     /// concatenates to form another empty string.
     static Twine createNull() {
@@ -535,6 +534,7 @@
   }
 
   /// @}
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_ADT_TWINE_H
diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h
index 295129f..a788f81 100644
--- a/include/llvm/ADT/ilist.h
+++ b/include/llvm/ADT/ilist.h
@@ -25,11 +25,9 @@
 #define LLVM_ADT_ILIST_H
 
 #include "llvm/ADT/simple_ilist.h"
-#include "llvm/Support/Compiler.h"
 #include <cassert>
 #include <cstddef>
 #include <iterator>
-#include <type_traits>
 
 namespace llvm {
 
@@ -208,12 +206,12 @@
   static bool op_less(const_reference L, const_reference R) { return L < R; }
   static bool op_equal(const_reference L, const_reference R) { return L == R; }
 
-  // Copying intrusively linked nodes doesn't make sense.
-  iplist_impl(const iplist_impl &) = delete;
-  void operator=(const iplist_impl &) = delete;
-
 public:
   iplist_impl() = default;
+
+  iplist_impl(const iplist_impl &) = delete;
+  iplist_impl &operator=(const iplist_impl &) = delete;
+
   iplist_impl(iplist_impl &&X)
       : TraitsT(std::move(X)), IntrusiveListT(std::move(X)) {}
   iplist_impl &operator=(iplist_impl &&X) {
@@ -221,6 +219,7 @@
     *static_cast<IntrusiveListT *>(this) = std::move(X);
     return *this;
   }
+
   ~iplist_impl() { clear(); }
 
   // Miscellaneous inspection routines.
@@ -308,7 +307,6 @@
   }
 
 public:
-
   //===----------------------------------------------------------------------===
   // Functionality derived from other functions defined above...
   //
@@ -408,25 +406,29 @@
 
 public:
   iplist() = default;
-  iplist(iplist &&X) : iplist_impl_type(std::move(X)) {}
+
   iplist(const iplist &X) = delete;
+  iplist &operator=(const iplist &X) = delete;
+
+  iplist(iplist &&X) : iplist_impl_type(std::move(X)) {}
   iplist &operator=(iplist &&X) {
     *static_cast<iplist_impl_type *>(this) = std::move(X);
     return *this;
   }
-  iplist &operator=(const iplist &X) = delete;
 };
 
 template <class T, class... Options> using ilist = iplist<T, Options...>;
 
-} // End llvm namespace
+} // end namespace llvm
 
 namespace std {
+
   // Ensure that swap uses the fast list swap...
   template<class Ty>
   void swap(llvm::iplist<Ty> &Left, llvm::iplist<Ty> &Right) {
     Left.swap(Right);
   }
-}  // End 'std' extensions...
+
+} // end namespace std
 
 #endif // LLVM_ADT_ILIST_H
diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h
index dcff167..fbbbace 100644
--- a/include/llvm/Analysis/CGSCCPassManager.h
+++ b/include/llvm/Analysis/CGSCCPassManager.h
@@ -308,8 +308,11 @@
 
       do {
         LazyCallGraph::RefSCC *RC = RCWorklist.pop_back_val();
-        if (InvalidRefSCCSet.count(RC))
+        if (InvalidRefSCCSet.count(RC)) {
+          if (DebugLogging)
+            dbgs() << "Skipping an invalid RefSCC...\n";
           continue;
+        }
 
         assert(CWorklist.empty() &&
                "Should always start with an empty SCC worklist");
@@ -328,8 +331,17 @@
           // other RefSCCs in the worklist. The invalid ones are dead and the
           // other RefSCCs should be queued above, so we just need to skip both
           // scenarios here.
-          if (InvalidSCCSet.count(C) || &C->getOuterRefSCC() != RC)
+          if (InvalidSCCSet.count(C)) {
+            if (DebugLogging)
+              dbgs() << "Skipping an invalid SCC...\n";
             continue;
+          }
+          if (&C->getOuterRefSCC() != RC) {
+            if (DebugLogging)
+              dbgs() << "Skipping an SCC that is now part of some other "
+                        "RefSCC...\n";
+            continue;
+          }
 
           do {
             // Check that we didn't miss any update scenario.
diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h
index d1864ae..58e3209 100644
--- a/include/llvm/Analysis/LazyCallGraph.h
+++ b/include/llvm/Analysis/LazyCallGraph.h
@@ -953,6 +953,13 @@
   /// useful to code doing updates or otherwise wanting to walk the IR in the
   /// same patterns as when we build the call graph.
 
+  /// Recursively visits the defined functions whose address is reachable from
+  /// every constant in the \p Worklist.
+  ///
+  /// Doesn't recurse through any constants already in the \p Visited set, and
+  /// updates that set with every constant visited.
+  ///
+  /// For each defined function, calls \p Callback with that function.
   template <typename CallbackT>
   static void visitReferences(SmallVectorImpl<Constant *> &Worklist,
                               SmallPtrSetImpl<Constant *> &Visited,
@@ -961,7 +968,8 @@
       Constant *C = Worklist.pop_back_val();
 
       if (Function *F = dyn_cast<Function>(C)) {
-        Callback(*F);
+        if (!F->isDeclaration())
+          Callback(*F);
         continue;
       }
 
@@ -969,10 +977,10 @@
         if (Visited.insert(cast<Constant>(Op)).second)
           Worklist.push_back(cast<Constant>(Op));
     }
-
-    ///@}
   }
 
+  ///@}
+
 private:
   typedef SmallVectorImpl<Node *>::reverse_iterator node_stack_iterator;
   typedef iterator_range<node_stack_iterator> node_stack_range;
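
A minimal sketch (not part of this patch) of how the updated visitReferences is
meant to be driven; the helper name visitCalleeDefinitions and the
operand-seeding loop are assumptions for illustration only:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
using namespace llvm;

// Hypothetical helper: walk every constant referenced by F and invoke the
// callback for each defined function reachable through those constants.
static void visitCalleeDefinitions(Function &F) {
  SmallVector<Constant *, 16> Worklist;
  SmallPtrSet<Constant *, 16> Visited;
  // Seed the worklist with the constant operands of F's instructions.
  for (Instruction &I : instructions(F))
    for (Value *Op : I.operands())
      if (auto *C = dyn_cast<Constant>(Op))
        if (Visited.insert(C).second)
          Worklist.push_back(C);
  // With this change the callback fires only for functions that have a body;
  // declarations are now skipped inside visitReferences itself.
  LazyCallGraph::visitReferences(Worklist, Visited, [](Function &Callee) {
    // e.g. record an edge to Callee here.
  });
}
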
diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
index 51013ca..aab522d 100644
--- a/include/llvm/CodeGen/CommandFlags.h
+++ b/include/llvm/CodeGen/CommandFlags.h
@@ -232,6 +232,11 @@
              cl::desc("Use .ctors instead of .init_array."),
              cl::init(false));
 
+cl::opt<bool> RelaxELFRelocations(
+    "relax-elf-relocations",
+    cl::desc("Emit GOTPCRELX/REX_GOTPCRELX instead of GOTPCREL on x86-64 ELF"),
+    cl::init(false));
+
 cl::opt<bool> DataSections("data-sections",
                            cl::desc("Emit data into separate sections"),
                            cl::init(false));
@@ -288,6 +293,7 @@
   Options.StackAlignmentOverride = OverrideStackAlignment;
   Options.StackSymbolOrdering = StackSymbolOrdering;
   Options.UseInitArray = !UseCtors;
+  Options.RelaxELFRelocations = RelaxELFRelocations;
   Options.DataSections = DataSections;
   Options.FunctionSections = FunctionSections;
   Options.UniqueSectionNames = UniqueSectionNames;
diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h
index 6872f77..f1d7ecb 100644
--- a/include/llvm/CodeGen/DIE.h
+++ b/include/llvm/CodeGen/DIE.h
@@ -15,12 +15,12 @@
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DIE_H
 
 #include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/DwarfStringPoolEntry.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Allocator.h"
@@ -31,6 +31,7 @@
 #include <iterator>
 #include <new>
 #include <type_traits>
+#include <vector>
 
 namespace llvm {
 
@@ -112,6 +113,37 @@
 };
 
 //===--------------------------------------------------------------------===//
+/// Helps unique DIEAbbrev objects and assigns abbreviation numbers.
+///
+/// This class will unique the DIE abbreviations for a llvm::DIE object and
+/// assign a unique abbreviation number to each unique DIEAbbrev object it
+/// finds. The resulting collection of DIEAbbrev objects can then be emitted
+/// into the .debug_abbrev section.
+class DIEAbbrevSet {
+  /// The bump allocator to use when creating DIEAbbrev objects in the uniqued
+  /// storage container.
+  BumpPtrAllocator &Alloc;
+  /// \brief FoldingSet that uniques the abbreviations.
+  llvm::FoldingSet<DIEAbbrev> AbbreviationsSet;
+  /// A list of all the unique abbreviations in use.
+  std::vector<DIEAbbrev *> Abbreviations;
+
+public:
+  DIEAbbrevSet(BumpPtrAllocator &A) : Alloc(A) {}
+  ~DIEAbbrevSet();
+  /// Generate the abbreviation declaration for a DIE and return a pointer to
+  /// the generated abbreviation.
+  ///
+  /// \param Die the debug info entry to generate the abbreviation for.
+  /// \returns A reference to the uniqued abbreviation declaration that is
+  /// owned by this class.
+  DIEAbbrev &uniqueAbbreviation(DIE &Die);
+
+  /// Print all abbreviations using the specified asm printer.
+  void Emit(const AsmPrinter *AP, MCSection *Section) const;
+};
+
+//===--------------------------------------------------------------------===//
 /// An integer value DIE.
 ///
 class DIEInteger {
@@ -201,8 +233,9 @@
 };
 
 //===--------------------------------------------------------------------===//
-/// A container for string values.
+/// A container for string pool string values.
 ///
+/// This class is used with the DW_FORM_strp and DW_FORM_GNU_str_index forms.
 class DIEString {
   DwarfStringPoolEntryRef S;
 
@@ -219,6 +252,27 @@
 };
 
 //===--------------------------------------------------------------------===//
+/// A container for inline string values.
+///
+/// This class is used with the DW_FORM_string form.
+class DIEInlineString {
+  std::string S;
+
+public:
+  explicit DIEInlineString(StringRef Str) : S(Str.str()) {}
+
+  ~DIEInlineString() = default;
+
+  /// Grab the string out of the object.
+  StringRef getString() const { return StringRef(S); }
+
+  void EmitValue(const AsmPrinter *AP, dwarf::Form Form) const;
+  unsigned SizeOf(const AsmPrinter *AP, dwarf::Form Form) const;
+
+  void print(raw_ostream &O) const;
+};
+
+//===--------------------------------------------------------------------===//
 /// A pointer to another debug information entry.  An instance of this class can
 /// also be used as a proxy for a debug information entry not yet defined
 /// (ie. types.)
@@ -233,14 +287,8 @@
 
   DIE &getEntry() const { return *Entry; }
 
-  /// Returns size of a ref_addr entry.
-  static unsigned getRefAddrSize(const AsmPrinter *AP);
-
   void EmitValue(const AsmPrinter *AP, dwarf::Form Form) const;
-  unsigned SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
-    return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP)
-                                           : sizeof(int32_t);
-  }
+  unsigned SizeOf(const AsmPrinter *AP, dwarf::Form Form) const;
 
   void print(raw_ostream &O) const;
 };
@@ -595,20 +643,13 @@
   friend class IntrusiveBackList<DIE>;
   friend class DIEUnit;
 
-  /// Offset - Dwarf unit relative offset.
-  ///
+  /// Dwarf unit relative offset.
   unsigned Offset;
-
-  /// Size - Size of instance + children.
-  ///
+  /// Size of instance + children.
   unsigned Size;
-
   unsigned AbbrevNumber = ~0u;
-
-  /// Tag - Dwarf tag code.
-  ///
+  /// Dwarf tag code.
   dwarf::Tag Tag = (dwarf::Tag)0;
-
   /// Children DIEs.
   IntrusiveBackList<DIE> Children;
 
@@ -664,6 +705,25 @@
   /// for this DIE.
   unsigned getDebugSectionOffset() const;
 
+  /// Compute the offset of this DIE and all its children.
+  ///
+  /// This function gets called just before we are going to generate the debug
+  /// information and gives each DIE a chance to figure out its CU relative DIE
+  /// offset, unique its abbreviation and fill in the abbreviation code, and
+  /// return the unit offset that points to where the next DIE will be emitted
+  /// within the debug unit section. After this function has been called for all
+  /// DIE objects, the DWARF can be generated, since every DIE can properly
+  /// refer to other DIE objects now that all offsets have been calculated.
+  ///
+  /// \param AP AsmPrinter to use when calculating sizes.
+  /// \param AbbrevSet the abbreviation set used to unique DIE abbreviations.
+  /// \param CUOffset the compile/type unit relative offset in bytes.
+  /// \returns the offset for the DIE that follows this DIE within the
+  /// current compile/type unit.
+  unsigned computeOffsetsAndAbbrevs(const AsmPrinter *AP,
+                                    DIEAbbrevSet &AbbrevSet, unsigned CUOffset);
+
   /// Climb up the parent chain to get the compile unit or type unit DIE that
   /// this DIE belongs to.
   ///
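
A minimal sketch, not part of this patch, of how the new DIEAbbrevSet and
DIE::computeOffsetsAndAbbrevs are expected to cooperate; layoutUnit and its
parameters (HeaderSize, AbbrevSection) are hypothetical:

#include "llvm/CodeGen/DIE.h"
#include "llvm/Support/Allocator.h"
using namespace llvm;

// Assign unit-relative offsets and abbreviation numbers to UnitDIE and all of
// its children, then emit the uniqued abbreviation table for the unit.
static unsigned layoutUnit(DIE &UnitDIE, unsigned HeaderSize,
                           const AsmPrinter *AP, MCSection *AbbrevSection,
                           BumpPtrAllocator &Alloc) {
  DIEAbbrevSet Abbrevs(Alloc);
  // The return value is the unit-relative offset immediately after UnitDIE
  // and all of its children.
  unsigned EndOffset =
      UnitDIE.computeOffsetsAndAbbrevs(AP, Abbrevs, HeaderSize);
  // Print all uniqued abbreviations into the abbreviation section.
  Abbrevs.Emit(AP, AbbrevSection);
  return EndOffset;
}
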
diff --git a/include/llvm/CodeGen/DIEValue.def b/include/llvm/CodeGen/DIEValue.def
index c5ff401..a3fce9b 100644
--- a/include/llvm/CodeGen/DIEValue.def
+++ b/include/llvm/CodeGen/DIEValue.def
@@ -40,6 +40,7 @@
 HANDLE_DIEVALUE_LARGE(Block)
 HANDLE_DIEVALUE_LARGE(Loc)
 HANDLE_DIEVALUE_SMALL(LocList)
+HANDLE_DIEVALUE_LARGE(InlineString)
 
 #undef HANDLE_DIEVALUE
 #undef HANDLE_DIEVALUE_SMALL
diff --git a/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index ba85b8e..76e0d47 100644
--- a/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -83,7 +83,7 @@
   /// @{
 
   /// Translate \p Inst into its corresponding MachineInstr instruction(s).
-  /// Insert the newly translated instruction(s) right where the MIRBuilder
+  /// Insert the newly translated instruction(s) right where the CurBuilder
   /// is set.
   ///
   /// The general algorithm is:
@@ -114,50 +114,56 @@
 
   /// Translate an LLVM bitcast into generic IR. Either a COPY or a G_BITCAST is
   /// emitted.
-  bool translateBitCast(const User &U);
+  bool translateBitCast(const User &U, MachineIRBuilder &MIRBuilder);
 
   /// Translate an LLVM load instruction into generic IR.
-  bool translateLoad(const User &U);
+  bool translateLoad(const User &U, MachineIRBuilder &MIRBuilder);
 
   /// Translate an LLVM store instruction into generic IR.
-  bool translateStore(const User &U);
+  bool translateStore(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateMemcpy(const CallInst &CI);
+  bool translateMemcpy(const CallInst &CI, MachineIRBuilder &MIRBuilder);
 
-  void getStackGuard(unsigned DstReg);
+  void getStackGuard(unsigned DstReg, MachineIRBuilder &MIRBuilder);
 
-  bool translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID);
+  bool translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
+                                  MachineIRBuilder &MIRBuilder);
+
+  bool translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
+                               MachineIRBuilder &MIRBuilder);
 
   /// Translate call instruction.
   /// \pre \p U is a call instruction.
-  bool translateCall(const User &U);
+  bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateInvoke(const User &U);
+  bool translateInvoke(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateLandingPad(const User &U);
+  bool translateLandingPad(const User &U, MachineIRBuilder &MIRBuilder);
 
   /// Translate one of LLVM's cast instructions into MachineInstrs, with the
   /// given generic Opcode.
-  bool translateCast(unsigned Opcode, const User &U);
+  bool translateCast(unsigned Opcode, const User &U,
+                     MachineIRBuilder &MIRBuilder);
 
   /// Translate static alloca instruction (i.e. one  of constant size and in the
   /// first basic block).
-  bool translateStaticAlloca(const AllocaInst &Inst);
+  bool translateStaticAlloca(const AllocaInst &Inst,
+                             MachineIRBuilder &MIRBuilder);
 
   /// Translate a phi instruction.
-  bool translatePHI(const User &U);
+  bool translatePHI(const User &U, MachineIRBuilder &MIRBuilder);
 
   /// Translate a comparison (icmp or fcmp) instruction or constant.
-  bool translateCompare(const User &U);
+  bool translateCompare(const User &U, MachineIRBuilder &MIRBuilder);
 
   /// Translate an integer compare instruction (or constant).
-  bool translateICmp(const User &U) {
-    return translateCompare(U);
+  bool translateICmp(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCompare(U, MIRBuilder);
   }
 
   /// Translate a floating-point compare instruction (or constant).
-  bool translateFCmp(const User &U) {
-    return translateCompare(U);
+  bool translateFCmp(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCompare(U, MIRBuilder);
   }
 
 
@@ -167,146 +173,182 @@
 
   /// Translate \p Inst into a binary operation \p Opcode.
   /// \pre \p U is a binary operation.
-  bool translateBinaryOp(unsigned Opcode, const User &U);
+  bool translateBinaryOp(unsigned Opcode, const User &U,
+                         MachineIRBuilder &MIRBuilder);
 
   /// Translate branch (br) instruction.
   /// \pre \p U is a branch instruction.
-  bool translateBr(const User &U);
+  bool translateBr(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateExtractValue(const User &U);
+  bool translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateInsertValue(const User &U);
+  bool translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateSelect(const User &U);
+  bool translateSelect(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateGetElementPtr(const User &U);
+  bool translateGetElementPtr(const User &U, MachineIRBuilder &MIRBuilder);
 
   /// Translate return (ret) instruction.
   /// The target needs to implement CallLowering::lowerReturn for
   /// this to succeed.
   /// \pre \p U is a return instruction.
-  bool translateRet(const User &U);
+  bool translateRet(const User &U, MachineIRBuilder &MIRBuilder);
 
-  bool translateAdd(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_ADD, U);
+  bool translateAdd(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_ADD, U, MIRBuilder);
   }
-  bool translateSub(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_SUB, U);
+  bool translateSub(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_SUB, U, MIRBuilder);
   }
-  bool translateAnd(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_AND, U);
+  bool translateAnd(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_AND, U, MIRBuilder);
   }
-  bool translateMul(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_MUL, U);
+  bool translateMul(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_MUL, U, MIRBuilder);
   }
-  bool translateOr(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_OR, U);
+  bool translateOr(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_OR, U, MIRBuilder);
   }
-  bool translateXor(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_XOR, U);
+  bool translateXor(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_XOR, U, MIRBuilder);
   }
 
-  bool translateUDiv(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_UDIV, U);
+  bool translateUDiv(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_UDIV, U, MIRBuilder);
   }
-  bool translateSDiv(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_SDIV, U);
+  bool translateSDiv(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_SDIV, U, MIRBuilder);
   }
-  bool translateURem(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_UREM, U);
+  bool translateURem(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_UREM, U, MIRBuilder);
   }
-  bool translateSRem(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_SREM, U);
+  bool translateSRem(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_SREM, U, MIRBuilder);
+  }
+  bool translateAlloca(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateStaticAlloca(cast<AllocaInst>(U), MIRBuilder);
+  }
+  bool translateIntToPtr(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_INTTOPTR, U, MIRBuilder);
+  }
+  bool translatePtrToInt(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_PTRTOINT, U, MIRBuilder);
+  }
+  bool translateTrunc(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_TRUNC, U, MIRBuilder);
+  }
+  bool translateFPTrunc(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_FPTRUNC, U, MIRBuilder);
+  }
+  bool translateFPExt(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_FPEXT, U, MIRBuilder);
+  }
+  bool translateFPToUI(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_FPTOUI, U, MIRBuilder);
+  }
+  bool translateFPToSI(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_FPTOSI, U, MIRBuilder);
+  }
+  bool translateUIToFP(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_UITOFP, U, MIRBuilder);
+  }
+  bool translateSIToFP(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_SITOFP, U, MIRBuilder);
+  }
+  bool translateUnreachable(const User &U, MachineIRBuilder &MIRBuilder) {
+    return true;
+  }
+  bool translateSExt(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_SEXT, U, MIRBuilder);
   }
 
-  bool translateAlloca(const User &U) {
-    return translateStaticAlloca(cast<AllocaInst>(U));
-  }
-  bool translateIntToPtr(const User &U) {
-    return translateCast(TargetOpcode::G_INTTOPTR, U);
-  }
-  bool translatePtrToInt(const User &U) {
-    return translateCast(TargetOpcode::G_PTRTOINT, U);
-  }
-  bool translateTrunc(const User &U) {
-    return translateCast(TargetOpcode::G_TRUNC, U);
-  }
-  bool translateFPTrunc(const User &U) {
-    return translateCast(TargetOpcode::G_FPTRUNC, U);
-  }
-  bool translateFPExt(const User &U) {
-    return translateCast(TargetOpcode::G_FPEXT, U);
-  }
-  bool translateFPToUI(const User &U) {
-    return translateCast(TargetOpcode::G_FPTOUI, U);
-  }
-  bool translateFPToSI(const User &U) {
-    return translateCast(TargetOpcode::G_FPTOSI, U);
-  }
-  bool translateUIToFP(const User &U) {
-    return translateCast(TargetOpcode::G_UITOFP, U);
-  }
-  bool translateSIToFP(const User &U) {
-    return translateCast(TargetOpcode::G_SITOFP, U);
+  bool translateZExt(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateCast(TargetOpcode::G_ZEXT, U, MIRBuilder);
   }
 
-  bool translateUnreachable(const User &U) { return true; }
-
-  bool translateSExt(const User &U) {
-    return translateCast(TargetOpcode::G_SEXT, U);
+  bool translateShl(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_SHL, U, MIRBuilder);
+  }
+  bool translateLShr(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_LSHR, U, MIRBuilder);
+  }
+  bool translateAShr(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_ASHR, U, MIRBuilder);
   }
 
-  bool translateZExt(const User &U) {
-    return translateCast(TargetOpcode::G_ZEXT, U);
+  bool translateFAdd(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_FADD, U, MIRBuilder);
   }
-
-  bool translateShl(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_SHL, U);
+  bool translateFSub(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder);
   }
-  bool translateLShr(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_LSHR, U);
+  bool translateFMul(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_FMUL, U, MIRBuilder);
   }
-  bool translateAShr(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_ASHR, U);
+  bool translateFDiv(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_FDIV, U, MIRBuilder);
   }
-
-  bool translateFAdd(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_FADD, U);
-  }
-  bool translateFSub(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_FSUB, U);
-  }
-  bool translateFMul(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_FMUL, U);
-  }
-  bool translateFDiv(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_FDIV, U);
-  }
-  bool translateFRem(const User &U) {
-    return translateBinaryOp(TargetOpcode::G_FREM, U);
+  bool translateFRem(const User &U, MachineIRBuilder &MIRBuilder) {
+    return translateBinaryOp(TargetOpcode::G_FREM, U, MIRBuilder);
   }
 
 
   // Stubs to keep the compiler happy while we implement the rest of the
   // translation.
-  bool translateSwitch(const User &U) { return false; }
-  bool translateIndirectBr(const User &U) { return false; }
-  bool translateResume(const User &U) { return false; }
-  bool translateCleanupRet(const User &U) { return false; }
-  bool translateCatchRet(const User &U) { return false; }
-  bool translateCatchSwitch(const User &U) { return false; }
-  bool translateFence(const User &U) { return false; }
-  bool translateAtomicCmpXchg(const User &U) { return false; }
-  bool translateAtomicRMW(const User &U) { return false; }
-  bool translateAddrSpaceCast(const User &U) { return false; }
-  bool translateCleanupPad(const User &U) { return false; }
-  bool translateCatchPad(const User &U) { return false; }
-  bool translateUserOp1(const User &U) { return false; }
-  bool translateUserOp2(const User &U) { return false; }
-  bool translateVAArg(const User &U) { return false; }
-  bool translateExtractElement(const User &U) { return false; }
-  bool translateInsertElement(const User &U) { return false; }
-  bool translateShuffleVector(const User &U) { return false; }
+  bool translateSwitch(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateResume(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateCleanupRet(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateCatchRet(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateCatchSwitch(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateFence(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateAtomicCmpXchg(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateAddrSpaceCast(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateCleanupPad(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateCatchPad(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateUserOp1(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateUserOp2(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateExtractElement(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateInsertElement(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
+  bool translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder) {
+    return false;
+  }
 
   /// @}
 
@@ -314,12 +356,15 @@
   // I.e., compared to a regular MIBuilder, this one also inserts the instruction
   // in the current block, can create blocks, etc.; basically a kind of
   // IRBuilder, but for Machine IR.
-  MachineIRBuilder MIRBuilder;
+  MachineIRBuilder CurBuilder;
 
   // Builder set to the entry block (just after ABI lowering instructions). Used
   // as a convenient location for Constants.
   MachineIRBuilder EntryBuilder;
 
+  // The MachineFunction currently being translated.
+  MachineFunction *MF;
+
   /// MachineRegisterInfo used to create virtual registers.
   MachineRegisterInfo *MRI;
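
A minimal sketch of the convention the hunk above establishes, assuming a hypothetical integer case; every translate helper now receives the builder explicitly instead of relying on the old MIRBuilder member:

    bool translateAdd(const User &U, MachineIRBuilder &MIRBuilder) {
      return translateBinaryOp(TargetOpcode::G_ADD, U, MIRBuilder);
    }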
 
diff --git a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 98005d1..ecd3e5e 100644
--- a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugLoc.h"
 
 #include <queue>
@@ -47,8 +48,7 @@
   /// Fields describing the insertion point.
   /// @{
   MachineBasicBlock *MBB;
-  MachineInstr *MI;
-  bool Before;
+  MachineBasicBlock::iterator II;
   /// @}
 
   std::function<void(MachineInstr *)> InsertedInstr;
@@ -74,22 +74,28 @@
   }
 
   /// Current insertion point for new instructions.
-  MachineBasicBlock::iterator getInsertPt();
+  MachineBasicBlock::iterator getInsertPt() {
+    return II;
+  }
+
+  /// Set the insertion point before the specified position.
+  /// \pre MBB must be in getMF().
+  /// \pre II must be a valid iterator in MBB.
+  void setInsertPt(MachineBasicBlock &MBB, MachineBasicBlock::iterator II);
+  /// @}
 
   /// Setters for the insertion point.
   /// @{
   /// Set the MachineFunction where to build instructions.
   void setMF(MachineFunction &);
 
-  /// Set the insertion point to the beginning (\p Beginning = true) or end
-  /// (\p Beginning = false) of \p MBB.
+  /// Set the insertion point to the end of \p MBB.
   /// \pre \p MBB must be contained by getMF().
-  void setMBB(MachineBasicBlock &MBB, bool Beginning = false);
+  void setMBB(MachineBasicBlock &MBB);
 
-  /// Set the insertion point to before (\p Before = true) or after
-  /// (\p Before = false) \p MI.
+  /// Set the insertion point to before \p MI.
   /// \pre MI must be in getMF().
-  void setInstr(MachineInstr &MI, bool Before = true);
+  void setInstr(MachineInstr &MI);
   /// @}
 
   /// Control where instructions we create are recorded (typically for
@@ -292,6 +298,18 @@
 
   /// Build and insert \p Res = G_CONSTANT \p Val
   ///
+  /// G_CONSTANT is an integer constant with the specified size and value.
+  /// \p Val will be extended or truncated to the size of \p Res.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res must be a generic virtual register with scalar or pointer
+  ///      type.
+  ///
+  /// \return The newly created instruction.
+  MachineInstrBuilder buildConstant(unsigned Res, const ConstantInt &Val);
+
+  /// Build and insert \p Res = G_CONSTANT \p Val
+  ///
   /// G_CONSTANT is an integer constant with the specified size and value.
   ///
   /// \pre setBasicBlock or setMI must have been called.
@@ -475,7 +493,7 @@
   /// \pre setBasicBlock or setMI must have been called.
   /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
   ///      with the same type.
-  /// \pre \p Tst must be a generic virtual register with scalar or
+  /// \pre \p Tst must be a generic virtual register with scalar, pointer or
   ///      vector type. If vector then it must have the same number of
   ///      elements as the other parameters.
   ///
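
A hedged usage sketch of the reworked insertion-point API and the new ConstantInt overload of buildConstant; MF, MBB, Res, and CI stand for the MachineFunction, basic block, destination virtual register, and IR constant assumed to be in scope:

    MachineIRBuilder MIRBuilder;
    MIRBuilder.setMF(MF);                       // choose the function first
    MIRBuilder.setMBB(MBB);                     // insert at the end of MBB...
    MIRBuilder.setInsertPt(MBB, MBB.begin());   // ...or at an explicit iterator
    MIRBuilder.buildConstant(Res, CI);          // Res = G_CONSTANT CI
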
diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h
index 96660de..15bdefb 100644
--- a/include/llvm/CodeGen/LivePhysRegs.h
+++ b/include/llvm/CodeGen/LivePhysRegs.h
@@ -60,11 +60,10 @@
   }
 
   /// \brief Clear and initialize the LivePhysRegs set.
-  void init(const TargetRegisterInfo *TRI) {
-    assert(TRI && "Invalid TargetRegisterInfo pointer.");
-    this->TRI = TRI;
+  void init(const TargetRegisterInfo &TRI) {
+    this->TRI = &TRI;
     LiveRegs.clear();
-    LiveRegs.setUniverse(TRI->getNumRegs());
+    LiveRegs.setUniverse(TRI.getNumRegs());
   }
 
   /// \brief Clears the LivePhysRegs set.
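
Callers now pass the register info by reference rather than by pointer; a small sketch, assuming MF is the current MachineFunction:

    LivePhysRegs LiveRegs;
    LiveRegs.init(*MF.getSubtarget().getRegisterInfo());
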
diff --git a/include/llvm/Config/abi-breaking.h.cmake b/include/llvm/Config/abi-breaking.h.cmake
index 303625a..e5697f7 100644
--- a/include/llvm/Config/abi-breaking.h.cmake
+++ b/include/llvm/Config/abi-breaking.h.cmake
@@ -15,6 +15,11 @@
 /* Define to enable checks that alter the LLVM C++ ABI */
 #cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS
 
+/* Define to disable the link-time mismatch check for
+   LLVM_ENABLE_ABI_BREAKING_CHECKS */
+#cmakedefine01 LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
+#if !LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
+
 // ABI_BREAKING_CHECKS protection: provides a link-time failure when a client's
 // build does not match LLVM's
 #if defined(_MSC_VER)
@@ -38,4 +43,6 @@
 }
 #endif // _MSC_VER
 
+#endif // LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
+
 #endif
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index ebe33d5..fe87829 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -17,10 +17,10 @@
 #cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
 
 /* Define to 1 if you have the <CrashReporterClient.h> header file. */
-#undef HAVE_CRASHREPORTERCLIENT_H
+#cmakedefine HAVE_CRASHREPORTERCLIENT_H
 
 /* can use __crashreporter_info__ */
-#undef HAVE_CRASHREPORTER_INFO
+#cmakedefine01 HAVE_CRASHREPORTER_INFO
 
 /* Define to 1 if you have the declaration of `arc4random', and to 0 if you
    don't. */
@@ -120,6 +120,9 @@
 /* Define to 1 if you have the <link.h> header file. */
 #cmakedefine HAVE_LINK_H ${HAVE_LINK_H}
 
+/* Define to 1 if you have the `lseek64' function. */
+#cmakedefine HAVE_LSEEK64 ${HAVE_LSEEK64}
+
 /* Define to 1 if you have the <mach/mach.h> header file. */
 #cmakedefine HAVE_MACH_MACH_H ${HAVE_MACH_MACH_H}
 
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
index 67c4a2b..f732dee 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
@@ -23,6 +23,9 @@
   uint32_t FirstAbbrCode;
   std::vector<DWARFAbbreviationDeclaration> Decls;
 
+  typedef std::vector<DWARFAbbreviationDeclaration>::const_iterator
+      const_iterator;
+
 public:
   DWARFAbbreviationDeclarationSet();
 
@@ -33,6 +36,14 @@
   const DWARFAbbreviationDeclaration *
   getAbbreviationDeclaration(uint32_t AbbrCode) const;
 
+  const_iterator begin() const {
+    return Decls.begin();
+  }
+
+  const_iterator end() const {
+    return Decls.end();
+  }
+
 private:
   void clear();
 };
@@ -53,6 +64,14 @@
   void dump(raw_ostream &OS) const;
   void extract(DataExtractor Data);
 
+  DWARFAbbreviationDeclarationSetMap::const_iterator begin() const {
+    return AbbrDeclSets.begin();
+  }
+
+  DWARFAbbreviationDeclarationSetMap::const_iterator end() const {
+    return AbbrDeclSets.end();
+  }
+
 private:
   void clear();
 };
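
The new begin()/end() pairs make both the abbreviation map and each declaration set range-iterable; a hedged sketch, assuming Abbrev is a populated DWARFDebugAbbrev and using llvm::outs() with the existing DWARFAbbreviationDeclaration::dump():

    for (const auto &Entry : Abbrev)         // (offset, declaration set) pairs
      for (const auto &Decl : Entry.second)  // DWARFAbbreviationDeclaration
        Decl.dump(outs());
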
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
index 837a8e6..5a60239 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
@@ -59,6 +59,8 @@
 
   uint32_t getCompileUnitDIEOffset() const { return HeaderData.CuOffset; }
 
+  const Header &getHeader() const { return HeaderData; }
+
   desc_iterator_range descriptors() const {
     return desc_iterator_range(ArangeDescriptors.begin(),
                                ArangeDescriptors.end());
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
index 5e3a079..83102e6 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
@@ -105,6 +105,10 @@
                                       dwarf::Attribute Attr,
                                       uint64_t FailValue) const;
 
+  int64_t getAttributeValueAsSignedConstant(const DWARFUnit *U,
+                                            dwarf::Attribute Attr,
+                                            int64_t FailValue) const;
+
   uint64_t getAttributeValueAsUnsignedConstant(const DWARFUnit *U,
                                                dwarf::Attribute Attr,
                                                uint64_t FailValue) const;
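
A one-line hedged example of the new signed-constant accessor, mirroring the unsigned variant below it; DIE and U are assumed to be the debug info entry and its DWARFUnit:

    int64_t Val = DIE.getAttributeValueAsSignedConstant(
        U, dwarf::DW_AT_const_value, /*FailValue=*/0);
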
diff --git a/include/llvm/DebugInfo/PDB/Raw/PDBFile.h b/include/llvm/DebugInfo/PDB/Raw/PDBFile.h
index ede5eaf..29f5b21 100644
--- a/include/llvm/DebugInfo/PDB/Raw/PDBFile.h
+++ b/include/llvm/DebugInfo/PDB/Raw/PDBFile.h
@@ -26,7 +26,6 @@
 
 namespace msf {
 class MappedBlockStream;
-class WritableStream;
 }
 
 namespace pdb {
@@ -96,7 +95,20 @@
 
   BumpPtrAllocator &getAllocator() { return Allocator; }
 
-private:
+  bool hasPDBDbiStream() const;
+  bool hasPDBGlobalsStream();
+  bool hasPDBInfoStream();
+  bool hasPDBIpiStream() const;
+  bool hasPDBPublicsStream();
+  bool hasPDBSymbolStream();
+  bool hasPDBTpiStream() const;
+  bool hasStringTable();
+
+private:
+  Expected<std::unique_ptr<msf::MappedBlockStream>> safelyCreateIndexedStream(
+      const msf::MSFLayout &Layout, const msf::ReadableStream &MsfData,
+      uint32_t StreamIndex) const;
+
   BumpPtrAllocator &Allocator;
 
   std::unique_ptr<msf::ReadableStream> Buffer;
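
The new hasPDB*Stream() predicates let callers probe for optional streams before loading them; a hedged sketch, assuming the existing getPDBDbiStream() accessor returning an Expected<> result:

    if (File.hasPDBDbiStream()) {
      auto DbiOrErr = File.getPDBDbiStream();
      if (!DbiOrErr)
        consumeError(DbiOrErr.takeError());  // present but failed to load
    }
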
diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 50eedec..93dbd57 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -14,21 +14,24 @@
 #ifndef LLVM_IR_BASICBLOCK_H
 #define LLVM_IR_BASICBLOCK_H
 
-#include "llvm/ADT/Twine.h"
 #include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Support/CBindingWrapping.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm-c/Types.h"
+#include <cassert>
+#include <cstddef>
 
 namespace llvm {
 
 class CallInst;
-class LandingPadInst;
-class TerminatorInst;
-class LLVMContext;
-class BlockAddress;
 class Function;
+class LandingPadInst;
+class LLVMContext;
+class TerminatorInst;
 
 /// \brief LLVM Basic Block Representation
 ///
@@ -47,19 +50,17 @@
 /// are "well formed".
 class BasicBlock : public Value, // Basic blocks are data objects also
                    public ilist_node_with_parent<BasicBlock, Function> {
-  friend class BlockAddress;
 public:
   typedef SymbolTableList<Instruction> InstListType;
 
 private:
+  friend class BlockAddress;
+  friend class SymbolTableListTraits<BasicBlock>;
+
   InstListType InstList;
   Function *Parent;
 
   void setParent(Function *parent);
-  friend class SymbolTableListTraits<BasicBlock>;
-
-  BasicBlock(const BasicBlock &) = delete;
-  void operator=(const BasicBlock &) = delete;
 
   /// \brief Constructor.
   ///
@@ -69,7 +70,12 @@
   explicit BasicBlock(LLVMContext &C, const Twine &Name = "",
                       Function *Parent = nullptr,
                       BasicBlock *InsertBefore = nullptr);
+
 public:
+  BasicBlock(const BasicBlock &) = delete;
+  BasicBlock &operator=(const BasicBlock &) = delete;
+  ~BasicBlock() override;
+
   /// \brief Get the context in which this basic block lives.
   LLVMContext &getContext() const;
 
@@ -89,7 +95,6 @@
                             BasicBlock *InsertBefore = nullptr) {
     return new BasicBlock(Context, Name, Parent, InsertBefore);
   }
-  ~BasicBlock() override;
 
   /// \brief Return the enclosing method, or null if none.
   const Function *getParent() const { return Parent; }
@@ -330,6 +335,7 @@
     assert((int)(signed char)getSubclassDataFromValue() >= 0 &&
            "Refcount wrap-around");
   }
+
   /// \brief Shadow Value::setValueSubclassData with a private forwarding method
   /// so that any future subclasses cannot accidentally use it.
   void setValueSubclassData(unsigned short D) {
@@ -340,6 +346,6 @@
 // Create wrappers for C Binding types (see CBindingWrapping.h).
 DEFINE_SIMPLE_CONVERSION_FUNCTIONS(BasicBlock, LLVMBasicBlockRef)
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_BASICBLOCK_H
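
Several headers in this patch move deleted copy operations from the private to the public section; a generic sketch of the pattern (Example is a made-up name), which turns "is private" diagnostics into clearer "is deleted" ones:

    class Example {
    public:
      Example(const Example &) = delete;
      Example &operator=(const Example &) = delete;
    };
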
diff --git a/include/llvm/IR/Comdat.h b/include/llvm/IR/Comdat.h
index 577247f..f4a391c 100644
--- a/include/llvm/IR/Comdat.h
+++ b/include/llvm/IR/Comdat.h
@@ -36,7 +36,9 @@
     SameSize,     ///< The data referenced by the COMDAT must be the same size.
   };
 
+  Comdat(const Comdat &) = delete;
   Comdat(Comdat &&C);
+
   SelectionKind getSelectionKind() const { return SK; }
   void setSelectionKind(SelectionKind Val) { SK = Val; }
   StringRef getName() const;
@@ -45,8 +47,8 @@
 
 private:
   friend class Module;
+
   Comdat();
-  Comdat(const Comdat &) = delete;
 
   // Points to the map in Module.
   StringMapEntry<Comdat> *Name;
@@ -58,6 +60,6 @@
   return OS;
 }
 
-} // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_COMDAT_H
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 391bef6..6137956 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -15,21 +15,27 @@
 #ifndef LLVM_IR_DIBUILDER_H
 #define LLVM_IR_DIBUILDER_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/TrackingMDRef.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cstdint>
 
 namespace llvm {
+
   class BasicBlock;
-  class Instruction;
+  class Constant;
   class Function;
+  class Instruction;
+  class LLVMContext;
   class Module;
   class Value;
-  class Constant;
-  class LLVMContext;
-  class StringRef;
-  template <typename T> class ArrayRef;
 
   class DIBuilder {
     Module &M;
@@ -57,9 +63,6 @@
     /// copy.
     DenseMap<MDNode *, SmallVector<TrackingMDNodeRef, 1>> PreservedVariables;
 
-    DIBuilder(const DIBuilder &) = delete;
-    void operator=(const DIBuilder &) = delete;
-
     /// Create a temporary.
     ///
     /// Create an \a temporary node and track it in \a UnresolvedNodes.
@@ -71,6 +74,8 @@
     /// If \c AllowUnresolved, collect unresolved nodes attached to the module
     /// in order to resolve cycles during \a finalize().
     explicit DIBuilder(Module &M, bool AllowUnresolved = true);
+    DIBuilder(const DIBuilder &) = delete;
+    DIBuilder &operator=(const DIBuilder &) = delete;
 
     /// Construct any deferred debug info descriptors.
     void finalize();
@@ -223,7 +228,7 @@
     DIDerivedType *createStaticMemberType(DIScope *Scope, StringRef Name,
                                           DIFile *File, unsigned LineNo,
                                           DIType *Ty, DINode::DIFlags Flags,
-                                          llvm::Constant *Val,
+                                          Constant *Val,
                                           uint32_t AlignInBits = 0);
 
     /// Create debugging information entry for Objective-C
@@ -517,7 +522,7 @@
     ///
     /// \param OffsetInBits Offset of the piece in bits.
     /// \param SizeInBits   Size of the piece in bits.
-    DIExpression *createBitPieceExpression(unsigned OffsetInBits,
+    DIExpression *createFragmentExpression(unsigned OffsetInBits,
                                            unsigned SizeInBits);
 
     /// Create an expression for a variable that does not have an address, but
@@ -742,6 +747,7 @@
       return Replacement;
     }
   };
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_IR_DIBUILDER_H
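
A hedged sketch of the renamed fragment API, assuming DIB is an existing DIBuilder; this describes bits [64, 96) of a larger variable:

    DIExpression *Frag =
        DIB.createFragmentExpression(/*OffsetInBits=*/64, /*SizeInBits=*/32);
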
diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h
index 15756ac..bda66ae 100644
--- a/include/llvm/IR/DebugInfoMetadata.h
+++ b/include/llvm/IR/DebugInfoMetadata.h
@@ -1955,13 +1955,13 @@
   }
 
   /// Return whether this is a piece of an aggregate variable.
-  bool isBitPiece() const;
+  bool isFragment() const;
 
-  /// Return the offset of this piece in bits.
-  uint64_t getBitPieceOffset() const;
+  /// Return the offset of this fragment in bits.
+  uint64_t getFragmentOffsetInBits() const;
 
-  /// Return the size of this piece in bits.
-  uint64_t getBitPieceSize() const;
+  /// Return the size of this fragment in bits.
+  uint64_t getFragmentSizeInBits() const;
 
   typedef ArrayRef<uint64_t>::iterator element_iterator;
   element_iterator elements_begin() const { return getElements().begin(); }
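
The matching query side renamed just above can be used like this; a small sketch, assuming Expr is a DIExpression*:

    if (Expr->isFragment())
      errs() << "fragment at bit " << Expr->getFragmentOffsetInBits() << ", "
             << Expr->getFragmentSizeInBits() << " bits wide\n";
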
diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h
index 8892d3c..05e9915 100644
--- a/include/llvm/IR/DerivedTypes.h
+++ b/include/llvm/IR/DerivedTypes.h
@@ -18,17 +18,19 @@
 #ifndef LLVM_IR_DERIVEDTYPES_H
 #define LLVM_IR_DERIVEDTYPES_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataTypes.h"
+#include <cassert>
+#include <cstdint>
 
 namespace llvm {
 
 class Value;
 class APInt;
 class LLVMContext;
-template<typename T> class ArrayRef;
-class StringRef;
 
 /// Class to represent integer types. Note that this class is also used to
 /// represent the built-in integer types: Int1Ty, Int8Ty, Int16Ty, Int32Ty and
@@ -98,11 +100,12 @@
 /// Class to represent function types
 ///
 class FunctionType : public Type {
-  FunctionType(const FunctionType &) = delete;
-  const FunctionType &operator=(const FunctionType &) = delete;
   FunctionType(Type *Result, ArrayRef<Type*> Params, bool IsVarArgs);
 
 public:
+  FunctionType(const FunctionType &) = delete;
+  FunctionType &operator=(const FunctionType &) = delete;
+
   /// This static method is the primary way of constructing a FunctionType.
   static FunctionType *get(Type *Result,
                            ArrayRef<Type*> Params, bool isVarArg);
@@ -194,10 +197,9 @@
 /// generator for a target expects).
 ///
 class StructType : public CompositeType {
-  StructType(const StructType &) = delete;
-  const StructType &operator=(const StructType &) = delete;
   StructType(LLVMContext &C)
     : CompositeType(C, StructTyID), SymbolTableEntry(nullptr) {}
+
   enum {
     /// This is the contents of the SubClassData field.
     SCDB_HasBody = 1,
@@ -213,6 +215,9 @@
   void *SymbolTableEntry;
 
 public:
+  StructType(const StructType &) = delete;
+  StructType &operator=(const StructType &) = delete;
+
   /// This creates an identified struct.
   static StructType *create(LLVMContext &Context, StringRef Name);
   static StructType *create(LLVMContext &Context);
@@ -314,8 +319,6 @@
 class SequentialType : public CompositeType {
   Type *ContainedType;               ///< Storage for the single contained type.
   uint64_t NumElements;
-  SequentialType(const SequentialType &) = delete;
-  const SequentialType &operator=(const SequentialType &) = delete;
 
 protected:
   SequentialType(TypeID TID, Type *ElType, uint64_t NumElements)
@@ -326,6 +329,9 @@
   }
 
 public:
+  SequentialType(const SequentialType &) = delete;
+  SequentialType &operator=(const SequentialType &) = delete;
+
   uint64_t getNumElements() const { return NumElements; }
   Type *getElementType() const { return ContainedType; }
 
@@ -337,11 +343,12 @@
 
 /// Class to represent array types.
 class ArrayType : public SequentialType {
-  ArrayType(const ArrayType &) = delete;
-  const ArrayType &operator=(const ArrayType &) = delete;
   ArrayType(Type *ElType, uint64_t NumEl);
 
 public:
+  ArrayType(const ArrayType &) = delete;
+  ArrayType &operator=(const ArrayType &) = delete;
+
   /// This static method is the primary way to construct an ArrayType
   static ArrayType *get(Type *ElementType, uint64_t NumElements);
 
@@ -360,11 +367,12 @@
 
 /// Class to represent vector types.
 class VectorType : public SequentialType {
-  VectorType(const VectorType &) = delete;
-  const VectorType &operator=(const VectorType &) = delete;
   VectorType(Type *ElType, unsigned NumEl);
 
 public:
+  VectorType(const VectorType &) = delete;
+  VectorType &operator=(const VectorType &) = delete;
+
   /// This static method is the primary way to construct an VectorType.
   static VectorType *get(Type *ElementType, unsigned NumElements);
 
@@ -433,13 +441,14 @@
 
 /// Class to represent pointers.
 class PointerType : public Type {
-  PointerType(const PointerType &) = delete;
-  const PointerType &operator=(const PointerType &) = delete;
   explicit PointerType(Type *ElType, unsigned AddrSpace);
 
   Type *PointeeTy;
 
 public:
+  PointerType(const PointerType &) = delete;
+  PointerType &operator=(const PointerType &) = delete;
+
   /// This constructs a pointer to an object of the specified type in a numbered
   /// address space.
   static PointerType *get(Type *ElementType, unsigned AddressSpace);
@@ -471,6 +480,6 @@
   return cast<PointerType>(getScalarType())->getAddressSpace();
 }
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_DERIVEDTYPES_H
diff --git a/include/llvm/IR/GetElementPtrTypeIterator.h b/include/llvm/IR/GetElementPtrTypeIterator.h
index 75caee0..490bff2 100644
--- a/include/llvm/IR/GetElementPtrTypeIterator.h
+++ b/include/llvm/IR/GetElementPtrTypeIterator.h
@@ -34,7 +34,7 @@
 
     ItTy OpIt;
     PointerUnion<StructType *, Type *> CurTy;
-    enum { Unbounded = -1ull };
+    enum : uint64_t { Unbounded = -1ull };
     uint64_t NumElements = Unbounded;
     generic_gep_type_iterator() = default;
 
diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h
index 3727c10..37a291d 100644
--- a/include/llvm/IR/GlobalAlias.h
+++ b/include/llvm/IR/GlobalAlias.h
@@ -17,6 +17,7 @@
 
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/Value.h"
 
 namespace llvm {
 
@@ -27,13 +28,14 @@
 class GlobalAlias : public GlobalIndirectSymbol,
                     public ilist_node<GlobalAlias> {
   friend class SymbolTableListTraits<GlobalAlias>;
-  void operator=(const GlobalAlias &) = delete;
-  GlobalAlias(const GlobalAlias &) = delete;
 
   GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
               const Twine &Name, Constant *Aliasee, Module *Parent);
 
 public:
+  GlobalAlias(const GlobalAlias &) = delete;
+  GlobalAlias &operator=(const GlobalAlias &) = delete;
+
   /// If a parent module is specified, the alias is automatically inserted into
   /// the end of the specified module's alias list.
   static GlobalAlias *create(Type *Ty, unsigned AddressSpace,
@@ -87,6 +89,6 @@
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_GLOBALALIAS_H
diff --git a/include/llvm/IR/GlobalIFunc.h b/include/llvm/IR/GlobalIFunc.h
index afe6b3d..bfaa996 100644
--- a/include/llvm/IR/GlobalIFunc.h
+++ b/include/llvm/IR/GlobalIFunc.h
@@ -20,6 +20,7 @@
 
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/Value.h"
 
 namespace llvm {
 
@@ -32,13 +33,14 @@
 class GlobalIFunc final : public GlobalIndirectSymbol,
                           public ilist_node<GlobalIFunc> {
   friend class SymbolTableListTraits<GlobalIFunc>;
-  void operator=(const GlobalIFunc &) = delete;
-  GlobalIFunc(const GlobalIFunc &) = delete;
 
   GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
               const Twine &Name, Constant *Resolver, Module *Parent);
 
 public:
+  GlobalIFunc(const GlobalIFunc &) = delete;
+  GlobalIFunc &operator=(const GlobalIFunc &) = delete;
+
   /// If a parent module is specified, the ifunc is automatically inserted into
   /// the end of the specified module's ifunc list.
   static GlobalIFunc *create(Type *Ty, unsigned AddressSpace,
@@ -69,6 +71,6 @@
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_GLOBALIFUNC_H
diff --git a/include/llvm/IR/GlobalIndirectSymbol.h b/include/llvm/IR/GlobalIndirectSymbol.h
index 8edb3d1..671309e 100644
--- a/include/llvm/IR/GlobalIndirectSymbol.h
+++ b/include/llvm/IR/GlobalIndirectSymbol.h
@@ -16,20 +16,25 @@
 #ifndef LLVM_IR_GLOBALINDIRECTSYMBOL_H
 #define LLVM_IR_GLOBALINDIRECTSYMBOL_H
 
+#include "llvm/IR/GlobalObject.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <cstddef>
 
 namespace llvm {
 
 class GlobalIndirectSymbol : public GlobalValue {
-  void operator=(const GlobalIndirectSymbol &) = delete;
-  GlobalIndirectSymbol(const GlobalIndirectSymbol &) = delete;
-
 protected:
   GlobalIndirectSymbol(Type *Ty, ValueTy VTy, unsigned AddressSpace,
       LinkageTypes Linkage, const Twine &Name, Constant *Symbol);
 
 public:
+  GlobalIndirectSymbol(const GlobalIndirectSymbol &) = delete;
+  GlobalIndirectSymbol &operator=(const GlobalIndirectSymbol &) = delete;
+
   // allocate space for exactly one operand
   void *operator new(size_t s) {
     return User::operator new(s, 1);
@@ -79,6 +84,6 @@
 
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GlobalIndirectSymbol, Constant)
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_GLOBALINDIRECTSYMBOL_H
diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h
index 04737a0..11eb713 100644
--- a/include/llvm/IR/GlobalObject.h
+++ b/include/llvm/IR/GlobalObject.h
@@ -15,18 +15,19 @@
 #ifndef LLVM_IR_GLOBALOBJECT_H
 #define LLVM_IR_GLOBALOBJECT_H
 
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Value.h"
+#include <string>
+#include <utility>
 
 namespace llvm {
+
 class Comdat;
 class MDNode;
 class Metadata;
-class Module;
 
 class GlobalObject : public GlobalValue {
-  GlobalObject(const GlobalObject &) = delete;
-
 protected:
   GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
                LinkageTypes Linkage, const Twine &Name,
@@ -53,6 +54,8 @@
   static const unsigned GlobalObjectMask = (1 << GlobalObjectBits) - 1;
 
 public:
+  GlobalObject(const GlobalObject &) = delete;
+
   unsigned getAlignment() const {
     unsigned Data = getGlobalValueSubClassData();
     unsigned AlignmentData = Data & AlignmentMask;
@@ -141,6 +144,6 @@
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_GLOBALOBJECT_H
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index a46151b..9397007 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -18,24 +18,31 @@
 #ifndef LLVM_IR_GLOBALVALUE_H
 #define LLVM_IR_GLOBALVALUE_H
 
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Support/MD5.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
 
 namespace llvm {
 
 class Comdat;
+class ConstantRange;
 class Error;
 class GlobalObject;
-class PointerType;
 class Module;
 
 namespace Intrinsic {
   enum ID : unsigned;
-}
+} // end namespace Intrinsic
 
 class GlobalValue : public Constant {
-  GlobalValue(const GlobalValue &) = delete;
 public:
   /// @brief An enumeration for the kinds of linkage for global values.
   enum LinkageTypes {
@@ -90,11 +97,12 @@
   static const unsigned GlobalValueSubClassDataBits = 19;
 
 private:
+  friend class Constant;
+
   // Give subclasses access to what otherwise would be wasted padding.
   // (19 + 4 + 2 + 2 + 2 + 3) == 32.
   unsigned SubClassData : GlobalValueSubClassDataBits;
 
-  friend class Constant;
   void destroyConstantImpl();
   Value *handleOperandChangeImpl(Value *From, Value *To);
 
@@ -155,6 +163,8 @@
     LocalExecTLSModel
   };
 
+  GlobalValue(const GlobalValue &) = delete;
+
   ~GlobalValue() override {
     removeDeadConstantUsers();   // remove any dead constants using this.
   }
@@ -502,6 +512,13 @@
   }
   GlobalObject *getBaseObject();
 
+  /// Returns whether this is a reference to an absolute symbol.
+  bool isAbsoluteSymbolRef() const;
+
+  /// If this is an absolute symbol reference, returns the range of the symbol,
+  /// otherwise returns None.
+  Optional<ConstantRange> getAbsoluteSymbolRange() const;
+
   /// This method unlinks 'this' from the containing module, but does not delete
   /// it.
   virtual void removeFromParent() = 0;
@@ -522,6 +539,6 @@
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_GLOBALVALUE_H
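
A hedged sketch of the new absolute-symbol helpers, assuming GV is a GlobalValue and that ConstantRange's raw_ostream operator is available:

    if (Optional<ConstantRange> Range = GV.getAbsoluteSymbolRange())
      errs() << GV.getName() << " is absolute, range " << *Range << "\n";
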
diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h
index 3787f03..32281e1 100644
--- a/include/llvm/IR/GlobalVariable.h
+++ b/include/llvm/IR/GlobalVariable.h
@@ -24,31 +24,28 @@
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/IR/GlobalObject.h"
 #include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Value.h"
+#include <cassert>
+#include <cstddef>
 
 namespace llvm {
 
 class Constant;
 class DIGlobalVariable;
 class Module;
+
 template <typename ValueSubClass> class SymbolTableListTraits;
 
 class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
   friend class SymbolTableListTraits<GlobalVariable>;
-  void *operator new(size_t, unsigned) = delete;
-  void operator=(const GlobalVariable &) = delete;
-  GlobalVariable(const GlobalVariable &) = delete;
 
   bool isConstantGlobal : 1;                   // Is this a global constant?
   bool isExternallyInitializedConstant : 1;    // Is this a global whose value
                                                // can change from its initial
                                                // value before global
                                                // initializers are run?
-public:
-  // allocate space for exactly one operand
-  void *operator new(size_t s) {
-    return User::operator new(s, 1);
-  }
 
+public:
   /// GlobalVariable ctor - If a parent module is specified, the global is
   /// automatically inserted into the end of the specified modules global list.
   GlobalVariable(Type *Ty, bool isConstant, LinkageTypes Linkage,
@@ -62,6 +59,8 @@
                  const Twine &Name = "", GlobalVariable *InsertBefore = nullptr,
                  ThreadLocalMode = NotThreadLocal, unsigned AddressSpace = 0,
                  bool isExternallyInitialized = false);
+  GlobalVariable(const GlobalVariable &) = delete;
+  GlobalVariable &operator=(const GlobalVariable &) = delete;
 
   ~GlobalVariable() override {
     dropAllReferences();
@@ -70,6 +69,13 @@
     setGlobalVariableNumOperands(1);
   }
 
+  // allocate space for exactly one operand
+  void *operator new(size_t s) {
+    return User::operator new(s, 1);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Provide fast operand accessors
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 
@@ -180,6 +186,6 @@
 
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GlobalVariable, Value)
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_GLOBALVARIABLE_H
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index a4c4096..1d9c169 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -17,7 +17,6 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/None.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/BasicBlock.h"
@@ -45,6 +44,8 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <algorithm>
+#include <functional>
 
 namespace llvm {
 
@@ -101,7 +102,7 @@
 public:
   IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = nullptr,
                 ArrayRef<OperandBundleDef> OpBundles = None)
-      : Context(context), DefaultFPMathTag(FPMathTag), FMF(),
+      : Context(context), DefaultFPMathTag(FPMathTag),
         DefaultOperandBundles(OpBundles) {
     ClearInsertionPoint();
   }
@@ -165,12 +166,12 @@
 
   /// InsertPoint - A saved insertion point.
   class InsertPoint {
-    BasicBlock *Block;
+    BasicBlock *Block = nullptr;
     BasicBlock::iterator Point;
 
   public:
     /// \brief Creates a new insertion point which doesn't point to anything.
-    InsertPoint() : Block(nullptr) {}
+    InsertPoint() = default;
 
     /// \brief Creates a new insertion point at the given location.
     InsertPoint(BasicBlock *InsertBlock, BasicBlock::iterator InsertPoint)
@@ -179,8 +180,8 @@
     /// \brief Returns true if this insert point is set.
     bool isSet() const { return (Block != nullptr); }
 
-    llvm::BasicBlock *getBlock() const { return Block; }
-    llvm::BasicBlock::iterator getPoint() const { return Point; }
+    BasicBlock *getBlock() const { return Block; }
+    BasicBlock::iterator getPoint() const { return Point; }
   };
 
   /// \brief Returns the current insert point.
@@ -230,14 +231,14 @@
     BasicBlock::iterator Point;
     DebugLoc DbgLoc;
 
-    InsertPointGuard(const InsertPointGuard &) = delete;
-    InsertPointGuard &operator=(const InsertPointGuard &) = delete;
-
   public:
     InsertPointGuard(IRBuilderBase &B)
         : Builder(B), Block(B.GetInsertBlock()), Point(B.GetInsertPoint()),
           DbgLoc(B.getCurrentDebugLocation()) {}
 
+    InsertPointGuard(const InsertPointGuard &) = delete;
+    InsertPointGuard &operator=(const InsertPointGuard &) = delete;
+
     ~InsertPointGuard() {
       Builder.restoreIP(InsertPoint(Block, Point));
       Builder.SetCurrentDebugLocation(DbgLoc);
@@ -251,14 +252,13 @@
     FastMathFlags FMF;
     MDNode *FPMathTag;
 
-    FastMathFlagGuard(const FastMathFlagGuard &) = delete;
-    FastMathFlagGuard &operator=(
-        const FastMathFlagGuard &) = delete;
-
   public:
     FastMathFlagGuard(IRBuilderBase &B)
         : Builder(B), FMF(B.FMF), FPMathTag(B.DefaultFPMathTag) {}
 
+    FastMathFlagGuard(const FastMathFlagGuard &) = delete;
+    FastMathFlagGuard &operator=(const FastMathFlagGuard &) = delete;
+
     ~FastMathFlagGuard() {
       Builder.FMF = FMF;
       Builder.DefaultFPMathTag = FPMathTag;
@@ -1035,7 +1035,7 @@
     if (Constant *LC = dyn_cast<Constant>(LHS))
       if (Constant *RC = dyn_cast<Constant>(RHS))
         return Insert(Folder.CreateBinOp(Opc, LC, RC), Name);
-    llvm::Instruction *BinOp = BinaryOperator::Create(Opc, LHS, RHS);
+    Instruction *BinOp = BinaryOperator::Create(Opc, LHS, RHS);
     if (isa<FPMathOperator>(BinOp))
       BinOp = AddFPMathAttributes(BinOp, FPMathTag, FMF);
     return Insert(BinOp, Name);
@@ -1445,12 +1445,6 @@
     return CreateBitCast(V, DestTy, Name);
   }
 
-private:
-  // \brief Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a
-  // compile time error, instead of converting the string to bool for the
-  // isSigned parameter.
-  Value *CreateIntCast(Value *, Type *, const char *) = delete;
-
 public:
   Value *CreateFPCast(Value *V, Type *DestTy, const Twine &Name = "") {
     if (V->getType() == DestTy)
@@ -1460,6 +1454,11 @@
     return Insert(CastInst::CreateFPCast(V, DestTy), Name);
   }
 
+  // \brief Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a
+  // compile time error, instead of converting the string to bool for the
+  // isSigned parameter.
+  Value *CreateIntCast(Value *, Type *, const char *) = delete;
+
   //===--------------------------------------------------------------------===//
   // Instruction creation methods: Compare Instructions
   //===--------------------------------------------------------------------===//
@@ -1584,7 +1583,7 @@
     return CreateCall(FTy, Callee, Args, Name, FPMathTag);
   }
 
-  CallInst *CreateCall(llvm::FunctionType *FTy, Value *Callee,
+  CallInst *CreateCall(FunctionType *FTy, Value *Callee,
                        ArrayRef<Value *> Args, const Twine &Name = "",
                        MDNode *FPMathTag = nullptr) {
     CallInst *CI = CallInst::Create(FTy, Callee, Args, DefaultOperandBundles);
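
The RAII guards whose deleted copies move public here are used like this; a hedged sketch, assuming Builder is an IRBuilder<> and EntryBB a BasicBlock*:

    {
      IRBuilderBase::InsertPointGuard Guard(Builder);
      Builder.SetInsertPoint(EntryBB, EntryBB->begin());
      Builder.CreateAlloca(Builder.getInt32Ty());  // emitted in the entry block
    } // insertion point and debug location restored here
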
diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h
index 40ba830..f95509b 100644
--- a/include/llvm/IR/InlineAsm.h
+++ b/include/llvm/IR/InlineAsm.h
@@ -18,15 +18,14 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Value.h"
+#include <cassert>
+#include <string>
 #include <vector>
 
 namespace llvm {
 
-class PointerType;
 class FunctionType;
-class Module;
-
-struct InlineAsmKeyType;
+class PointerType;
 template <class ConstantClass> class ConstantUniqueMap;
 
 class InlineAsm : public Value {
@@ -40,9 +39,6 @@
   friend struct InlineAsmKeyType;
   friend class ConstantUniqueMap<InlineAsm>;
 
-  InlineAsm(const InlineAsm &) = delete;
-  void operator=(const InlineAsm&) = delete;
-
   std::string AsmString, Constraints;
   FunctionType *FTy;
   bool HasSideEffects;
@@ -59,6 +55,9 @@
   void destroyConstant();
 
 public:
+  InlineAsm(const InlineAsm &) = delete;
+  InlineAsm &operator=(const InlineAsm &) = delete;
+
   /// InlineAsm::get - Return the specified uniqued inline asm string.
   ///
   static InlineAsm *get(FunctionType *Ty, StringRef AsmString,
@@ -361,6 +360,6 @@
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_INLINEASM_H
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index c4fb836..e408bcb 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -17,23 +17,27 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/SymbolTableListTraits.h"
 #include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
 
-class FastMathFlags;
-class LLVMContext;
-class MDNode;
 class BasicBlock;
+class FastMathFlags;
+class MDNode;
 struct AAMDNodes;
 
 class Instruction : public User,
                     public ilist_node_with_parent<Instruction, BasicBlock> {
-  void operator=(const Instruction &) = delete;
-  Instruction(const Instruction &) = delete;
-
   BasicBlock *Parent;
   DebugLoc DbgLoc;                         // 'dbg' Metadata cache.
 
@@ -42,7 +46,11 @@
     /// this instruction has metadata attached to it or not.
     HasMetadataBit = 1 << 15
   };
+
 public:
+  Instruction(const Instruction &) = delete;
+  Instruction &operator=(const Instruction &) = delete;
+
   // Out of line virtual method, so the vtable, etc has a home.
   ~Instruction() override;
 
@@ -352,12 +360,12 @@
       SmallVectorImpl<std::pair<unsigned, MDNode *>> &) const;
   /// Clear all hashtable-based metadata from this instruction.
   void clearMetadataHashEntries();
+
 public:
   //===--------------------------------------------------------------------===//
   // Predicates and helper methods.
   //===--------------------------------------------------------------------===//
 
-
   /// Return true if the instruction is associative:
   ///
   ///   Associative operators satisfy:  x op (y op z) === (x op y) op z
@@ -546,12 +554,16 @@
 #define   LAST_OTHER_INST(N)             OtherOpsEnd = N+1
 #include "llvm/IR/Instruction.def"
   };
+
 private:
+  friend class SymbolTableListTraits<Instruction>;
+
   // Shadow Value::setValueSubclassData with a private forwarding method so that
   // subclasses cannot accidentally use it.
   void setValueSubclassData(unsigned short D) {
     Value::setValueSubclassData(D);
   }
+
   unsigned short getSubclassDataFromValue() const {
     return Value::getSubclassDataFromValue();
   }
@@ -561,8 +573,8 @@
                          (V ? HasMetadataBit : 0));
   }
 
-  friend class SymbolTableListTraits<Instruction>;
   void setParent(BasicBlock *P);
+
 protected:
   // Instruction subclasses can stick up to 15 bits of stuff into the
   // SubclassData field of instruction with these members.
@@ -591,14 +603,17 @@
 template<>
 class PointerLikeTypeTraits<Instruction*> {
   typedef Instruction* PT;
+
 public:
   static inline void *getAsVoidPointer(PT P) { return P; }
+
   static inline PT getFromVoidPointer(void *P) {
     return static_cast<PT>(P);
   }
+
   enum { NumLowBitsAvailable = 2 };
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_INSTRUCTION_H
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 19ca4e4..a5d78a0 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -17,23 +17,33 @@
 #define LLVM_IR_INSTRUCTIONS_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
-#include <iterator>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
 
 namespace llvm {
 
 class APInt;
 class ConstantInt;
-class ConstantRange;
 class DataLayout;
 class LLVMContext;
 
@@ -53,6 +63,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   AllocaInst *cloneImpl() const;
 
 public:
@@ -156,6 +167,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   LoadInst *cloneImpl() const;
 
 public:
@@ -190,7 +202,6 @@
            unsigned Align, AtomicOrdering Order,
            SynchronizationScope SynchScope,
            BasicBlock *InsertAtEnd);
-
   LoadInst(Value *Ptr, const char *NameStr, Instruction *InsertBefore);
   LoadInst(Value *Ptr, const char *NameStr, BasicBlock *InsertAtEnd);
   LoadInst(Type *Ty, Value *Ptr, const char *NameStr = nullptr,
@@ -287,19 +298,15 @@
 
 /// An instruction for storing to memory.
 class StoreInst : public Instruction {
-  void *operator new(size_t, unsigned) = delete;
   void AssertOK();
 
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   StoreInst *cloneImpl() const;
 
 public:
-  // allocate space for exactly two operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 2);
-  }
   StoreInst(Value *Val, Value *Ptr, Instruction *InsertBefore);
   StoreInst(Value *Val, Value *Ptr, BasicBlock *InsertAtEnd);
   StoreInst(Value *Val, Value *Ptr, bool isVolatile = false,
@@ -318,6 +325,13 @@
             SynchronizationScope SynchScope,
             BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Return true if this is a store to a volatile memory location.
   bool isVolatile() const { return getSubclassDataFromInstruction() & 1; }
 
@@ -414,20 +428,15 @@
 
 /// An instruction for ordering other memory operations.
 class FenceInst : public Instruction {
-  void *operator new(size_t, unsigned) = delete;
   void Init(AtomicOrdering Ordering, SynchronizationScope SynchScope);
 
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   FenceInst *cloneImpl() const;
 
 public:
-  // allocate space for exactly zero operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 0);
-  }
-
   // Ordering may only be Acquire, Release, AcquireRelease, or
   // SequentiallyConsistent.
   FenceInst(LLVMContext &C, AtomicOrdering Ordering,
@@ -437,6 +446,13 @@
             SynchronizationScope SynchScope,
             BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly zero operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 0);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Returns the ordering effect of this fence.
   AtomicOrdering getOrdering() const {
     return AtomicOrdering(getSubclassDataFromInstruction() >> 1);
@@ -486,7 +502,6 @@
 /// there.  Returns the value that was loaded.
 ///
 class AtomicCmpXchgInst : public Instruction {
-  void *operator new(size_t, unsigned) = delete;
   void Init(Value *Ptr, Value *Cmp, Value *NewVal,
             AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering,
             SynchronizationScope SynchScope);
@@ -494,13 +509,10 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   AtomicCmpXchgInst *cloneImpl() const;
 
 public:
-  // allocate space for exactly three operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 3);
-  }
   AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
                     AtomicOrdering SuccessOrdering,
                     AtomicOrdering FailureOrdering,
@@ -512,6 +524,13 @@
                     SynchronizationScope SynchScope,
                     BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Return true if this is a cmpxchg from a volatile memory
   /// location.
   ///
@@ -648,11 +667,10 @@
 /// the old value.
 ///
 class AtomicRMWInst : public Instruction {
-  void *operator new(size_t, unsigned) = delete;
-
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   AtomicRMWInst *cloneImpl() const;
 
 public:
@@ -689,10 +707,6 @@
     BAD_BINOP
   };
 
-  // allocate space for exactly two operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 2);
-  }
   AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
                 AtomicOrdering Ordering, SynchronizationScope SynchScope,
                 Instruction *InsertBefore = nullptr);
@@ -700,6 +714,13 @@
                 AtomicOrdering Ordering, SynchronizationScope SynchScope,
                 BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   BinOp getOperation() const {
     return static_cast<BinOp>(getSubclassDataFromInstruction() >> 5);
   }
@@ -776,6 +797,7 @@
 private:
   void Init(BinOp Operation, Value *Ptr, Value *Val,
             AtomicOrdering Ordering, SynchronizationScope SynchScope);
+
   // Shadow Instruction::setInstructionSubclassData with a private forwarding
   // method so that subclasses cannot accidentally use it.
   void setInstructionSubclassData(unsigned short D) {
@@ -828,6 +850,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   GetElementPtrInst *cloneImpl() const;
 
 public:
@@ -846,6 +869,7 @@
     return new (Values) GetElementPtrInst(PointeeType, Ptr, IdxList, Values,
                                           NameStr, InsertBefore);
   }
+
   static GetElementPtrInst *Create(Type *PointeeType, Value *Ptr,
                                    ArrayRef<Value *> IdxList,
                                    const Twine &NameStr,
@@ -870,6 +894,7 @@
                                            Instruction *InsertBefore = nullptr){
     return CreateInBounds(nullptr, Ptr, IdxList, NameStr, InsertBefore);
   }
+
   static GetElementPtrInst *
   CreateInBounds(Type *PointeeType, Value *Ptr, ArrayRef<Value *> IdxList,
                  const Twine &NameStr = "",
@@ -879,12 +904,14 @@
     GEP->setIsInBounds(true);
     return GEP;
   }
+
   static GetElementPtrInst *CreateInBounds(Value *Ptr,
                                            ArrayRef<Value *> IdxList,
                                            const Twine &NameStr,
                                            BasicBlock *InsertAtEnd) {
     return CreateInBounds(nullptr, Ptr, IdxList, NameStr, InsertAtEnd);
   }
+
   static GetElementPtrInst *CreateInBounds(Type *PointeeType, Value *Ptr,
                                            ArrayRef<Value *> IdxList,
                                            const Twine &NameStr,
@@ -1039,6 +1066,7 @@
          cast<PointerType>(getType()->getScalarType())->getElementType());
   init(Ptr, IdxList, NameStr);
 }
+
 GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr,
                                      ArrayRef<Value *> IdxList, unsigned Values,
                                      const Twine &NameStr,
@@ -1081,6 +1109,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical ICmpInst
   ICmpInst *cloneImpl() const;
 
@@ -1211,6 +1240,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical FCmpInst
   FCmpInst *cloneImpl() const;
 
@@ -1322,24 +1352,19 @@
 ///
 class CallInst : public Instruction,
                  public OperandBundleUser<CallInst, User::op_iterator> {
+  friend class OperandBundleUser<CallInst, User::op_iterator>;
+
   AttributeSet AttributeList; ///< parameter attributes for call
   FunctionType *FTy;
+
   CallInst(const CallInst &CI);
-  void init(Value *Func, ArrayRef<Value *> Args,
-            ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
-    init(cast<FunctionType>(
-             cast<PointerType>(Func->getType())->getElementType()),
-         Func, Args, Bundles, NameStr);
-  }
-  void init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
-            ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
-  void init(Value *Func, const Twine &NameStr);
 
   /// Construct a CallInst given a range of arguments.
   /// Construct a CallInst from a range of arguments
   inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
                   ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
                   Instruction *InsertBefore);
+
   inline CallInst(Value *Func, ArrayRef<Value *> Args,
                   ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
                   Instruction *InsertBefore)
@@ -1359,17 +1384,30 @@
 
   explicit CallInst(Value *F, const Twine &NameStr,
                     Instruction *InsertBefore);
+
   CallInst(Value *F, const Twine &NameStr, BasicBlock *InsertAtEnd);
 
-  friend class OperandBundleUser<CallInst, User::op_iterator>;
+  void init(Value *Func, ArrayRef<Value *> Args,
+            ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
+    init(cast<FunctionType>(
+             cast<PointerType>(Func->getType())->getElementType()),
+         Func, Args, Bundles, NameStr);
+  }
+  void init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
+            ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
+  void init(Value *Func, const Twine &NameStr);
+
   bool hasDescriptor() const { return HasDescriptor; }
 
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   CallInst *cloneImpl() const;
 
 public:
+  ~CallInst() override;
+
   static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
                           ArrayRef<OperandBundleDef> Bundles = None,
                           const Twine &NameStr = "",
@@ -1378,6 +1416,7 @@
                       cast<PointerType>(Func->getType())->getElementType()),
                   Func, Args, Bundles, NameStr, InsertBefore);
   }
+
   static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
                           const Twine &NameStr,
                           Instruction *InsertBefore = nullptr) {
@@ -1385,12 +1424,14 @@
                       cast<PointerType>(Func->getType())->getElementType()),
                   Func, Args, None, NameStr, InsertBefore);
   }
+
   static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
                           const Twine &NameStr,
                           Instruction *InsertBefore = nullptr) {
     return new (unsigned(Args.size() + 1))
         CallInst(Ty, Func, Args, None, NameStr, InsertBefore);
   }
+
   static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
                           ArrayRef<OperandBundleDef> Bundles = None,
                           const Twine &NameStr = "",
@@ -1402,6 +1443,7 @@
     return new (TotalOps, DescriptorBytes)
         CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore);
   }
+
   static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
                           ArrayRef<OperandBundleDef> Bundles,
                           const Twine &NameStr, BasicBlock *InsertAtEnd) {
@@ -1412,15 +1454,18 @@
     return new (TotalOps, DescriptorBytes)
         CallInst(Func, Args, Bundles, NameStr, InsertAtEnd);
   }
+
   static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
                           const Twine &NameStr, BasicBlock *InsertAtEnd) {
     return new (unsigned(Args.size() + 1))
         CallInst(Func, Args, None, NameStr, InsertAtEnd);
   }
+
   static CallInst *Create(Value *F, const Twine &NameStr = "",
                           Instruction *InsertBefore = nullptr) {
     return new(1) CallInst(F, NameStr, InsertBefore);
   }
+
   static CallInst *Create(Value *F, const Twine &NameStr,
                           BasicBlock *InsertAtEnd) {
     return new(1) CallInst(F, NameStr, InsertAtEnd);
@@ -1475,8 +1520,6 @@
                                  ArrayRef<OperandBundleDef> Bundles,
                                  BasicBlock *InsertAtEnd);
 
-  ~CallInst() override;
-
   FunctionType *getFunctionType() const { return FTy; }
 
   void mutateFunctionType(FunctionType *FTy) {
@@ -1490,20 +1533,25 @@
   TailCallKind getTailCallKind() const {
     return TailCallKind(getSubclassDataFromInstruction() & 3);
   }
+
   bool isTailCall() const {
     unsigned Kind = getSubclassDataFromInstruction() & 3;
     return Kind == TCK_Tail || Kind == TCK_MustTail;
   }
+
   bool isMustTailCall() const {
     return (getSubclassDataFromInstruction() & 3) == TCK_MustTail;
   }
+
   bool isNoTailCall() const {
     return (getSubclassDataFromInstruction() & 3) == TCK_NoTail;
   }
+
   void setTailCall(bool isTC = true) {
     setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
                                unsigned(isTC ? TCK_Tail : TCK_None));
   }
+
   void setTailCallKind(TailCallKind TCK) {
     setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
                                unsigned(TCK));
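
A small hedged example of the tail-call accessors regrouped in this hunk, assuming CI is a CallInst*:

    CI->setTailCallKind(CallInst::TCK_MustTail);
    assert(CI->isMustTailCall() && CI->isTailCall());
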
@@ -1869,13 +1917,6 @@
 /// This class represents the LLVM 'select' instruction.
 ///
 class SelectInst : public Instruction {
-  void init(Value *C, Value *S1, Value *S2) {
-    assert(!areInvalidOperands(C, S1, S2) && "Invalid operands for select");
-    Op<0>() = C;
-    Op<1>() = S1;
-    Op<2>() = S2;
-  }
-
   SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr,
              Instruction *InsertBefore)
     : Instruction(S1->getType(), Instruction::Select,
@@ -1883,6 +1924,7 @@
     init(C, S1, S2);
     setName(NameStr);
   }
+
   SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr,
              BasicBlock *InsertAtEnd)
     : Instruction(S1->getType(), Instruction::Select,
@@ -1891,9 +1933,17 @@
     setName(NameStr);
   }
 
+  void init(Value *C, Value *S1, Value *S2) {
+    assert(!areInvalidOperands(C, S1, S2) && "Invalid operands for select");
+    Op<0>() = C;
+    Op<1>() = S1;
+    Op<2>() = S2;
+  }
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   SelectInst *cloneImpl() const;
 
 public:
@@ -1961,6 +2011,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   VAArgInst *cloneImpl() const;
 
 public:
@@ -1969,6 +2020,7 @@
     : UnaryInstruction(Ty, VAArg, List, InsertBefore) {
     setName(NameStr);
   }
+
   VAArgInst(Value *List, Type *Ty, const Twine &NameStr,
             BasicBlock *InsertAtEnd)
     : UnaryInstruction(Ty, VAArg, List, InsertAtEnd) {
@@ -2004,6 +2056,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   ExtractElementInst *cloneImpl() const;
 
 public:
@@ -2012,6 +2065,7 @@
                                    Instruction *InsertBefore = nullptr) {
     return new(2) ExtractElementInst(Vec, Idx, NameStr, InsertBefore);
   }
+
   static ExtractElementInst *Create(Value *Vec, Value *Idx,
                                    const Twine &NameStr,
                                    BasicBlock *InsertAtEnd) {
@@ -2067,6 +2121,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   InsertElementInst *cloneImpl() const;
 
 public:
@@ -2075,6 +2130,7 @@
                                    Instruction *InsertBefore = nullptr) {
     return new(3) InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore);
   }
+
   static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx,
                                    const Twine &NameStr,
                                    BasicBlock *InsertAtEnd) {
@@ -2122,19 +2178,21 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   ShuffleVectorInst *cloneImpl() const;
 
 public:
-  // allocate space for exactly three operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 3);
-  }
   ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
                     const Twine &NameStr = "",
                     Instruction *InsertBefor = nullptr);
   ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
                     const Twine &NameStr, BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+
   /// Return true if a shufflevector instruction can be
   /// formed with the specified operands.
   static bool isValidOperands(const Value *V1, const Value *V2,
@@ -2206,8 +2264,6 @@
   SmallVector<unsigned, 4> Indices;
 
   ExtractValueInst(const ExtractValueInst &EVI);
-  void init(ArrayRef<unsigned> Idxs, const Twine &NameStr);
-
   /// Constructors - Create an extractvalue instruction with a base aggregate
   /// value and a list of indices.  The first ctor can optionally insert before
   /// an existing instruction, the second appends the new instruction to the
@@ -2223,9 +2279,12 @@
   // allocate space for exactly one operand
   void *operator new(size_t s) { return User::operator new(s, 1); }
 
+  void init(ArrayRef<unsigned> Idxs, const Twine &NameStr);
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   ExtractValueInst *cloneImpl() const;
 
 public:
@@ -2236,6 +2295,7 @@
     return new
       ExtractValueInst(Agg, Idxs, NameStr, InsertBefore);
   }
+
   static ExtractValueInst *Create(Value *Agg,
                                   ArrayRef<unsigned> Idxs,
                                   const Twine &NameStr,
@@ -2295,6 +2355,7 @@
                      ExtractValue, Agg, InsertBefore) {
   init(Idxs, NameStr);
 }
+
 ExtractValueInst::ExtractValueInst(Value *Agg,
                                    ArrayRef<unsigned> Idxs,
                                    const Twine &NameStr,
@@ -2314,10 +2375,7 @@
 class InsertValueInst : public Instruction {
   SmallVector<unsigned, 4> Indices;
 
-  void *operator new(size_t, unsigned) = delete;
   InsertValueInst(const InsertValueInst &IVI);
-  void init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
-            const Twine &NameStr);
 
   /// Constructors - Create an insertvalue instruction with a base aggregate
   /// value, a value to insert, and a list of indices.  The first ctor can
@@ -2339,9 +2397,13 @@
   InsertValueInst(Value *Agg, Value *Val, unsigned Idx, const Twine &NameStr,
                   BasicBlock *InsertAtEnd);
 
+  void init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
+            const Twine &NameStr);
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   InsertValueInst *cloneImpl() const;
 
 public:
@@ -2350,12 +2412,15 @@
     return User::operator new(s, 2);
   }
 
+  void *operator new(size_t, unsigned) = delete;
+
   static InsertValueInst *Create(Value *Agg, Value *Val,
                                  ArrayRef<unsigned> Idxs,
                                  const Twine &NameStr = "",
                                  Instruction *InsertBefore = nullptr) {
     return new InsertValueInst(Agg, Val, Idxs, NameStr, InsertBefore);
   }
+
   static InsertValueInst *Create(Value *Agg, Value *Val,
                                  ArrayRef<unsigned> Idxs,
                                  const Twine &NameStr,
@@ -2429,6 +2494,7 @@
                 2, InsertBefore) {
   init(Agg, Val, Idxs, NameStr);
 }
+
 InsertValueInst::InsertValueInst(Value *Agg,
                                  Value *Val,
                                  ArrayRef<unsigned> Idxs,
@@ -2451,17 +2517,13 @@
 // scientist's overactive imagination.
 //
 class PHINode : public Instruction {
-  void anchor() override;
-
-  void *operator new(size_t, unsigned) = delete;
   /// The number of operands actually allocated.  NumOperands is
   /// the number actually in use.
   unsigned ReservedSpace;
+
   PHINode(const PHINode &PN);
   // allocate space for exactly zero operands
-  void *operator new(size_t s) {
-    return User::operator new(s);
-  }
+
   explicit PHINode(Type *Ty, unsigned NumReservedValues,
                    const Twine &NameStr = "",
                    Instruction *InsertBefore = nullptr)
@@ -2479,7 +2541,18 @@
     allocHungoffUses(ReservedSpace);
   }
 
+  void *operator new(size_t s) {
+    return User::operator new(s);
+  }
+
+  void anchor() override;
+
 protected:
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+  friend class Instruction;
+
+  PHINode *cloneImpl() const;
+
   // allocHungoffUses - this is more complicated than the generic
   // User::allocHungoffUses, because we have to allocate Uses for the incoming
   // values and pointers to the incoming blocks, all in one allocation.
@@ -2487,11 +2560,9 @@
     User::allocHungoffUses(N, /* IsPhi */ true);
   }
 
-  // Note: Instruction needs to be a friend here to call cloneImpl.
-  friend class Instruction;
-  PHINode *cloneImpl() const;
-
 public:
+  void *operator new(size_t, unsigned) = delete;
+
   /// Constructors - NumReservedValues is a hint for the number of incoming
   /// edges that this phi node will have (use 0 if you really have no idea).
   static PHINode *Create(Type *Ty, unsigned NumReservedValues,
@@ -2499,6 +2570,7 @@
                          Instruction *InsertBefore = nullptr) {
     return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore);
   }
+
   static PHINode *Create(Type *Ty, unsigned NumReservedValues,
                          const Twine &NameStr, BasicBlock *InsertAtEnd) {
     return new PHINode(Ty, NumReservedValues, NameStr, InsertAtEnd);
@@ -2679,31 +2751,35 @@
   /// The number of operands actually allocated.  NumOperands is
   /// the number actually in use.
   unsigned ReservedSpace;
+
   LandingPadInst(const LandingPadInst &LP);
 
 public:
   enum ClauseType { Catch, Filter };
 
 private:
-  void *operator new(size_t, unsigned) = delete;
-  // Allocate space for exactly zero operands.
-  void *operator new(size_t s) {
-    return User::operator new(s);
-  }
-  void growOperands(unsigned Size);
-  void init(unsigned NumReservedValues, const Twine &NameStr);
-
   explicit LandingPadInst(Type *RetTy, unsigned NumReservedValues,
                           const Twine &NameStr, Instruction *InsertBefore);
   explicit LandingPadInst(Type *RetTy, unsigned NumReservedValues,
                           const Twine &NameStr, BasicBlock *InsertAtEnd);
 
+  // Allocate space for exactly zero operands.
+  void *operator new(size_t s) {
+    return User::operator new(s);
+  }
+
+  void growOperands(unsigned Size);
+  void init(unsigned NumReservedValues, const Twine &NameStr);
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   LandingPadInst *cloneImpl() const;
 
 public:
+  void *operator new(size_t, unsigned) = delete;
+
   /// Constructors - NumReservedClauses is a hint for the number of incoming
   /// clauses that this landingpad will have (use 0 if you really have no idea).
   static LandingPadInst *Create(Type *RetTy, unsigned NumReservedClauses,
@@ -2798,21 +2874,25 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   ReturnInst *cloneImpl() const;
 
 public:
+  ~ReturnInst() override;
+
   static ReturnInst* Create(LLVMContext &C, Value *retVal = nullptr,
                             Instruction *InsertBefore = nullptr) {
     return new(!!retVal) ReturnInst(C, retVal, InsertBefore);
   }
+
   static ReturnInst* Create(LLVMContext &C, Value *retVal,
                             BasicBlock *InsertAtEnd) {
     return new(!!retVal) ReturnInst(C, retVal, InsertAtEnd);
   }
+
   static ReturnInst* Create(LLVMContext &C, BasicBlock *InsertAtEnd) {
     return new(0) ReturnInst(C, InsertAtEnd);
   }
-  ~ReturnInst() override;
 
   /// Provide fast operand accessors
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -2857,7 +2937,6 @@
   /// they don't have to check for cond/uncond branchness. These are mostly
   /// accessed relative from op_end().
   BranchInst(const BranchInst &BI);
-  void AssertOK();
   // BranchInst constructors (where {B, T, F} are blocks, and C is a condition):
   // BranchInst(BB *B)                           - 'br B'
   // BranchInst(BB* T, BB *F, Value *C)          - 'br C, T, F'
@@ -2872,9 +2951,12 @@
   BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
              BasicBlock *InsertAtEnd);
 
+  void AssertOK();
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   BranchInst *cloneImpl() const;
 
 public:
@@ -2882,13 +2964,16 @@
                             Instruction *InsertBefore = nullptr) {
     return new(1) BranchInst(IfTrue, InsertBefore);
   }
+
   static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse,
                             Value *Cond, Instruction *InsertBefore = nullptr) {
     return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore);
   }
+
   static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *InsertAtEnd) {
     return new(1) BranchInst(IfTrue, InsertAtEnd);
   }
+
   static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse,
                             Value *Cond, BasicBlock *InsertAtEnd) {
     return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertAtEnd);
@@ -2957,19 +3042,14 @@
 /// Multiway switch
 ///
 class SwitchInst : public TerminatorInst {
-  void *operator new(size_t, unsigned) = delete;
   unsigned ReservedSpace;
+
   // Operand[0]    = Value to switch on
   // Operand[1]    = Default basic block destination
   // Operand[2n  ] = Value to match
   // Operand[2n+1] = BasicBlock to go to on match
   SwitchInst(const SwitchInst &SI);
-  void init(Value *Value, BasicBlock *Default, unsigned NumReserved);
-  void growOperands();
-  // allocate space for exactly zero operands
-  void *operator new(size_t s) {
-    return User::operator new(s);
-  }
+
   /// Create a new switch instruction, specifying a value to switch on and a
   /// default destination. The number of additional cases can be specified here
   /// to make memory allocation more efficient. This constructor can also
@@ -2984,12 +3064,23 @@
   SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
              BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly zero operands
+  void *operator new(size_t s) {
+    return User::operator new(s);
+  }
+
+  void init(Value *Value, BasicBlock *Default, unsigned NumReserved);
+  void growOperands();
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   SwitchInst *cloneImpl() const;
 
 public:
+  void *operator new(size_t, unsigned) = delete;
+
   // -2
   static const unsigned DefaultPseudoIndex = static_cast<unsigned>(~0L-1);
 
@@ -3086,7 +3177,6 @@
     ConstCaseIt;
 
   class CaseIt : public CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> {
-
     typedef CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> ParentTy;
 
   public:
@@ -3110,6 +3200,7 @@
                             Instruction *InsertBefore = nullptr) {
     return new SwitchInst(Value, Default, NumCases, InsertBefore);
   }
+
   static SwitchInst *Create(Value *Value, BasicBlock *Default,
                             unsigned NumCases, BasicBlock *InsertAtEnd) {
     return new SwitchInst(Value, Default, NumCases, InsertAtEnd);
@@ -3141,6 +3232,7 @@
   CaseIt case_begin() {
     return CaseIt(this, 0);
   }
+
   /// Returns a read-only iterator that points to the first case in the
   /// SwitchInst.
   ConstCaseIt case_begin() const {
@@ -3152,6 +3244,7 @@
   CaseIt case_end() {
     return CaseIt(this, getNumCases());
   }
+
   /// Returns a read-only iterator that points one past the last in the
   /// SwitchInst.
   ConstCaseIt case_end() const {
@@ -3264,17 +3357,12 @@
 /// Indirect Branch Instruction.
 ///
 class IndirectBrInst : public TerminatorInst {
-  void *operator new(size_t, unsigned) = delete;
   unsigned ReservedSpace;
+
   // Operand[0]   = Address to jump to
   // Operand[n+1] = n-th destination
   IndirectBrInst(const IndirectBrInst &IBI);
-  void init(Value *Address, unsigned NumDests);
-  void growOperands();
-  // allocate space for exactly zero operands
-  void *operator new(size_t s) {
-    return User::operator new(s);
-  }
+
   /// Create a new indirectbr instruction, specifying an
   /// Address to jump to.  The number of expected destinations can be specified
   /// here to make memory allocation more efficient.  This constructor can also
@@ -3287,16 +3375,28 @@
   /// autoinserts at the end of the specified BasicBlock.
   IndirectBrInst(Value *Address, unsigned NumDests, BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly zero operands
+  void *operator new(size_t s) {
+    return User::operator new(s);
+  }
+
+  void init(Value *Address, unsigned NumDests);
+  void growOperands();
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   IndirectBrInst *cloneImpl() const;
 
 public:
+  void *operator new(size_t, unsigned) = delete;
+
   static IndirectBrInst *Create(Value *Address, unsigned NumDests,
                                 Instruction *InsertBefore = nullptr) {
     return new IndirectBrInst(Address, NumDests, InsertBefore);
   }
+
   static IndirectBrInst *Create(Value *Address, unsigned NumDests,
                                 BasicBlock *InsertAtEnd) {
     return new IndirectBrInst(Address, NumDests, InsertAtEnd);
@@ -3363,19 +3463,12 @@
 ///
 class InvokeInst : public TerminatorInst,
                    public OperandBundleUser<InvokeInst, User::op_iterator> {
+  friend class OperandBundleUser<InvokeInst, User::op_iterator>;
+
   AttributeSet AttributeList;
   FunctionType *FTy;
+
   InvokeInst(const InvokeInst &BI);
-  void init(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
-            ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
-            const Twine &NameStr) {
-    init(cast<FunctionType>(
-             cast<PointerType>(Func->getType())->getElementType()),
-         Func, IfNormal, IfException, Args, Bundles, NameStr);
-  }
-  void init(FunctionType *FTy, Value *Func, BasicBlock *IfNormal,
-            BasicBlock *IfException, ArrayRef<Value *> Args,
-            ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
 
   /// Construct an InvokeInst given a range of arguments.
   ///
@@ -3401,12 +3494,24 @@
                     unsigned Values, const Twine &NameStr,
                     BasicBlock *InsertAtEnd);
 
-  friend class OperandBundleUser<InvokeInst, User::op_iterator>;
   bool hasDescriptor() const { return HasDescriptor; }
 
+  void init(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
+            ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
+            const Twine &NameStr) {
+    init(cast<FunctionType>(
+             cast<PointerType>(Func->getType())->getElementType()),
+         Func, IfNormal, IfException, Args, Bundles, NameStr);
+  }
+
+  void init(FunctionType *FTy, Value *Func, BasicBlock *IfNormal,
+            BasicBlock *IfException, ArrayRef<Value *> Args,
+            ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   InvokeInst *cloneImpl() const;
 
 public:
@@ -3419,6 +3524,7 @@
                   Func, IfNormal, IfException, Args, None, NameStr,
                   InsertBefore);
   }
+
   static InvokeInst *Create(Value *Func, BasicBlock *IfNormal,
                             BasicBlock *IfException, ArrayRef<Value *> Args,
                             ArrayRef<OperandBundleDef> Bundles = None,
@@ -3429,6 +3535,7 @@
                   Func, IfNormal, IfException, Args, Bundles, NameStr,
                   InsertBefore);
   }
+
   static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
                             BasicBlock *IfException, ArrayRef<Value *> Args,
                             const Twine &NameStr,
@@ -3437,6 +3544,7 @@
     return new (Values) InvokeInst(Ty, Func, IfNormal, IfException, Args, None,
                                    Values, NameStr, InsertBefore);
   }
+
   static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
                             BasicBlock *IfException, ArrayRef<Value *> Args,
                             ArrayRef<OperandBundleDef> Bundles = None,
@@ -3449,6 +3557,7 @@
         InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, Values,
                    NameStr, InsertBefore);
   }
+
   static InvokeInst *Create(Value *Func,
                             BasicBlock *IfNormal, BasicBlock *IfException,
                             ArrayRef<Value *> Args, const Twine &NameStr,
@@ -3842,6 +3951,7 @@
                      InsertBefore) {
   init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr);
 }
+
 InvokeInst::InvokeInst(Value *Func, BasicBlock *IfNormal,
                        BasicBlock *IfException, ArrayRef<Value *> Args,
                        ArrayRef<OperandBundleDef> Bundles, unsigned Values,
@@ -3872,12 +3982,14 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   ResumeInst *cloneImpl() const;
 
 public:
   static ResumeInst *Create(Value *Exn, Instruction *InsertBefore = nullptr) {
     return new(1) ResumeInst(Exn, InsertBefore);
   }
+
   static ResumeInst *Create(Value *Exn, BasicBlock *InsertAtEnd) {
     return new(1) ResumeInst(Exn, InsertAtEnd);
   }
@@ -3915,18 +4027,15 @@
 //                         CatchSwitchInst Class
 //===----------------------------------------------------------------------===//
 class CatchSwitchInst : public TerminatorInst {
-  void *operator new(size_t, unsigned) = delete;
   /// The number of operands actually allocated.  NumOperands is
   /// the number actually in use.
   unsigned ReservedSpace;
+
   // Operand[0] = Outer scope
   // Operand[1] = Unwind block destination
   // Operand[n] = BasicBlock to go to on match
   CatchSwitchInst(const CatchSwitchInst &CSI);
-  void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved);
-  void growOperands(unsigned Size);
-  // allocate space for exactly zero operands
-  void *operator new(size_t s) { return User::operator new(s); }
+
   /// Create a new switch instruction, specifying a
   /// default destination.  The number of additional handlers can be specified
   /// here to make memory allocation more efficient.
@@ -3943,12 +4052,21 @@
                   unsigned NumHandlers, const Twine &NameStr,
                   BasicBlock *InsertAtEnd);
 
+  // allocate space for exactly zero operands
+  void *operator new(size_t s) { return User::operator new(s); }
+
+  void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved);
+  void growOperands(unsigned Size);
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   CatchSwitchInst *cloneImpl() const;
 
 public:
+  void *operator new(size_t, unsigned) = delete;
+
   static CatchSwitchInst *Create(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumHandlers,
                                  const Twine &NameStr = "",
@@ -3956,6 +4074,7 @@
     return new CatchSwitchInst(ParentPad, UnwindDest, NumHandlers, NameStr,
                                InsertBefore);
   }
+
   static CatchSwitchInst *Create(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumHandlers, const Twine &NameStr,
                                  BasicBlock *InsertAtEnd) {
@@ -4002,8 +4121,6 @@
   typedef std::pointer_to_unary_function<Value *, BasicBlock *> DerefFnTy;
   typedef mapped_iterator<op_iterator, DerefFnTy> handler_iterator;
   typedef iterator_range<handler_iterator> handler_range;
-
-
   typedef std::pointer_to_unary_function<const Value *, const BasicBlock *>
       ConstDerefFnTy;
   typedef mapped_iterator<const_op_iterator, ConstDerefFnTy> const_handler_iterator;
@@ -4016,6 +4133,7 @@
       ++It;
     return handler_iterator(It, DerefFnTy(handler_helper));
   }
+
   /// Returns an iterator that points to the first handler in the
   /// CatchSwitchInst.
   const_handler_iterator handler_begin() const {
@@ -4030,6 +4148,7 @@
   handler_iterator handler_end() {
     return handler_iterator(op_end(), DerefFnTy(handler_helper));
   }
+
   /// Returns an iterator that points one past the last handler in the
   /// CatchSwitchInst.
   const_handler_iterator handler_end() const {
@@ -4109,6 +4228,7 @@
     return new (Values)
         CleanupPadInst(ParentPad, Args, Values, NameStr, InsertBefore);
   }
+
   static CleanupPadInst *Create(Value *ParentPad, ArrayRef<Value *> Args,
                                 const Twine &NameStr, BasicBlock *InsertAtEnd) {
     unsigned Values = 1 + Args.size();
@@ -4149,6 +4269,7 @@
     return new (Values)
         CatchPadInst(CatchSwitch, Args, Values, NameStr, InsertBefore);
   }
+
   static CatchPadInst *Create(Value *CatchSwitch, ArrayRef<Value *> Args,
                               const Twine &NameStr, BasicBlock *InsertAtEnd) {
     unsigned Values = 1 + Args.size();
@@ -4180,14 +4301,15 @@
 
 class CatchReturnInst : public TerminatorInst {
   CatchReturnInst(const CatchReturnInst &RI);
-
-  void init(Value *CatchPad, BasicBlock *BB);
   CatchReturnInst(Value *CatchPad, BasicBlock *BB, Instruction *InsertBefore);
   CatchReturnInst(Value *CatchPad, BasicBlock *BB, BasicBlock *InsertAtEnd);
 
+  void init(Value *CatchPad, BasicBlock *BB);
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   CatchReturnInst *cloneImpl() const;
 
 public:
@@ -4197,6 +4319,7 @@
     assert(BB);
     return new (2) CatchReturnInst(CatchPad, BB, InsertBefore);
   }
+
   static CatchReturnInst *Create(Value *CatchPad, BasicBlock *BB,
                                  BasicBlock *InsertAtEnd) {
     assert(CatchPad);
@@ -4254,16 +4377,17 @@
 class CleanupReturnInst : public TerminatorInst {
 private:
   CleanupReturnInst(const CleanupReturnInst &RI);
-
-  void init(Value *CleanupPad, BasicBlock *UnwindBB);
   CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values,
                     Instruction *InsertBefore = nullptr);
   CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values,
                     BasicBlock *InsertAtEnd);
 
+  void init(Value *CleanupPad, BasicBlock *UnwindBB);
+
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   CleanupReturnInst *cloneImpl() const;
 
 public:
@@ -4277,6 +4401,7 @@
     return new (Values)
         CleanupReturnInst(CleanupPad, UnwindBB, Values, InsertBefore);
   }
+
   static CleanupReturnInst *Create(Value *CleanupPad, BasicBlock *UnwindBB,
                                    BasicBlock *InsertAtEnd) {
     assert(CleanupPad);
@@ -4349,20 +4474,22 @@
 /// end of the block cannot be reached.
 ///
 class UnreachableInst : public TerminatorInst {
-  void *operator new(size_t, unsigned) = delete;
-
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   UnreachableInst *cloneImpl() const;
 
 public:
+  explicit UnreachableInst(LLVMContext &C, Instruction *InsertBefore = nullptr);
+  explicit UnreachableInst(LLVMContext &C, BasicBlock *InsertAtEnd);
+
   // allocate space for exactly zero operands
   void *operator new(size_t s) {
     return User::operator new(s, 0);
   }
-  explicit UnreachableInst(LLVMContext &C, Instruction *InsertBefore = nullptr);
-  explicit UnreachableInst(LLVMContext &C, BasicBlock *InsertAtEnd);
+
+  void *operator new(size_t, unsigned) = delete;
 
   unsigned getNumSuccessors() const { return 0; }
 
@@ -4389,6 +4516,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical TruncInst
   TruncInst *cloneImpl() const;
 
@@ -4427,6 +4555,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical ZExtInst
   ZExtInst *cloneImpl() const;
 
@@ -4465,6 +4594,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical SExtInst
   SExtInst *cloneImpl() const;
 
@@ -4503,6 +4633,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical FPTruncInst
   FPTruncInst *cloneImpl() const;
 
@@ -4541,6 +4672,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical FPExtInst
   FPExtInst *cloneImpl() const;
 
@@ -4579,6 +4711,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical UIToFPInst
   UIToFPInst *cloneImpl() const;
 
@@ -4617,6 +4750,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical SIToFPInst
   SIToFPInst *cloneImpl() const;
 
@@ -4655,6 +4789,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical FPToUIInst
   FPToUIInst *cloneImpl() const;
 
@@ -4693,6 +4828,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical FPToSIInst
   FPToSIInst *cloneImpl() const;
 
@@ -4729,6 +4865,9 @@
 /// This class represents a cast from an integer to a pointer.
 class IntToPtrInst : public CastInst {
 public:
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+  friend class Instruction;
+
   /// Constructor with insert-before-instruction semantics
   IntToPtrInst(
     Value *S,                           ///< The value to be converted
@@ -4745,8 +4884,6 @@
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
 
-  // Note: Instruction needs to be a friend here to call cloneImpl.
-  friend class Instruction;
   /// Clone an identical IntToPtrInst.
   IntToPtrInst *cloneImpl() const;
 
@@ -4773,6 +4910,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical PtrToIntInst.
   PtrToIntInst *cloneImpl() const;
 
@@ -4823,6 +4961,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical BitCastInst.
   BitCastInst *cloneImpl() const;
 
@@ -4862,6 +5001,7 @@
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
+
   /// Clone an identical AddrSpaceCastInst.
   AddrSpaceCastInst *cloneImpl() const;
 
@@ -4916,6 +5056,6 @@
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_INSTRUCTIONS_H
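
The reshuffled CallInst declarations above keep the Create overloads and the tail-call accessors together. As a rough usage sketch (not part of the patch; Callee, Arg and BB are assumed to be a callee Function, an argument Value and an insertion block supplied by the caller):

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Build a call at the end of BB with one of the Create overloads above and
// mark it musttail via the tail-call-kind accessors.
static CallInst *emitMustTailCall(Function *Callee, Value *Arg, BasicBlock *BB) {
  CallInst *CI = CallInst::Create(Callee, {Arg}, "call", BB);
  CI->setTailCallKind(CallInst::TCK_MustTail); // isMustTailCall() now returns true
  return CI;
}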
diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h
index f797457..b14a545 100644
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h
@@ -25,20 +25,28 @@
 #define LLVM_IR_INTRINSICINST_H
 
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <cstdint>
 
 namespace llvm {
+
   /// A wrapper class for inspecting calls to intrinsic functions.
   /// This allows the standard isa/dyncast/cast functionality to work with calls
   /// to intrinsic functions.
   class IntrinsicInst : public CallInst {
-    IntrinsicInst() = delete;
-    IntrinsicInst(const IntrinsicInst&) = delete;
-    void operator=(const IntrinsicInst&) = delete;
   public:
+    IntrinsicInst() = delete;
+    IntrinsicInst(const IntrinsicInst &) = delete;
+    IntrinsicInst &operator=(const IntrinsicInst &) = delete;
+
     /// Return the intrinsic ID of this intrinsic.
     Intrinsic::ID getIntrinsicID() const {
       return getCalledFunction()->getIntrinsicID();
@@ -81,9 +89,11 @@
   class DbgDeclareInst : public DbgInfoIntrinsic {
   public:
     Value *getAddress() const { return getVariableLocation(); }
+
     DILocalVariable *getVariable() const {
       return cast<DILocalVariable>(getRawVariable());
     }
+
     DIExpression *getExpression() const {
       return cast<DIExpression>(getRawExpression());
     }
@@ -91,6 +101,7 @@
     Metadata *getRawVariable() const {
       return cast<MetadataAsValue>(getArgOperand(1))->getMetadata();
     }
+
     Metadata *getRawExpression() const {
       return cast<MetadataAsValue>(getArgOperand(2))->getMetadata();
     }
@@ -110,13 +121,16 @@
     Value *getValue() const {
       return getVariableLocation(/* AllowNullOp = */ false);
     }
+
     uint64_t getOffset() const {
       return cast<ConstantInt>(
                           const_cast<Value*>(getArgOperand(1)))->getZExtValue();
     }
+
     DILocalVariable *getVariable() const {
       return cast<DILocalVariable>(getRawVariable());
     }
+
     DIExpression *getExpression() const {
       return cast<DIExpression>(getRawExpression());
     }
@@ -124,6 +138,7 @@
     Metadata *getRawVariable() const {
       return cast<MetadataAsValue>(getArgOperand(2))->getMetadata();
     }
+
     Metadata *getRawExpression() const {
       return cast<MetadataAsValue>(getArgOperand(3))->getMetadata();
     }
@@ -159,6 +174,7 @@
     ConstantInt *getVolatileCst() const {
       return cast<ConstantInt>(const_cast<Value*>(getArgOperand(4)));
     }
+
     bool isVolatile() const {
       return !getVolatileCst()->isZero();
     }
@@ -268,7 +284,6 @@
     }
   };
 
-
   /// This class wraps the llvm.memcpy intrinsic.
   class MemCpyInst : public MemTransferInst {
   public:
@@ -359,6 +374,7 @@
     ConstantInt *getIndex() const {
       return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3)));
     }
+
     Value *getStep() const;
   };
 
@@ -404,6 +420,7 @@
       return cast<ConstantInt>(const_cast<Value *>(getArgOperand(4)));
     }
   };
-} // namespace llvm
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_IR_INTRINSICINST_H
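
The IntrinsicInst hierarchy above exists so the usual isa/dyn_cast machinery works on intrinsic calls. A minimal sketch (not from the patch; isDebugDeclare is a hypothetical helper):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Recognize llvm.dbg.declare calls and read their address operand through the
// typed wrapper instead of raw argument indices.
static bool isDebugDeclare(const Instruction &I) {
  if (const auto *DDI = dyn_cast<DbgDeclareInst>(&I))
    return DDI->getAddress() != nullptr;
  return false;
}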
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 4ce5d87..078959c 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -475,6 +475,14 @@
   GCCBuiltin<"__builtin_amdgcn_s_getreg">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrReadMem]>;
 
+// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
+// param values: 0 = P10, 1 = P20, 2 = P0
+def int_amdgcn_interp_mov :
+  GCCBuiltin<"__builtin_amdgcn_interp_mov">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+
 // __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
 def int_amdgcn_interp_p1 :
   GCCBuiltin<"__builtin_amdgcn_interp_p1">,
diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h
index 2b537eb..7f43d5d 100644
--- a/include/llvm/IR/LLVMContext.h
+++ b/include/llvm/IR/LLVMContext.h
@@ -15,28 +15,30 @@
 #ifndef LLVM_IR_LLVMCONTEXT_H
 #define LLVM_IR_LLVMCONTEXT_H
 
+#include "llvm-c/Types.h"
 #include "llvm/Support/CBindingWrapping.h"
 #include "llvm/Support/Options.h"
+#include <cstdint>
+#include <memory>
+#include <string>
 
 namespace llvm {
 
-class LLVMContextImpl;
-class StringRef;
-class Twine;
-class Instruction;
-class Module;
-class MDString;
-class DICompositeType;
-class SMDiagnostic;
 class DiagnosticInfo;
 enum DiagnosticSeverity : char;
-template <typename T> class SmallVectorImpl;
 class Function;
-class DebugLoc;
+class Instruction;
+class LLVMContextImpl;
+class Module;
 class OptBisect;
+template <typename T> class SmallVectorImpl;
+class SMDiagnostic;
+class StringRef;
+class Twine;
+
 namespace yaml {
 class Output;
-}
+} // end namespace yaml
 
 /// This is an important class for using LLVM in a threaded context.  It
 /// (opaquely) owns and manages the core "global" data of LLVM's core
@@ -47,6 +49,8 @@
 public:
   LLVMContextImpl *const pImpl;
   LLVMContext();
+  LLVMContext(LLVMContext &) = delete;
+  LLVMContext &operator=(const LLVMContext &) = delete;
   ~LLVMContext();
 
   // Pinned metadata names, which always have the same value.  This is a
@@ -73,6 +77,7 @@
     MD_loop = 18,                     // "llvm.loop"
     MD_type = 19,                     // "type"
     MD_section_prefix = 20,           // "section_prefix"
+    MD_absolute_symbol = 21,          // "absolute_symbol"
   };
 
   /// Known operand bundle tag IDs, which always have the same value.  All
@@ -259,8 +264,8 @@
   /// analysis.
   OptBisect &getOptBisect();
 private:
-  LLVMContext(LLVMContext&) = delete;
-  void operator=(LLVMContext&) = delete;
+  // Module needs access to the add/removeModule methods.
+  friend class Module;
 
   /// addModule - Register a module as being instantiated in this context.  If
   /// the context is deleted, the module will be deleted as well.
@@ -268,9 +273,6 @@
 
   /// removeModule - Unregister a module from this context.
   void removeModule(Module*);
-
-  // Module needs access to the add/removeModule methods.
-  friend class Module;
 };
 
 // Create wrappers for C Binding types (see CBindingWrapping.h).
@@ -286,6 +288,6 @@
   return reinterpret_cast<LLVMContextRef*>(const_cast<LLVMContext**>(Tys));
 }
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_LLVMCONTEXT_H
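
The new pinned metadata kind MD_absolute_symbol can be used like the other fixed IDs, avoiding a string lookup of "absolute_symbol". A hedged sketch (not part of the patch; it assumes GlobalObject::setMetadata(unsigned, MDNode *) is available at this revision and that GV and Range are supplied by the caller):

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Attach an !absolute_symbol range to a global using the pinned kind ID;
// equivalent to GV.setMetadata("absolute_symbol", Range) without the lookup.
static void tagAbsoluteSymbol(GlobalVariable &GV, MDNode *Range) {
  GV.setMetadata(LLVMContext::MD_absolute_symbol, Range);
}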
diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index 3810116..444ce93 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h
@@ -15,28 +15,22 @@
 #ifndef LLVM_IR_OPERATOR_H
 #define LLVM_IR_OPERATOR_H
 
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <cstddef>
 
 namespace llvm {
 
-class GetElementPtrInst;
-class BinaryOperator;
-class ConstantExpr;
-
 /// This is a utility class that provides an abstraction for the common
 /// functionality between Instructions and ConstantExprs.
 class Operator : public User {
-private:
-  // The Operator class is intended to be used as a utility, and is never itself
-  // instantiated.
-  void *operator new(size_t, unsigned) = delete;
-  void *operator new(size_t s) = delete;
-  Operator() = delete;
-
 protected:
   // NOTE: Cannot use = delete because it's not legal to delete
   // an overridden method that's not deleted in the base class. Cannot leave
@@ -44,6 +38,13 @@
   ~Operator() override;
 
 public:
+  // The Operator class is intended to be used as a utility, and is never itself
+  // instantiated.
+  Operator() = delete;
+
+  void *operator new(size_t, unsigned) = delete;
+  void *operator new(size_t s) = delete;
+
   /// Return the opcode for this Instruction or ConstantExpr.
   unsigned getOpcode() const {
     if (const Instruction *I = dyn_cast<Instruction>(this))
@@ -81,6 +82,7 @@
 private:
   friend class Instruction;
   friend class ConstantExpr;
+
   void setHasNoUnsignedWrap(bool B) {
     SubclassOptionalData =
       (SubclassOptionalData & ~NoUnsignedWrap) | (B * NoUnsignedWrap);
@@ -132,6 +134,7 @@
 private:
   friend class Instruction;
   friend class ConstantExpr;
+
   void setIsExact(bool B) {
     SubclassOptionalData = (SubclassOptionalData & ~IsExact) | (B * IsExact);
   }
@@ -148,6 +151,7 @@
            OpC == Instruction::AShr ||
            OpC == Instruction::LShr;
   }
+
   static inline bool classof(const ConstantExpr *CE) {
     return isPossiblyExactOpcode(CE->getOpcode());
   }
@@ -164,7 +168,9 @@
 class FastMathFlags {
 private:
   friend class FPMathOperator;
-  unsigned Flags;
+
+  unsigned Flags = 0;
+
   FastMathFlags(unsigned F) : Flags(F) { }
 
 public:
@@ -176,8 +182,7 @@
     AllowReciprocal = (1 << 4)
   };
 
-  FastMathFlags() : Flags(0)
-  { }
+  FastMathFlags() = default;
 
   /// Whether any flag is set
   bool any() const { return Flags != 0; }
@@ -210,7 +215,6 @@
   }
 };
 
-
 /// Utility class for floating point operations which can have
 /// information about relaxed accuracy requirements attached to them.
 class FPMathOperator : public Operator {
@@ -230,21 +234,25 @@
       setHasAllowReciprocal(true);
     }
   }
+
   void setHasNoNaNs(bool B) {
     SubclassOptionalData =
       (SubclassOptionalData & ~FastMathFlags::NoNaNs) |
       (B * FastMathFlags::NoNaNs);
   }
+
   void setHasNoInfs(bool B) {
     SubclassOptionalData =
       (SubclassOptionalData & ~FastMathFlags::NoInfs) |
       (B * FastMathFlags::NoInfs);
   }
+
   void setHasNoSignedZeros(bool B) {
     SubclassOptionalData =
       (SubclassOptionalData & ~FastMathFlags::NoSignedZeros) |
       (B * FastMathFlags::NoSignedZeros);
   }
+
   void setHasAllowReciprocal(bool B) {
     SubclassOptionalData =
       (SubclassOptionalData & ~FastMathFlags::AllowReciprocal) |
@@ -313,7 +321,6 @@
   }
 };
 
-
 /// A helper template for defining operators for individual opcodes.
 template<typename SuperClass, unsigned Opc>
 class ConcreteOperator : public SuperClass {
@@ -343,7 +350,6 @@
   : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {
 };
 
-
 class SDivOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {
 };
@@ -357,19 +363,18 @@
   : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
 };
 
-
 class ZExtOperator : public ConcreteOperator<Operator, Instruction::ZExt> {};
 
-
 class GEPOperator
   : public ConcreteOperator<Operator, Instruction::GetElementPtr> {
+  friend class GetElementPtrInst;
+  friend class ConstantExpr;
+
   enum {
     IsInBounds = (1 << 0),
     // InRangeIndex: bits 1-6
   };
 
-  friend class GetElementPtrInst;
-  friend class ConstantExpr;
   void setIsInBounds(bool B) {
     SubclassOptionalData =
       (SubclassOptionalData & ~IsInBounds) | (B * IsInBounds);
@@ -380,6 +385,7 @@
   bool isInBounds() const {
     return SubclassOptionalData & IsInBounds;
   }
+
   /// Returns the offset of the index with an inrange attachment, or None if
   /// none.
   Optional<unsigned> getInRangeIndex() const {
@@ -470,6 +476,7 @@
   const Value *getPointerOperand() const {
     return getOperand(0);
   }
+
   static unsigned getPointerOperandIndex() {
     return 0U;                      // get index for modifying correct operand
   }
@@ -500,6 +507,6 @@
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_OPERATOR_H
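
Operator and its subclasses wrap both Instructions and ConstantExprs, so one code path can inspect either form. Sketch (not from the patch; isInBoundsGEP is a hypothetical helper):

#include "llvm/IR/Operator.h"

using namespace llvm;

// True for an inbounds getelementptr, whether V is a GEP instruction or a
// constant-expression GEP.
static bool isInBoundsGEP(const Value *V) {
  if (const auto *GEP = dyn_cast<GEPOperator>(V))
    return GEP->isInBounds();
  return false;
}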
diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h
index 783a5cc..916faa4 100644
--- a/include/llvm/IR/Statepoint.h
+++ b/include/llvm/IR/Statepoint.h
@@ -1,4 +1,4 @@
-//===-- llvm/IR/Statepoint.h - gc.statepoint utilities ------ --*- C++ -*-===//
+//===-- llvm/IR/Statepoint.h - gc.statepoint utilities ----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -19,6 +19,7 @@
 
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
@@ -26,8 +27,14 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
 
 namespace llvm {
+
 /// The statepoint intrinsic accepts a set of flags as its third argument.
 /// Valid values come out of this set.
 enum class StatepointFlags {
@@ -47,7 +54,6 @@
 
 class GCRelocateInst;
 class GCResultInst;
-class ImmutableStatepoint;
 
 bool isStatepoint(ImmutableCallSite CS);
 bool isStatepoint(const Value *V);
@@ -66,8 +72,6 @@
           typename CallSiteTy>
 class StatepointBase {
   CallSiteTy StatepointCS;
-  void *operator new(size_t, unsigned) = delete;
-  void *operator new(size_t s) = delete;
 
 protected:
   explicit StatepointBase(InstructionTy *I) {
@@ -76,6 +80,7 @@
       assert(StatepointCS && "isStatepoint implies CallSite");
     }
   }
+
   explicit StatepointBase(CallSiteTy CS) {
     if (isStatepoint(CS))
       StatepointCS = CS;
@@ -93,6 +98,9 @@
     CallArgsBeginPos = 5,
   };
 
+  void *operator new(size_t, unsigned) = delete;
+  void *operator new(size_t s) = delete;
+
   explicit operator bool() const {
     // We do not assign non-statepoint CallSites to StatepointCS.
     return (bool)StatepointCS;
@@ -451,6 +459,7 @@
 /// Return \c true if the \p Attr is an attribute that is a statepoint
 /// directive.
 bool isStatepointDirectiveAttr(Attribute Attr);
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_IR_STATEPOINT_H
diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h
index e62eab5..f1c3cbb 100644
--- a/include/llvm/IR/Use.h
+++ b/include/llvm/IR/Use.h
@@ -27,7 +27,7 @@
 
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/Support/CBindingWrapping.h"
-#include <cstddef>
+#include "llvm-c/Types.h"
 
 namespace llvm {
 
@@ -40,9 +40,11 @@
 template <> class PointerLikeTypeTraits<Use **> {
 public:
   static inline void *getAsVoidPointer(Use **P) { return P; }
+
   static inline Use **getFromVoidPointer(void *P) {
     return static_cast<Use **>(P);
   }
+
   enum { NumLowBitsAvailable = 2 };
 };
 
@@ -65,6 +67,8 @@
 /// time complexity.
 class Use {
 public:
+  Use(const Use &U) = delete;
+
   /// \brief Provide a fast substitute to std::swap<Use>
   /// that also works with less standard-compliant compilers
   void swap(Use &RHS);
@@ -74,8 +78,6 @@
   typedef PointerIntPair<User *, 1, unsigned> UserRef;
 
 private:
-  Use(const Use &U) = delete;
-
   /// Destructor - Only for zap()
   ~Use() {
     if (Val)
@@ -128,6 +130,7 @@
   PointerIntPair<Use **, 2, PrevPtrTag> Prev;
 
   void setPrev(Use **NewPrev) { Prev.setPointer(NewPrev); }
+
   void addToList(Use **List) {
     Next = *List;
     if (Next)
@@ -135,6 +138,7 @@
     setPrev(List);
     *List = this;
   }
+
   void removeFromList() {
     Use **StrippedPrev = Prev.getPointer();
     *StrippedPrev = Next;
@@ -159,6 +163,6 @@
 // Create wrappers for C Binding types (see CBindingWrapping.h).
 DEFINE_SIMPLE_CONVERSION_FUNCTIONS(Use, LLVMUseRef)
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_USE_H
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index b5c1b85..94dd6a4 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -239,6 +239,7 @@
 
 private:
   void destroyValueName();
+  void doRAUW(Value *New, bool NoMetadata);
   void setNameImpl(const Twine &Name);
 
 public:
@@ -269,6 +270,12 @@
   /// guaranteed to be empty.
   void replaceAllUsesWith(Value *V);
 
+  /// \brief Change non-metadata uses of this to point to a new Value.
+  ///
+  /// Go through the uses list for this definition and make each use point to
+  /// "V" instead of "this". This function skips metadata entries in the list.
+  void replaceNonMetadataUsesWith(Value *V);
+
   /// replaceUsesOutsideBlock - Go through the uses list for this definition and
   /// make each use point to "V" instead of "this" when the use is outside the
   /// block. 'This's use list is expected to have at least one element.
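
A minimal sketch of the replaceNonMetadataUsesWith API declared above (not part of the patch; Old and New are assumed to be values of the same type):

#include "llvm/IR/Value.h"

using namespace llvm;

// Rewrite every non-metadata use of Old to New; metadata entries in Old's use
// list (e.g. values referenced from debug info) keep pointing at Old.
static void replaceKeepingMetadata(Value *Old, Value *New) {
  Old->replaceNonMetadataUsesWith(New);
}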
diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h
index 85379ad..9648e19 100644
--- a/include/llvm/IR/ValueMap.h
+++ b/include/llvm/IR/ValueMap.h
@@ -27,14 +27,20 @@
 #define LLVM_IR_VALUEMAP_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/IR/TrackingMDRef.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/UniqueLock.h"
-#include "llvm/Support/type_traits.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
 #include <iterator>
-#include <memory>
+#include <type_traits>
+#include <utility>
 
 namespace llvm {
 
@@ -77,11 +83,12 @@
 };
 
 /// See the file comment.
-template<typename KeyT, typename ValueT, typename Config =ValueMapConfig<KeyT> >
+template<typename KeyT, typename ValueT, typename Config =ValueMapConfig<KeyT>>
 class ValueMap {
   friend class ValueMapCallbackVH<KeyT, ValueT, Config>;
+
   typedef ValueMapCallbackVH<KeyT, ValueT, Config> ValueMapCVH;
-  typedef DenseMap<ValueMapCVH, ValueT, DenseMapInfo<ValueMapCVH> > MapT;
+  typedef DenseMap<ValueMapCVH, ValueT, DenseMapInfo<ValueMapCVH>> MapT;
   typedef DenseMap<const Metadata *, TrackingMDRef> MDMapT;
   typedef typename Config::ExtraData ExtraData;
   MapT Map;
@@ -90,8 +97,6 @@
 
   bool MayMapMetadata = true;
 
-  ValueMap(const ValueMap&) = delete;
-  ValueMap& operator=(const ValueMap&) = delete;
 public:
   typedef KeyT key_type;
   typedef ValueT mapped_type;
@@ -102,6 +107,8 @@
       : Map(NumInitBuckets), Data() {}
   explicit ValueMap(const ExtraData &Data, unsigned NumInitBuckets = 64)
       : Map(NumInitBuckets), Data(Data) {}
+  ValueMap(const ValueMap &) = delete;
+  ValueMap &operator=(const ValueMap &) = delete;
 
   bool hasMD() const { return bool(MDMap); }
   MDMapT &MD() {
@@ -183,7 +190,6 @@
       insert(*I);
   }
 
-
   bool erase(const KeyT &Val) {
     typename MapT::iterator I = Map.find_as(Val);
     if (I == Map.end())
@@ -237,6 +243,7 @@
 class ValueMapCallbackVH final : public CallbackVH {
   friend class ValueMap<KeyT, ValueT, Config>;
   friend struct DenseMapInfo<ValueMapCallbackVH>;
+
   typedef ValueMap<KeyT, ValueT, Config> ValueMapT;
   typedef typename std::remove_pointer<KeyT>::type KeySansPointerT;
 
@@ -262,6 +269,7 @@
     Config::onDelete(Copy.Map->Data, Copy.Unwrap());  // May destroy *this.
     Copy.Map->Map.erase(Copy);  // Definitely destroys *this.
   }
+
   void allUsesReplacedWith(Value *new_key) override {
     assert(isa<KeySansPointerT>(new_key) &&
            "Invalid RAUW on key of ValueMap<>");
@@ -289,30 +297,34 @@
 };
 
 template<typename KeyT, typename ValueT, typename Config>
-struct DenseMapInfo<ValueMapCallbackVH<KeyT, ValueT, Config> > {
+struct DenseMapInfo<ValueMapCallbackVH<KeyT, ValueT, Config>> {
   typedef ValueMapCallbackVH<KeyT, ValueT, Config> VH;
 
   static inline VH getEmptyKey() {
     return VH(DenseMapInfo<Value *>::getEmptyKey());
   }
+
   static inline VH getTombstoneKey() {
     return VH(DenseMapInfo<Value *>::getTombstoneKey());
   }
+
   static unsigned getHashValue(const VH &Val) {
     return DenseMapInfo<KeyT>::getHashValue(Val.Unwrap());
   }
+
   static unsigned getHashValue(const KeyT &Val) {
     return DenseMapInfo<KeyT>::getHashValue(Val);
   }
+
   static bool isEqual(const VH &LHS, const VH &RHS) {
     return LHS == RHS;
   }
+
   static bool isEqual(const KeyT &LHS, const VH &RHS) {
     return LHS == RHS.getValPtr();
   }
 };
 
-
 template<typename DenseMapT, typename KeyT>
 class ValueMapIterator :
     public std::iterator<std::forward_iterator_tag,
@@ -320,10 +332,11 @@
                          ptrdiff_t> {
   typedef typename DenseMapT::iterator BaseT;
   typedef typename DenseMapT::mapped_type ValueT;
+
   BaseT I;
+
 public:
   ValueMapIterator() : I() {}
-
   ValueMapIterator(BaseT I) : I(I) {}
 
   BaseT base() const { return I; }
@@ -369,7 +382,9 @@
                          ptrdiff_t> {
   typedef typename DenseMapT::const_iterator BaseT;
   typedef typename DenseMapT::mapped_type ValueT;
+
   BaseT I;
+
 public:
   ValueMapConstIterator() : I() {}
   ValueMapConstIterator(BaseT I) : I(I) {}
@@ -414,4 +429,4 @@
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_IR_VALUEMAP_H
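
ValueMap's callback handles keep entries consistent across RAUW and deletion of the key, which a plain DenseMap<Value *, T> does not. Usage sketch (not from the patch; lookupOrZero is a hypothetical helper):

#include "llvm/IR/Value.h"
#include "llvm/IR/ValueMap.h"

using namespace llvm;

// Keys follow their Value through replaceAllUsesWith and are dropped when the
// Value is deleted, per the default ValueMapConfig policy.
static unsigned lookupOrZero(ValueMap<const Value *, unsigned> &Counts,
                             const Value *V) {
  auto It = Counts.find(V);
  return It == Counts.end() ? 0 : It->second;
}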
diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h
index 239ea87..e162ac3 100644
--- a/include/llvm/LTO/Config.h
+++ b/include/llvm/LTO/Config.h
@@ -33,8 +33,9 @@
 /// LTO configuration. A linker can configure LTO by setting fields in this data
 /// structure and passing it to the lto::LTO constructor.
 struct Config {
+  // Note: when adding fields here, consider whether they need to be added to
+  // computeCacheKey in LTO.cpp.
   std::string CPU;
-  std::string Features;
   TargetOptions Options;
   std::vector<std::string> MAttrs;
   Reloc::Model RelocModel = Reloc::PIC_;
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 2ce2780..f846b63 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -21,6 +21,7 @@
 #include "llvm/MC/SectionKind.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
 #include <tuple>
@@ -303,7 +304,7 @@
     MCSymbol *lookupSymbol(const Twine &Name) const;
 
     /// Set value for a symbol.
-    int setSymbolValue(MCStreamer &Streamer, std::string &I);
+    void setSymbolValue(MCStreamer &Streamer, StringRef Sym, uint64_t Val);
 
     /// getSymbols - Get a reference for the symbol table for clients that
     /// want to, for example, iterate over all symbols. 'const' because we
@@ -526,7 +527,10 @@
 
     void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; }
     StringRef getDwarfDebugProducer() { return DwarfDebugProducer; }
-
+    dwarf::DwarfFormat getDwarfFormat() const {
+      // TODO: Support DWARF64
+      return dwarf::DWARF32;
+    }
     void setDwarfVersion(uint16_t v) { DwarfVersion = v; }
     uint16_t getDwarfVersion() const { return DwarfVersion; }
 
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index ad5bcc4..56da6f8 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -128,6 +128,20 @@
   }
 };
 
+/// A callback class which is notified of each comment in an assembly file as
+/// it is lexed.
+class AsmCommentConsumer {
+public:
+  virtual ~AsmCommentConsumer() {}
+
+  /// Callback function for when a comment is lexed. Loc is the start of the
+  /// comment text (excluding the comment-start marker). CommentText is the text
+  /// of the comment, excluding the comment start and end markers, and the
+  /// newline for single-line comments.
+  virtual void HandleComment(SMLoc Loc, StringRef CommentText) = 0;
+};
+
+
 /// Generic assembler lexer interface, for use by target specific assembly
 /// lexers.
 class MCAsmLexer {
@@ -145,6 +159,7 @@
   bool SkipSpace;
   bool AllowAtInIdentifier;
   bool IsAtStartOfStatement;
+  AsmCommentConsumer *CommentConsumer;
 
   MCAsmLexer();
 
@@ -234,6 +249,10 @@
 
   bool getAllowAtInIdentifier() { return AllowAtInIdentifier; }
   void setAllowAtInIdentifier(bool v) { AllowAtInIdentifier = v; }
+
+  void setCommentConsumer(AsmCommentConsumer *CommentConsumer) {
+    this->CommentConsumer = CommentConsumer;
+  }
 };
 
 } // End llvm namespace
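
A client of the new AsmCommentConsumer hook subclasses it and registers itself on the lexer. Illustrative sketch (not part of the patch; CommentCollector is a hypothetical consumer):

#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/Support/SMLoc.h"
#include <string>

using namespace llvm;

namespace {
// Collects the text of every comment the assembly lexer sees.
class CommentCollector : public AsmCommentConsumer {
public:
  SmallVector<std::string, 8> Comments;

  void HandleComment(SMLoc Loc, StringRef CommentText) override {
    Comments.push_back(CommentText.str());
  }
};
} // end anonymous namespace

// Registration, given an MCAsmLexer &Lexer obtained from the target parser:
//   CommentCollector Collector;
//   Lexer.setCommentConsumer(&Collector);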
diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h
index 21091b3..a300c4f 100644
--- a/include/llvm/MC/MCTargetOptions.h
+++ b/include/llvm/MC/MCTargetOptions.h
@@ -38,6 +38,7 @@
   bool MCNoExecStack : 1;
   bool MCFatalWarnings : 1;
   bool MCNoWarn : 1;
+  bool MCNoDeprecatedWarn : 1;
   bool MCSaveTempLabels : 1;
   bool MCUseDwarfDirectory : 1;
   bool MCIncrementalLinkerCompatible : 1;
@@ -65,6 +66,7 @@
           ARE_EQUAL(MCNoExecStack) &&
           ARE_EQUAL(MCFatalWarnings) &&
           ARE_EQUAL(MCNoWarn) &&
+          ARE_EQUAL(MCNoDeprecatedWarn) &&
           ARE_EQUAL(MCSaveTempLabels) &&
           ARE_EQUAL(MCUseDwarfDirectory) &&
           ARE_EQUAL(MCIncrementalLinkerCompatible) &&
diff --git a/include/llvm/MC/MCTargetOptionsCommandFlags.h b/include/llvm/MC/MCTargetOptionsCommandFlags.h
index cb02084..96179be 100644
--- a/include/llvm/MC/MCTargetOptionsCommandFlags.h
+++ b/include/llvm/MC/MCTargetOptionsCommandFlags.h
@@ -53,6 +53,9 @@
 cl::opt<bool> NoWarn("no-warn", cl::desc("Suppress all warnings"));
 cl::alias NoWarnW("W", cl::desc("Alias for --no-warn"), cl::aliasopt(NoWarn));
 
+cl::opt<bool> NoDeprecatedWarn("no-deprecated-warn",
+                               cl::desc("Suppress all deprecated warnings"));
+
 cl::opt<std::string>
 ABIName("target-abi", cl::Hidden,
         cl::desc("The name of the ABI to be targeted from the backend."),
@@ -70,6 +73,7 @@
   Options.ABIName = ABIName;
   Options.MCFatalWarnings = FatalWarnings;
   Options.MCNoWarn = NoWarn;
+  Options.MCNoDeprecatedWarn = NoDeprecatedWarn;
   return Options;
 }
 
diff --git a/include/llvm/ObjectYAML/DWARFYAML.h b/include/llvm/ObjectYAML/DWARFYAML.h
new file mode 100644
index 0000000..69e3ab4
--- /dev/null
+++ b/include/llvm/ObjectYAML/DWARFYAML.h
@@ -0,0 +1,133 @@
+//===- DWARFYAML.h - DWARF YAMLIO implementation ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares classes for handling the YAML representation
+/// of DWARF Debug Info.
+///
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_OBJECTYAML_DWARFYAML_H
+#define LLVM_OBJECTYAML_DWARFYAML_H
+
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Dwarf.h"
+
+namespace llvm {
+namespace DWARFYAML {
+
+struct AttributeAbbrev {
+  llvm::dwarf::Attribute Attribute;
+  llvm::dwarf::Form Form;
+};
+
+struct Abbrev {
+  llvm::yaml::Hex32 Code;
+  llvm::dwarf::Tag Tag;
+  llvm::dwarf::Constants Children;
+  std::vector<AttributeAbbrev> Attributes;
+};
+
+struct ARangeDescriptor {
+  llvm::yaml::Hex64 Address;
+  uint64_t Length;
+};
+
+struct ARange {
+  uint32_t Length;
+  uint16_t Version;
+  uint32_t CuOffset;
+  uint8_t AddrSize;
+  uint8_t SegSize;
+  std::vector<ARangeDescriptor> Descriptors;
+};
+
+struct Data {
+  std::vector<Abbrev> AbbrevDecls;
+  std::vector<StringRef> DebugStrings;
+  std::vector<ARange> ARanges;
+
+  bool isEmpty() const;
+};
+
+} // namespace llvm::DWARFYAML
+} // namespace llvm
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::StringRef)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::AttributeAbbrev)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::Abbrev)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::ARangeDescriptor)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::ARange)
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<DWARFYAML::Data> {
+  static void mapping(IO &IO, DWARFYAML::Data &DWARF);
+};
+
+template <> struct MappingTraits<DWARFYAML::Abbrev> {
+  static void mapping(IO &IO, DWARFYAML::Abbrev &Abbrev);
+};
+
+template <> struct MappingTraits<DWARFYAML::AttributeAbbrev> {
+  static void mapping(IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev);
+};
+
+template <> struct MappingTraits<DWARFYAML::ARangeDescriptor> {
+  static void mapping(IO &IO, DWARFYAML::ARangeDescriptor &Descriptor);
+};
+
+template <> struct MappingTraits<DWARFYAML::ARange> {
+  static void mapping(IO &IO, DWARFYAML::ARange &Range);
+};
+
+#define HANDLE_DW_TAG(unused, name)                                            \
+  io.enumCase(value, "DW_TAG_" #name, dwarf::DW_TAG_##name);
+
+template <> struct ScalarEnumerationTraits<dwarf::Tag> {
+  static void enumeration(IO &io, dwarf::Tag &value) {
+#include "llvm/Support/Dwarf.def"
+    io.enumFallback<Hex16>(value);
+  }
+};
+
+#define HANDLE_DW_AT(unused, name)                                             \
+  io.enumCase(value, "DW_AT_" #name, dwarf::DW_AT_##name);
+
+template <> struct ScalarEnumerationTraits<dwarf::Attribute> {
+  static void enumeration(IO &io, dwarf::Attribute &value) {
+#include "llvm/Support/Dwarf.def"
+    io.enumFallback<Hex16>(value);
+  }
+};
+
+#define HANDLE_DW_FORM(unused, name)                                           \
+  io.enumCase(value, "DW_FORM_" #name, dwarf::DW_FORM_##name);
+
+template <> struct ScalarEnumerationTraits<dwarf::Form> {
+  static void enumeration(IO &io, dwarf::Form &value) {
+#include "llvm/Support/Dwarf.def"
+    io.enumFallback<Hex16>(value);
+  }
+};
+
+template <> struct ScalarEnumerationTraits<dwarf::Constants> {
+  static void enumeration(IO &io, dwarf::Constants &value) {
+    io.enumCase(value, "DW_CHILDREN_no", dwarf::DW_CHILDREN_no);
+    io.enumCase(value, "DW_CHILDREN_yes", dwarf::DW_CHILDREN_yes);
+    io.enumFallback<Hex16>(value);
+  }
+};
+
+} // namespace llvm::yaml
+} // namespace llvm
+
+#endif
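
The MappingTraits specializations declared above only get their mapping() bodies in a separate .cpp file, and the MachOYAML hunk just below embeds a DWARFYAML::Data member so Mach-O YAML documents can carry these sections. As a rough sketch of how such an implementation could look (the YAML key names "debug_str", "debug_abbrev" and "debug_aranges" are assumptions for illustration, not taken from this patch):

  #include "llvm/ObjectYAML/DWARFYAML.h"

  namespace llvm {
  namespace yaml {

  void MappingTraits<DWARFYAML::Data>::mapping(IO &IO, DWARFYAML::Data &DWARF) {
    // Each section is optional so a YAML document may describe only the
    // pieces it cares about.
    IO.mapOptional("debug_str", DWARF.DebugStrings);
    IO.mapOptional("debug_abbrev", DWARF.AbbrevDecls);
    IO.mapOptional("debug_aranges", DWARF.ARanges);
  }

  } // namespace llvm::yaml
  } // namespace llvm
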
diff --git a/include/llvm/ObjectYAML/MachOYAML.h b/include/llvm/ObjectYAML/MachOYAML.h
index 2fbd71f..657973d 100644
--- a/include/llvm/ObjectYAML/MachOYAML.h
+++ b/include/llvm/ObjectYAML/MachOYAML.h
@@ -17,6 +17,7 @@
 #define LLVM_OBJECTYAML_MACHOYAML_H
 
 #include "llvm/ObjectYAML/YAML.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
 #include "llvm/Support/MachO.h"
 
 namespace llvm {
@@ -109,6 +110,7 @@
   std::vector<LoadCommand> LoadCommands;
   std::vector<Section> Sections;
   LinkEditData LinkEdit;
+  DWARFYAML::Data DWARF;
 };
 
 struct FatHeader {
@@ -143,7 +145,6 @@
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::BindOpcode)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::ExportEntry)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::NListEntry)
-LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::StringRef)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::Object)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::FatArch)
 
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index 515c18e..204672f 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -837,7 +837,6 @@
   typedef OptionValue<DataType> OptVal;
 
 protected:
-  // Workaround Clang PR22793
   ~basic_parser() = default;
 };
 
diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index cb6c250..a7c77ff 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h
@@ -108,7 +108,8 @@
 #define HANDLE_DW_OP(ID, NAME) DW_OP_##NAME = ID,
 #include "llvm/Support/Dwarf.def"
   DW_OP_lo_user = 0xe0,
-  DW_OP_hi_user = 0xff
+  DW_OP_hi_user = 0xff,
+  DW_OP_LLVM_fragment = 0x1000 ///< Only used in LLVM metadata.
 };
 
 enum TypeKind {
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 4d52704..57082a3 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -1030,6 +1030,7 @@
 
   PT_OPENBSD_RANDOMIZE = 0x65a3dbe6, // Fill with random data.
   PT_OPENBSD_WXNEEDED = 0x65a3dbe7,  // Program does W^X violations.
+  PT_OPENBSD_BOOTDATA = 0x65a41be6,  // Section for boot arguments.
 
   // ARM program header types.
   PT_ARM_ARCHEXT = 0x70000000, // Platform architecture compatibility info
diff --git a/include/llvm/Support/FormatVariadicDetails.h b/include/llvm/Support/FormatVariadicDetails.h
index edde6db..7765537 100644
--- a/include/llvm/Support/FormatVariadicDetails.h
+++ b/include/llvm/Support/FormatVariadicDetails.h
@@ -56,6 +56,7 @@
 //
 // void format(raw_ostream &, StringRef);
 //
+// It is assumed T is a non-reference type.
 template <class T, class Enable = void> class has_FormatMember {
 public:
   static bool const value = false;
@@ -63,8 +64,11 @@
 
 template <class T>
 class has_FormatMember<T,
-                       typename std::enable_if<std::is_class<T>::value>::type> {
-  using Signature_format = void (T::*)(llvm::raw_ostream &S, StringRef Options);
+                       typename std::enable_if<std::is_class<T>::value &&
+                                               std::is_const<T>::value>::type> {
+  using CleanT = typename std::remove_volatile<T>::type;
+  using Signature_format = void (CleanT::*)(llvm::raw_ostream &S,
+                                            StringRef Options) const;
 
   template <typename U>
   static char test2(SameType<Signature_format, &U::format> *);
@@ -72,7 +76,25 @@
   template <typename U> static double test2(...);
 
 public:
-  static bool const value = (sizeof(test2<T>(nullptr)) == 1);
+  static bool const value = (sizeof(test2<CleanT>(nullptr)) == 1);
+};
+
+template <class T>
+class has_FormatMember<
+    T, typename std::enable_if<std::is_class<T>::value &&
+                               !std::is_const<T>::value>::type> {
+  using CleanT = typename std::remove_cv<T>::type;
+  using Signature_format = void (CleanT::*)(llvm::raw_ostream &S,
+                                            StringRef Options);
+
+  template <typename U>
+  static char test2(SameType<Signature_format, &U::format> *);
+
+  template <typename U> static double test2(...);
+
+public:
+  static bool const value =
+      (sizeof(test2<CleanT>(nullptr)) == 1) || has_FormatMember<const T>::value;
 };
 
 // Test if format_provider<T> is defined on T and contains a member function
@@ -98,15 +120,18 @@
 // based format() invocation.
 template <typename T>
 struct uses_format_member
-    : public std::integral_constant<bool, has_FormatMember<T>::value> {};
+    : public std::integral_constant<
+          bool,
+          has_FormatMember<typename std::remove_reference<T>::type>::value> {};
 
 // Simple template that decides whether a type T should use the format_provider
 // based format() invocation.  The member function takes priority, so this test
 // will only be true if there is not ALSO a format member.
 template <typename T>
 struct uses_format_provider
-    : public std::integral_constant<bool, !has_FormatMember<T>::value &&
-                                              has_FormatProvider<T>::value> {};
+    : public std::integral_constant<
+          bool, !uses_format_member<T>::value && has_FormatProvider<T>::value> {
+};
 
 // Simple template that decides whether a type T has neither a member-function
 // nor format_provider based implementation that it can use.  Mostly used so
@@ -114,8 +139,9 @@
 // implementation can be located.
 template <typename T>
 struct uses_missing_provider
-    : public std::integral_constant<bool, !has_FormatMember<T>::value &&
-                                              !has_FormatProvider<T>::value> {};
+    : public std::integral_constant<bool,
+                                    !uses_format_member<T>::value &&
+                                        !uses_format_provider<T>::value> {};
 
 template <typename T>
 typename std::enable_if<uses_format_member<T>::value,
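
A small compile-time check of what the reworked traits above are meant to accept. It assumes these helpers live in the llvm::detail namespace like the rest of this header's machinery, so treat it as a sketch rather than a drop-in test:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/FormatVariadicDetails.h"
  #include "llvm/Support/raw_ostream.h"

  struct Point {
    int X, Y;
    void format(llvm::raw_ostream &OS, llvm::StringRef Options) const {
      OS << '(' << X << ',' << Y << ')';
    }
  };

  // With the const-aware detection plus the remove_reference added to
  // uses_format_member, both of these evaluate to true; previously a const
  // or reference-qualified argument was not detected as having a format
  // member at all.
  static_assert(llvm::detail::uses_format_member<Point>::value, "");
  static_assert(llvm::detail::uses_format_member<const Point &>::value, "");
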
diff --git a/include/llvm/Support/StringSaver.h b/include/llvm/Support/StringSaver.h
index 9ef5ccf..fcddd4c 100644
--- a/include/llvm/Support/StringSaver.h
+++ b/include/llvm/Support/StringSaver.h
@@ -16,8 +16,8 @@
 
 namespace llvm {
 
-/// \brief Saves strings in the inheritor's stable storage and returns a stable
-/// raw character pointer.
+/// \brief Saves strings in the inheritor's stable storage and returns a
+/// StringRef with a stable character pointer.
 class StringSaver final {
   BumpPtrAllocator &Alloc;
 
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index f9a7697..89f3336 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -142,6 +142,13 @@
     CmpXChg, // Expand the instruction into cmpxchg; used by at least X86.
   };
 
+  /// Enum that specifies when a multiplication should be expanded.
+  enum class MulExpansionKind {
+    Always,            // Always expand the instruction.
+    OnlyLegalOrCustom, // Only expand when the resulting instructions are legal
+                       // or custom.
+  };
+
   static ISD::NodeType getExtendForContent(BooleanContent Content) {
     switch (Content) {
     case UndefinedBooleanContent:
@@ -3036,6 +3043,22 @@
   // Legalization utility functions
   //
 
+  /// Expand a MUL or [US]MUL_LOHI of n-bit values into two or four nodes,
+  /// respectively, each computing an n/2-bit part of the result.
+  /// \param Result A vector that will be filled with the parts of the result
+  ///        in little-endian order.
+  /// \param LL Low bits of the LHS of the MUL.  You can use this parameter
+  ///        if you want to control how low bits are extracted from the LHS.
+  /// \param LH High bits of the LHS of the MUL.  See LL for meaning.
+/// \param RL Low bits of the RHS of the MUL.  See LL for meaning.
+/// \param RH High bits of the RHS of the MUL.  See LL for meaning.
+/// \returns true if the node has been expanded, false if it has not.
+  bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS,
+                      SDValue RHS, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
+                      SelectionDAG &DAG, MulExpansionKind Kind,
+                      SDValue LL = SDValue(), SDValue LH = SDValue(),
+                      SDValue RL = SDValue(), SDValue RH = SDValue()) const;
+
   /// Expand a MUL into two nodes.  One that computes the high bits of
   /// the result and one that computes the low bits.
   /// \param HiLoVT The value type to use for the Lo and Hi nodes.
@@ -3046,9 +3069,9 @@
   /// \param RH High bits of the RHS of the MUL.  See LL for meaning.
   /// \returns true if the node has been expanded, false if it has not.
   bool expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
-                 SelectionDAG &DAG, SDValue LL = SDValue(),
-                 SDValue LH = SDValue(), SDValue RL = SDValue(),
-                 SDValue RH = SDValue()) const;
+                 SelectionDAG &DAG, MulExpansionKind Kind,
+                 SDValue LL = SDValue(), SDValue LH = SDValue(),
+                 SDValue RL = SDValue(), SDValue RH = SDValue()) const;
 
   /// Expand float(f32) to SINT(i64) conversion
   /// \param N Node to expand
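
The arithmetic behind expandMUL_LOHI and the reworked expandMUL above is the schoolbook split of an n-bit product into n/2-bit parts; the new MulExpansionKind argument only decides whether that expansion is emitted when the narrower operations are legal or custom. A plain C++ illustration of the 64-bit case (this is only the math the DAG expansion mirrors, not the legalizer code itself):

  #include <cstdint>

  // Compute the 128-bit product of LHS and RHS as Hi:Lo using only 32-bit
  // partial products, mirroring the LL/LH/RL/RH naming used above.
  static void mulLoHi64(uint64_t LHS, uint64_t RHS, uint64_t &Lo, uint64_t &Hi) {
    uint64_t LL = LHS & 0xffffffffu, LH = LHS >> 32;
    uint64_t RL = RHS & 0xffffffffu, RH = RHS >> 32;
    uint64_t Mid1 = LL * RH, Mid2 = LH * RL;
    Lo = LL * RL;
    Hi = LH * RH;
    uint64_t Prev = Lo;
    Lo += Mid1 << 32;
    Hi += (Mid1 >> 32) + (Lo < Prev); // carry from the first cross term
    Prev = Lo;
    Lo += Mid2 << 32;
    Hi += (Mid2 >> 32) + (Lo < Prev); // carry from the second cross term
  }
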
diff --git a/include/llvm/Transforms/Utils/FunctionComparator.h b/include/llvm/Transforms/Utils/FunctionComparator.h
index 5e68bbb..a613fc3 100644
--- a/include/llvm/Transforms/Utils/FunctionComparator.h
+++ b/include/llvm/Transforms/Utils/FunctionComparator.h
@@ -15,13 +15,21 @@
 #ifndef LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATOR_H
 #define LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATOR_H
 
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/ValueMap.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include <cstdint>
+#include <tuple>
 
 namespace llvm {
 
+class GetElementPtrInst;
+
 /// GlobalNumberState assigns an integer to each global value in the program,
 /// which is used by the comparison routine to order references to globals. This
 /// state must be preserved throughout the pass, because Functions and other
@@ -44,20 +52,23 @@
   typedef ValueMap<GlobalValue *, uint64_t, Config> ValueNumberMap;
   ValueNumberMap GlobalNumbers;
   // The next unused serial number to assign to a global.
-  uint64_t NextNumber;
-  public:
-    GlobalNumberState() : GlobalNumbers(), NextNumber(0) {}
-    uint64_t getNumber(GlobalValue* Global) {
-      ValueNumberMap::iterator MapIter;
-      bool Inserted;
-      std::tie(MapIter, Inserted) = GlobalNumbers.insert({Global, NextNumber});
-      if (Inserted)
-        NextNumber++;
-      return MapIter->second;
-    }
-    void clear() {
-      GlobalNumbers.clear();
-    }
+  uint64_t NextNumber = 0;
+
+public:
+  GlobalNumberState() = default;
+
+  uint64_t getNumber(GlobalValue* Global) {
+    ValueNumberMap::iterator MapIter;
+    bool Inserted;
+    std::tie(MapIter, Inserted) = GlobalNumbers.insert({Global, NextNumber});
+    if (Inserted)
+      NextNumber++;
+    return MapIter->second;
+  }
+
+  void clear() {
+    GlobalNumbers.clear();
+  }
 };
 
 /// FunctionComparator - Compares two functions to determine whether or not
@@ -78,7 +89,6 @@
   static FunctionHash functionHash(Function &);
 
 protected:
-
   /// Start the comparison.
   void beginCompare() {
     sn_mapL.clear();
@@ -302,7 +312,6 @@
   const Function *FnL, *FnR;
 
 private:
-
   int cmpOrderings(AtomicOrdering L, AtomicOrdering R) const;
   int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const;
   int cmpAttrs(const AttributeSet L, const AttributeSet R) const;
@@ -362,6 +371,6 @@
   GlobalNumberState* GlobalNumbers;
 };
 
-}
+} // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATOR_H
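
GlobalNumberState::getNumber above hands out dense, first-come-first-served ids to globals. A standalone analogue of that pattern (plain std::map instead of ValueMap, purely illustrative) behaves the same way:

  #include <cstdint>
  #include <map>

  static uint64_t getNumber(std::map<const void *, uint64_t> &Numbers,
                            uint64_t &NextNumber, const void *Global) {
    // insert() is a no-op if the key already exists, so each global keeps the
    // number it was assigned the first time it was seen.
    auto Result = Numbers.insert({Global, NextNumber});
    if (Result.second)
      ++NextNumber;
    return Result.first->second;
  }
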
diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp
index decce99..c72b0ef 100644
--- a/lib/Analysis/CGSCCPassManager.cpp
+++ b/lib/Analysis/CGSCCPassManager.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/InstIterator.h"
 
 using namespace llvm;
 
@@ -156,52 +157,46 @@
   SmallPtrSet<Function *, 16> RetainedEdges;
   SmallSetVector<Function *, 4> PromotedRefTargets;
   SmallSetVector<Function *, 4> DemotedCallTargets;
+
   // First walk the function and handle all called functions. We do this first
   // because if there is a single call edge, whether there are ref edges is
   // irrelevant.
-  for (BasicBlock &BB : F)
-    for (Instruction &I : BB)
-      if (auto CS = CallSite(&I))
-        if (Function *Callee = CS.getCalledFunction())
-          if (Visited.insert(Callee).second && !Callee->isDeclaration()) {
-            const Edge *E = N.lookup(*Callee);
-            // FIXME: We should really handle adding new calls. While it will
-            // make downstream usage more complex, there is no fundamental
-            // limitation and it will allow passes within the CGSCC to be a bit
-            // more flexible in what transforms they can do. Until then, we
-            // verify that new calls haven't been introduced.
-            assert(E && "No function transformations should introduce *new* "
-                        "call edges! Any new calls should be modeled as "
-                        "promoted existing ref edges!");
-            RetainedEdges.insert(Callee);
-            if (!E->isCall())
-              PromotedRefTargets.insert(Callee);
-          }
+  for (Instruction &I : instructions(F))
+    if (auto CS = CallSite(&I))
+      if (Function *Callee = CS.getCalledFunction())
+        if (Visited.insert(Callee).second && !Callee->isDeclaration()) {
+          const Edge *E = N.lookup(*Callee);
+          // FIXME: We should really handle adding new calls. While it will
+          // make downstream usage more complex, there is no fundamental
+          // limitation and it will allow passes within the CGSCC to be a bit
+          // more flexible in what transforms they can do. Until then, we
+          // verify that new calls haven't been introduced.
+          assert(E && "No function transformations should introduce *new* "
+                      "call edges! Any new calls should be modeled as "
+                      "promoted existing ref edges!");
+          RetainedEdges.insert(Callee);
+          if (!E->isCall())
+            PromotedRefTargets.insert(Callee);
+        }
 
   // Now walk all references.
-  for (BasicBlock &BB : F)
-    for (Instruction &I : BB) {
-      for (Value *Op : I.operand_values())
-        if (Constant *C = dyn_cast<Constant>(Op))
-          if (Visited.insert(C).second)
-            Worklist.push_back(C);
+  for (Instruction &I : instructions(F))
+    for (Value *Op : I.operand_values())
+      if (Constant *C = dyn_cast<Constant>(Op))
+        if (Visited.insert(C).second)
+          Worklist.push_back(C);
 
-      LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) {
-        // Skip declarations.
-        if (Referee.isDeclaration())
-          return;
-
-        const Edge *E = N.lookup(Referee);
-        // FIXME: Similarly to new calls, we also currently preclude
-        // introducing new references. See above for details.
-        assert(E && "No function transformations should introduce *new* ref "
-                    "edges! Any new ref edges would require IPO which "
-                    "function passes aren't allowed to do!");
-        RetainedEdges.insert(&Referee);
-        if (E->isCall())
-          DemotedCallTargets.insert(&Referee);
-      });
-    }
+  LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) {
+    const Edge *E = N.lookup(Referee);
+    // FIXME: Similarly to new calls, we also currently preclude
+    // introducing new references. See above for details.
+    assert(E && "No function transformations should introduce *new* ref "
+                "edges! Any new ref edges would require IPO which "
+                "function passes aren't allowed to do!");
+    RetainedEdges.insert(&Referee);
+    if (E->isCall())
+      DemotedCallTargets.insert(&Referee);
+  });
 
   // First remove all of the edges that are no longer present in this function.
   // We have to build a list of dead targets first and then remove them as the
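
The new InstIterator include exists so the walk above can use the flat instructions(F) range instead of the nested basic-block loop it replaces; the two are equivalent, as this self-contained comparison shows (the countInstrs* helpers are illustrative, not part of the patch):

  #include "llvm/IR/Function.h"
  #include "llvm/IR/InstIterator.h"
  using namespace llvm;

  static unsigned countInstrsNested(Function &F) {
    unsigned N = 0;
    for (BasicBlock &BB : F)
      for (Instruction &I : BB) {
        (void)I;
        ++N;
      }
    return N;
  }

  static unsigned countInstrsFlat(Function &F) {
    unsigned N = 0;
    for (Instruction &I : instructions(F)) {
      (void)I;
      ++N;
    }
    return N;
  }
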
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 04d36ff..1c0bf01a 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -88,7 +88,7 @@
   return nullptr;
 }
 
-// Constant fold bitcast, symbolically evaluating it with DataLayout.
+/// Constant fold bitcast, symbolically evaluating it with DataLayout.
 /// This always returns a non-null constant, but it may be a
 /// ConstantExpr if unfoldable.
 Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
@@ -99,31 +99,33 @@
       !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types!
     return Constant::getAllOnesValue(DestTy);
 
-  // Handle a vector->integer cast.
-  if (auto *IT = dyn_cast<IntegerType>(DestTy)) {
-    auto *VTy = dyn_cast<VectorType>(C->getType());
-    if (!VTy)
-      return ConstantExpr::getBitCast(C, DestTy);
+  if (auto *VTy = dyn_cast<VectorType>(C->getType())) {
+    // Handle a vector->scalar integer/fp cast.
+    if (isa<IntegerType>(DestTy) || DestTy->isFloatingPointTy()) {
+      unsigned NumSrcElts = VTy->getNumElements();
+      Type *SrcEltTy = VTy->getElementType();
 
-    unsigned NumSrcElts = VTy->getNumElements();
-    Type *SrcEltTy = VTy->getElementType();
+      // If the vector is a vector of floating point, convert it to vector of int
+      // to simplify things.
+      if (SrcEltTy->isFloatingPointTy()) {
+        unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
+        Type *SrcIVTy =
+          VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElts);
+        // Ask IR to do the conversion now that #elts line up.
+        C = ConstantExpr::getBitCast(C, SrcIVTy);
+      }
 
-    // If the vector is a vector of floating point, convert it to vector of int
-    // to simplify things.
-    if (SrcEltTy->isFloatingPointTy()) {
-      unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
-      Type *SrcIVTy =
-        VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElts);
-      // Ask IR to do the conversion now that #elts line up.
-      C = ConstantExpr::getBitCast(C, SrcIVTy);
+      APInt Result(DL.getTypeSizeInBits(DestTy), 0);
+      if (Constant *CE = foldConstVectorToAPInt(Result, DestTy, C,
+                                                SrcEltTy, NumSrcElts, DL))
+        return CE;
+
+      if (isa<IntegerType>(DestTy))
+        return ConstantInt::get(DestTy, Result);
+
+      APFloat FP(DestTy->getFltSemantics(), Result);
+      return ConstantFP::get(DestTy->getContext(), FP);
     }
-
-    APInt Result(IT->getBitWidth(), 0);
-    if (Constant *CE = foldConstVectorToAPInt(Result, DestTy, C,
-                                              SrcEltTy, NumSrcElts, DL))
-      return CE;
-
-    return ConstantInt::get(IT, Result);
   }
 
   // The code below only handles casts to vectors currently.
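
For the vector-to-scalar branch added above, the fold amounts to packing the element bits into one APInt in data-layout order. A concrete illustration of the little-endian case (a sketch of the math only; the in-tree code goes through foldConstVectorToAPInt and consults the DataLayout):

  #include <cstdint>

  // bitcast <2 x i32> <i32 Elt0, i32 Elt1> to i64 on a little-endian target:
  // element 0 occupies the low 32 bits of the result.
  static uint64_t bitcastV2I32ToI64(uint32_t Elt0, uint32_t Elt1) {
    return (uint64_t)Elt0 | ((uint64_t)Elt1 << 32);
  }
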
@@ -732,14 +734,15 @@
                          Type *ResultTy, Optional<unsigned> InRangeIndex,
                          const DataLayout &DL, const TargetLibraryInfo *TLI) {
   Type *IntPtrTy = DL.getIntPtrType(ResultTy);
+  Type *IntPtrScalarTy = IntPtrTy->getScalarType();
 
   bool Any = false;
   SmallVector<Constant*, 32> NewIdxs;
   for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
     if ((i == 1 ||
-         !isa<StructType>(GetElementPtrInst::getIndexedType(SrcElemTy,
-             Ops.slice(1, i - 1)))) &&
-        Ops[i]->getType() != IntPtrTy) {
+         !isa<StructType>(GetElementPtrInst::getIndexedType(
+             SrcElemTy, Ops.slice(1, i - 1)))) &&
+        Ops[i]->getType() != (i == 1 ? IntPtrTy : IntPtrScalarTy)) {
       Any = true;
       NewIdxs.push_back(ConstantExpr::getCast(CastInst::getCastOpcode(Ops[i],
                                                                       true,
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 02a2753..0228a1b 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -1486,7 +1486,7 @@
 
   // Don't inline functions which can be interposed at link-time.  Don't inline
   // functions marked noinline or call sites marked noinline.
-  // Note: inlining non-exact non-interposable fucntions is fine, since we know
+  // Note: inlining non-exact non-interposable functions is fine, since we know
   // we have *a* correct implementation of the source level function.
   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
       CS.isNoInline())
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index c1ba88f..e620e09 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1127,6 +1127,10 @@
   if (match(Op1, m_Undef()))
     return Op1;
 
+  // X / 1.0 -> X
+  if (match(Op1, m_FPOne()))
+    return Op0;
+
   // 0 / X -> 0
   // Requires that NaNs are off (X could be zero) and signed zeroes are
   // ignored (X could be positive or negative, so the output sign is unknown).
@@ -1518,10 +1522,40 @@
   return nullptr;
 }
 
+/// Commuted variants are assumed to be handled by calling this function again
+/// with the parameters swapped.
+static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
+  ICmpInst::Predicate Pred0, Pred1;
+  Value *A, *B;
+  if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) ||
+      !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
+    return nullptr;
+
+  // We have (icmp Pred0, A, B) & (icmp Pred1, A, B).
+  // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
+  // can eliminate Op1 from this 'and'.
+  if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
+    return Op0;
+
+  // Check for any combination of predicates that are guaranteed to be disjoint.
+  if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
+      (Pred0 == ICmpInst::ICMP_EQ && ICmpInst::isFalseWhenEqual(Pred1)) ||
+      (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT) ||
+      (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT))
+    return getFalse(Op0->getType());
+
+  return nullptr;
+}
+
+/// Commuted variants are assumed to be handled by calling this function again
+/// with the parameters swapped.
 static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
   if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true))
     return X;
 
+  if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1))
+    return X;
+
   // Look for this pattern: (icmp V, C0) & (icmp V, C1)).
   Type *ITy = Op0->getType();
   ICmpInst::Predicate Pred0, Pred1;
@@ -1700,12 +1734,41 @@
                            RecursionLimit);
 }
 
-/// Simplify (or (icmp ...) (icmp ...)) to true when we can tell that the union
-/// contains all possible values.
+/// Commuted variants are assumed to be handled by calling this function again
+/// with the parameters swapped.
+static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
+  ICmpInst::Predicate Pred0, Pred1;
+  Value *A ,*B;
+  if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) ||
+      !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
+    return nullptr;
+
+  // We have (icmp Pred0, A, B) | (icmp Pred1, A, B).
+  // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
+  // can eliminate Op0 from this 'or'.
+  if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
+    return Op1;
+
+  // Check for any combination of predicates that cover the entire range of
+  // possibilities.
+  if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
+      (Pred0 == ICmpInst::ICMP_NE && ICmpInst::isTrueWhenEqual(Pred1)) ||
+      (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGE) ||
+      (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGE))
+    return getTrue(Op0->getType());
+
+  return nullptr;
+}
+
+/// Commuted variants are assumed to be handled by calling this function again
+/// with the parameters swapped.
 static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
   if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false))
     return X;
 
+  if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1))
+    return X;
+
   // (icmp (add V, C0), C1) | (icmp V, C0)
   ICmpInst::Predicate Pred0, Pred1;
   const APInt *C0, *C1;
@@ -4034,6 +4097,8 @@
     return SimplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse);
   case Instruction::FMul:
     return SimplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse);
+  case Instruction::FDiv:
+    return SimplifyFDivInst(LHS, RHS, FMF, Q, MaxRecurse);
   default:
     return SimplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse);
   }
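
The two helpers added above recognize three shapes of "icmp A,B op icmp A,B": an implied compare is redundant, disjoint predicates make the and false, and covering predicates make the or true. A tiny standalone check of those identities over ordinary integers (plain C++, not the InstSimplify API):

  #include <cassert>

  int main() {
    for (int A = -4; A <= 4; ++A)
      for (int B = -4; B <= 4; ++B) {
        assert(!((A < B) && (A > B)));            // disjoint -> false
        assert((A <= B) || (A >= B));             // covering -> true
        assert(((A < B) && (A <= B)) == (A < B)); // implied cmp drops out
      }
    return 0;
  }
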
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index fe635c4..9bc0747 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Instructions.h"
@@ -25,21 +26,11 @@
 static void addEdge(SmallVectorImpl<LazyCallGraph::Edge> &Edges,
                     DenseMap<Function *, int> &EdgeIndexMap, Function &F,
                     LazyCallGraph::Edge::Kind EK) {
-  // Note that we consider *any* function with a definition to be a viable
-  // edge. Even if the function's definition is subject to replacement by
-  // some other module (say, a weak definition) there may still be
-  // optimizations which essentially speculate based on the definition and
-  // a way to check that the specific definition is in fact the one being
-  // used. For example, this could be done by moving the weak definition to
-  // a strong (internal) definition and making the weak definition be an
-  // alias. Then a test of the address of the weak function against the new
-  // strong definition's address would be an effective way to determine the
-  // safety of optimizing a direct call edge.
-  if (!F.isDeclaration() &&
-      EdgeIndexMap.insert({&F, Edges.size()}).second) {
-    DEBUG(dbgs() << "    Added callable function: " << F.getName() << "\n");
-    Edges.emplace_back(LazyCallGraph::Edge(F, EK));
-  }
+  if (!EdgeIndexMap.insert({&F, Edges.size()}).second)
+    return;
+
+  DEBUG(dbgs() << "    Added callable function: " << F.getName() << "\n");
+  Edges.emplace_back(LazyCallGraph::Edge(F, EK));
 }
 
 LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
@@ -56,14 +47,26 @@
   // are trivially added, but to accumulate the latter we walk the instructions
   // and add every operand which is a constant to the worklist to process
   // afterward.
+  //
+  // Note that we consider *any* function with a definition to be a viable
+  // edge. Even if the function's definition is subject to replacement by
+  // some other module (say, a weak definition) there may still be
+  // optimizations which essentially speculate based on the definition and
+  // a way to check that the specific definition is in fact the one being
+  // used. For example, this could be done by moving the weak definition to
+  // a strong (internal) definition and making the weak definition be an
+  // alias. Then a test of the address of the weak function against the new
+  // strong definition's address would be an effective way to determine the
+  // safety of optimizing a direct call edge.
   for (BasicBlock &BB : F)
     for (Instruction &I : BB) {
       if (auto CS = CallSite(&I))
         if (Function *Callee = CS.getCalledFunction())
-          if (Callees.insert(Callee).second) {
-            Visited.insert(Callee);
-            addEdge(Edges, EdgeIndexMap, *Callee, LazyCallGraph::Edge::Call);
-          }
+          if (!Callee->isDeclaration())
+            if (Callees.insert(Callee).second) {
+              Visited.insert(Callee);
+              addEdge(Edges, EdgeIndexMap, *Callee, LazyCallGraph::Edge::Call);
+            }
 
       for (Value *Op : I.operand_values())
         if (Constant *C = dyn_cast<Constant>(Op))
@@ -258,6 +261,9 @@
            "SCC doesn't think it is inside this RefSCC!");
     bool Inserted = SCCSet.insert(C).second;
     assert(Inserted && "Found a duplicate SCC!");
+    auto IndexIt = SCCIndices.find(C);
+    assert(IndexIt != SCCIndices.end() &&
+           "Found an SCC that doesn't have an index!");
   }
 
   // Check that our indices map correctly.
@@ -286,6 +292,20 @@
                "Edge to a RefSCC missing us in its parent set.");
       }
   }
+
+  // Check that our parents are actually parents.
+  for (RefSCC *ParentRC : Parents) {
+    assert(ParentRC != this && "Cannot be our own parent!");
+    auto HasConnectingEdge = [&] {
+      for (SCC &C : *ParentRC)
+        for (Node &N : C)
+          for (Edge &E : N)
+            if (G->lookupRefSCC(*E.getNode()) == this)
+              return true;
+      return false;
+    };
+    assert(HasConnectingEdge() && "No edge connects the parent to us!");
+  }
 }
 #endif
 
@@ -934,9 +954,13 @@
           SourceC, *this, G->PostOrderRefSCCs, G->RefSCCIndices,
           ComputeSourceConnectedSet, ComputeTargetConnectedSet);
 
-  // Build a set so we can do fast tests for whether a merge is occuring.
+  // Build a set so we can do fast tests for whether a RefSCC will end up as
+  // part of the merged RefSCC.
   SmallPtrSet<RefSCC *, 16> MergeSet(MergeRange.begin(), MergeRange.end());
 
+  // This RefSCC will always be part of that set, so just insert it here.
+  MergeSet.insert(this);
+
   // Now that we have identified all of the SCCs which need to be merged into
   // a connected set with the inserted edge, merge all of them into this SCC.
   SmallVector<SCC *, 16> MergedSCCs;
@@ -1203,9 +1227,8 @@
           }
 
           // If this child isn't currently in this RefSCC, no need to process
-          // it.
-          // However, we do need to remove this RefSCC from its RefSCC's parent
-          // set.
+          // it. However, we do need to remove this RefSCC from its RefSCC's
+          // parent set.
           RefSCC &ChildRC = *G->lookupRefSCC(ChildN);
           ChildRC.Parents.erase(this);
           ++I;
@@ -1305,7 +1328,7 @@
     RefSCC &RC = *Result[SCCNumber - 1];
     int SCCIndex = RC.SCCs.size();
     RC.SCCs.push_back(C);
-    SCCIndices[C] = SCCIndex;
+    RC.SCCIndices[C] = SCCIndex;
     C->OuterRefSCC = &RC;
   }
 
@@ -1358,12 +1381,13 @@
   SmallVector<RefSCC *, 4> OldParents(Parents.begin(), Parents.end());
   Parents.clear();
   for (RefSCC *ParentRC : OldParents)
-    for (SCC *ParentC : ParentRC->SCCs)
-      for (Node &ParentN : *ParentC)
+    for (SCC &ParentC : *ParentRC)
+      for (Node &ParentN : ParentC)
         for (Edge &E : ParentN) {
           assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
           RefSCC &RC = *G->lookupRefSCC(*E.getNode());
-          RC.Parents.insert(ParentRC);
+          if (&RC != ParentRC)
+            RC.Parents.insert(ParentRC);
         }
 
   // If this SCC stopped being a leaf through this edge removal, remove it from
@@ -1376,6 +1400,12 @@
         std::remove(G->LeafRefSCCs.begin(), G->LeafRefSCCs.end(), this),
         G->LeafRefSCCs.end());
 
+#ifndef NDEBUG
+  // Verify all of the new RefSCCs.
+  for (RefSCC *RC : Result)
+    RC->verify();
+#endif
+
   // Return the new list of SCCs.
   return Result;
 }
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 646895a..e51e821 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -71,12 +71,14 @@
     /// "nothing known yet".
     undefined,
 
-    /// This Value has a specific constant value.  (For integers, constantrange
-    /// is used instead.)
+    /// This Value has a specific constant value.  (For constant integers,
+    /// constantrange is used instead.  Integer typed constantexprs can appear
+    /// as constant.) 
     constant,
 
-    /// This Value is known to not have the specified value.  (For integers,
-    /// constantrange is used instead.)
+    /// This Value is known to not have the specified value.  (For constant
+    /// integers, constantrange is used instead.  As above, integer typed
+    /// constantexprs can appear here.)
     notconstant,
 
     /// The Value falls within this range. (Used only for integer typed values.)
@@ -140,37 +142,37 @@
     return Range;
   }
 
-  /// Return true if this is a change in status.
-  bool markOverdefined() {
+private:
+  void markOverdefined() {
     if (isOverdefined())
-      return false;
+      return;
     Tag = overdefined;
-    return true;
   }
 
-  /// Return true if this is a change in status.
-  bool markConstant(Constant *V) {
+  void markConstant(Constant *V) {
     assert(V && "Marking constant with NULL");
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
-      return markConstantRange(ConstantRange(CI->getValue()));
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      markConstantRange(ConstantRange(CI->getValue()));
+      return;
+    }
     if (isa<UndefValue>(V))
-      return false;
+      return;
 
     assert((!isConstant() || getConstant() == V) &&
            "Marking constant with different value");
     assert(isUndefined());
     Tag = constant;
     Val = V;
-    return true;
   }
 
-  /// Return true if this is a change in status.
-  bool markNotConstant(Constant *V) {
+  void markNotConstant(Constant *V) {
     assert(V && "Marking constant with NULL");
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
-      return markConstantRange(ConstantRange(CI->getValue()+1, CI->getValue()));
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      markConstantRange(ConstantRange(CI->getValue()+1, CI->getValue()));
+      return;
+    }
     if (isa<UndefValue>(V))
-      return false;
+      return;
 
     assert((!isConstant() || getConstant() != V) &&
            "Marking constant !constant with same value");
@@ -179,100 +181,70 @@
     assert(isUndefined() || isConstant());
     Tag = notconstant;
     Val = V;
-    return true;
   }
 
-  /// Return true if this is a change in status.
-  bool markConstantRange(ConstantRange NewR) {
+  void markConstantRange(ConstantRange NewR) {
     if (isConstantRange()) {
       if (NewR.isEmptySet())
-        return markOverdefined();
-
-      bool changed = Range != NewR;
-      Range = std::move(NewR);
-      return changed;
+        markOverdefined();
+      else {
+        Range = std::move(NewR);
+      }
+      return;
     }
 
     assert(isUndefined());
     if (NewR.isEmptySet())
-      return markOverdefined();
-
-    Tag = constantrange;
-    Range = std::move(NewR);
-    return true;
+      markOverdefined();
+    else {
+      Tag = constantrange;
+      Range = std::move(NewR);
+    }
   }
 
+public:
+
   /// Merge the specified lattice value into this one, updating this
   /// one and returning true if anything changed.
-  bool mergeIn(const LVILatticeVal &RHS, const DataLayout &DL) {
-    if (RHS.isUndefined() || isOverdefined()) return false;
-    if (RHS.isOverdefined()) return markOverdefined();
+  void mergeIn(const LVILatticeVal &RHS, const DataLayout &DL) {
+    if (RHS.isUndefined() || isOverdefined())
+      return;
+    if (RHS.isOverdefined()) {
+      markOverdefined();
+      return;
+    }
 
     if (isUndefined()) {
-      Tag = RHS.Tag;
-      Val = RHS.Val;
-      Range = RHS.Range;
-      return true;
+      *this = RHS;
+      return;
     }
 
     if (isConstant()) {
-      if (RHS.isConstant()) {
-        if (Val == RHS.Val)
-          return false;
-        return markOverdefined();
-      }
-
-      if (RHS.isNotConstant()) {
-        if (Val == RHS.Val)
-          return markOverdefined();
-
-        // Unless we can prove that the two Constants are different, we must
-        // move to overdefined.
-        if (ConstantInt *Res =
-                dyn_cast<ConstantInt>(ConstantFoldCompareInstOperands(
-                    CmpInst::ICMP_NE, getConstant(), RHS.getNotConstant(), DL)))
-          if (Res->isOne())
-            return markNotConstant(RHS.getNotConstant());
-
-        return markOverdefined();
-      }
-
-      return markOverdefined();
+      if (RHS.isConstant() && Val == RHS.Val)
+          return;
+      markOverdefined();
+      return;
     }
 
     if (isNotConstant()) {
-      if (RHS.isConstant()) {
-        if (Val == RHS.Val)
-          return markOverdefined();
-
-        // Unless we can prove that the two Constants are different, we must
-        // move to overdefined.
-        if (ConstantInt *Res =
-                dyn_cast<ConstantInt>(ConstantFoldCompareInstOperands(
-                    CmpInst::ICMP_NE, getNotConstant(), RHS.getConstant(), DL)))
-          if (Res->isOne())
-            return false;
-
-        return markOverdefined();
-      }
-
-      if (RHS.isNotConstant()) {
-        if (Val == RHS.Val)
-          return false;
-        return markOverdefined();
-      }
-
-      return markOverdefined();
+      if (RHS.isNotConstant() && Val == RHS.Val)
+          return;
+      markOverdefined();
+      return;
     }
 
     assert(isConstantRange() && "New LVILattice type?");
-    if (!RHS.isConstantRange())
-      return markOverdefined();
-
+    if (!RHS.isConstantRange()) {
+      // We can get here if we've encountered a constantexpr of integer type
+      // and merge it with a constantrange.
+      markOverdefined();
+      return;
+    }
     ConstantRange NewR = Range.unionWith(RHS.getConstantRange());
     if (NewR.isFullSet())
-      return markOverdefined();
-    return markConstantRange(NewR);
+      markOverdefined();
+    else
+      markConstantRange(NewR);
   }
 };
 
@@ -620,6 +592,7 @@
   // returned means that the work item was not completely processed and must
   // be revisited after going through the new items.
   bool solveBlockValue(Value *Val, BasicBlock *BB);
+  bool solveBlockValueImpl(LVILatticeVal &Res, Value *Val, BasicBlock *BB);
   bool solveBlockValueNonLocal(LVILatticeVal &BBLV, Value *Val, BasicBlock *BB);
   bool solveBlockValuePHINode(LVILatticeVal &BBLV, PHINode *PN, BasicBlock *BB);
   bool solveBlockValueSelect(LVILatticeVal &BBLV, SelectInst *S,
@@ -744,28 +717,26 @@
   // Hold off inserting this value into the Cache in case we have to return
   // false and come back later.
   LVILatticeVal Res;
+  if (!solveBlockValueImpl(Res, Val, BB))
+    // Work pushed, will revisit
+    return false;
+
+  TheCache.insertResult(Val, BB, Res);
+  return true;
+}
+
+bool LazyValueInfoImpl::solveBlockValueImpl(LVILatticeVal &Res,
+                                            Value *Val, BasicBlock *BB) {
 
   Instruction *BBI = dyn_cast<Instruction>(Val);
-  if (!BBI || BBI->getParent() != BB) {
-    if (!solveBlockValueNonLocal(Res, Val, BB))
-      return false;
-   TheCache.insertResult(Val, BB, Res);
-   return true;
-  }
+  if (!BBI || BBI->getParent() != BB)
+    return solveBlockValueNonLocal(Res, Val, BB);
 
-  if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
-    if (!solveBlockValuePHINode(Res, PN, BB))
-      return false;
-    TheCache.insertResult(Val, BB, Res);
-    return true;
-  }
+  if (PHINode *PN = dyn_cast<PHINode>(BBI))
+    return solveBlockValuePHINode(Res, PN, BB);
 
-  if (auto *SI = dyn_cast<SelectInst>(BBI)) {
-    if (!solveBlockValueSelect(Res, SI, BB))
-      return false;
-    TheCache.insertResult(Val, BB, Res);
-    return true;
-  }
+  if (auto *SI = dyn_cast<SelectInst>(BBI))
+    return solveBlockValueSelect(Res, SI, BB);
 
   // If this value is a nonnull pointer, record its range and bail out.  Note
   // that for all other pointer typed values, we terminate the search at the
@@ -779,29 +750,20 @@
   PointerType *PT = dyn_cast<PointerType>(BBI->getType());
   if (PT && isKnownNonNull(BBI)) {
     Res = LVILatticeVal::getNot(ConstantPointerNull::get(PT));
-    TheCache.insertResult(Val, BB, Res);
     return true;
   }
   if (BBI->getType()->isIntegerTy()) {
-    if (isa<CastInst>(BBI)) {
-      if (!solveBlockValueCast(Res, BBI, BB))
-        return false;
-      TheCache.insertResult(Val, BB, Res);
-      return true;
-    }
+    if (isa<CastInst>(BBI))
+      return solveBlockValueCast(Res, BBI, BB);
+    
     BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
-    if (BO && isa<ConstantInt>(BO->getOperand(1))) {
-      if (!solveBlockValueBinaryOp(Res, BBI, BB))
-        return false;
-      TheCache.insertResult(Val, BB, Res);
-      return true;
-    }
+    if (BO && isa<ConstantInt>(BO->getOperand(1)))
+      return solveBlockValueBinaryOp(Res, BBI, BB);
   }
 
   DEBUG(dbgs() << " compute BB '" << BB->getName()
                  << "' - unknown inst def found.\n");
   Res = getFromRangeMetadata(BBI);
-  TheCache.insertResult(Val, BB, Res);
   return true;
 }
 
@@ -869,7 +831,7 @@
       PointerType *PTy = cast<PointerType>(Val->getType());
       Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
     } else {
-      Result.markOverdefined();
+      Result = LVILatticeVal::getOverdefined();
     }
     BBLV = Result;
     return true;
@@ -993,28 +955,28 @@
   if (!hasBlockValue(SI->getTrueValue(), BB)) {
     if (pushBlockValue(std::make_pair(BB, SI->getTrueValue())))
       return false;
-    BBLV.markOverdefined();
+    BBLV = LVILatticeVal::getOverdefined();
     return true;
   }
   LVILatticeVal TrueVal = getBlockValue(SI->getTrueValue(), BB);
   // If we hit overdefined, don't ask more queries.  We want to avoid poisoning
   // extra slots in the table if we can.
   if (TrueVal.isOverdefined()) {
-    BBLV.markOverdefined();
+    BBLV = LVILatticeVal::getOverdefined();
     return true;
   }
 
   if (!hasBlockValue(SI->getFalseValue(), BB)) {
     if (pushBlockValue(std::make_pair(BB, SI->getFalseValue())))
       return false;
-    BBLV.markOverdefined();
+    BBLV = LVILatticeVal::getOverdefined();
     return true;
   }
   LVILatticeVal FalseVal = getBlockValue(SI->getFalseValue(), BB);
   // If we hit overdefined, don't ask more queries.  We want to avoid poisoning
   // extra slots in the table if we can.
   if (FalseVal.isOverdefined()) {
-    BBLV.markOverdefined();
+    BBLV = LVILatticeVal::getOverdefined();
     return true;
   }
 
@@ -1028,22 +990,22 @@
     // ValueTracking getting smarter looking back past our immediate inputs.)
     if (SelectPatternResult::isMinOrMax(SPR.Flavor) &&
         LHS == SI->getTrueValue() && RHS == SI->getFalseValue()) {
-      switch (SPR.Flavor) {
-      default:
-        llvm_unreachable("unexpected minmax type!");
-      case SPF_SMIN:                   /// Signed minimum
-        BBLV.markConstantRange(TrueCR.smin(FalseCR));
-        return true;
-      case SPF_UMIN:                   /// Unsigned minimum
-        BBLV.markConstantRange(TrueCR.umin(FalseCR));
-        return true;
-      case SPF_SMAX:                   /// Signed maximum
-        BBLV.markConstantRange(TrueCR.smax(FalseCR));
-        return true;
-      case SPF_UMAX:                   /// Unsigned maximum
-        BBLV.markConstantRange(TrueCR.umax(FalseCR));
-        return true;
-      };
+      ConstantRange ResultCR = [&]() {
+        switch (SPR.Flavor) {
+        default:
+          llvm_unreachable("unexpected minmax type!");
+        case SPF_SMIN:                   /// Signed minimum
+          return TrueCR.smin(FalseCR);
+        case SPF_UMIN:                   /// Unsigned minimum
+          return TrueCR.umin(FalseCR);
+        case SPF_SMAX:                   /// Signed maximum
+          return TrueCR.smax(FalseCR);
+        case SPF_UMAX:                   /// Unsigned maximum
+          return TrueCR.umax(FalseCR);
+        };
+      }();
+      BBLV = LVILatticeVal::getRange(ResultCR);
+      return true;
     }
 
     // TODO: ABS, NABS from the SelectPatternResult
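
The minmax rewrite above folds a switch that previously returned out of four separate cases into an immediately-invoked lambda, so the resulting range can be computed in a single initialization. The same idiom in isolation (illustrative only):

  #include <algorithm>
  #include <cstdint>

  static int64_t pickBound(int Flavor, int64_t A, int64_t B) {
    const int64_t Result = [&] {
      switch (Flavor) {
      case 0: return std::min(A, B); // "minimum" flavor
      case 1: return std::max(A, B); // "maximum" flavor
      default: return A;
      }
    }();
    return Result;
  }
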
@@ -1113,7 +1075,7 @@
   if (!BBI->getOperand(0)->getType()->isSized()) {
     // Without knowing how wide the input is, we can't analyze it in any useful
     // way.
-    BBLV.markOverdefined();
+    BBLV = LVILatticeVal::getOverdefined();
     return true;
   }
 
@@ -1130,7 +1092,7 @@
     // Unhandled instructions are overdefined.
     DEBUG(dbgs() << " compute BB '" << BB->getName()
                  << "' - overdefined (unknown cast).\n");
-    BBLV.markOverdefined();
+    BBLV = LVILatticeVal::getOverdefined();
     return true;
   }
 
@@ -1159,10 +1121,8 @@
   // NOTE: We're currently limited by the set of operations that ConstantRange
   // can evaluate symbolically.  Enhancing that set will allow us to analyze
   // more definitions.
-  LVILatticeVal Result;
   auto CastOp = (Instruction::CastOps) BBI->getOpcode();
-  Result.markConstantRange(LHSRange.castOp(CastOp, ResultBitWidth));
-  BBLV = Result;
+  BBLV = LVILatticeVal::getRange(LHSRange.castOp(CastOp, ResultBitWidth));
   return true;
 }
 
@@ -1191,7 +1151,7 @@
     // Unhandled instructions are overdefined.
     DEBUG(dbgs() << " compute BB '" << BB->getName()
                  << "' - overdefined (unknown binary operator).\n");
-    BBLV.markOverdefined();
+    BBLV = LVILatticeVal::getOverdefined();
     return true;
   };
 
@@ -1220,10 +1180,8 @@
   // NOTE: We're currently limited by the set of operations that ConstantRange
   // can evaluate symbolically.  Enhancing that set will allow us to analyze
   // more definitions.
-  LVILatticeVal Result;
   auto BinOp = (Instruction::BinaryOps) BBI->getOpcode();
-  Result.markConstantRange(LHSRange.binaryOp(BinOp, RHSRange));
-  BBLV = Result;
+  BBLV = LVILatticeVal::getRange(LHSRange.binaryOp(BinOp, RHSRange));
   return true;
 }
 
@@ -1405,7 +1363,7 @@
   if (!getEdgeValueLocal(Val, BBFrom, BBTo, LocalResult))
     // If we couldn't constrain the value on the edge, LocalResult doesn't
     // provide any information.
-    LocalResult.markOverdefined();
+    LocalResult = LVILatticeVal::getOverdefined();
 
   if (hasSingleValue(LocalResult)) {
     // Can't get any more precise here
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 01a2f46..2f3dca3 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1870,18 +1870,24 @@
   Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue;
   const SCEV *Sc = SE->getSCEV(Ptr);
 
+  unsigned AS = Ptr->getType()->getPointerAddressSpace();
+  LLVMContext &Ctx = Loc->getContext();
+
+  // Use this type for pointer arithmetic.
+  Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+
   if (SE->isLoopInvariant(Sc, TheLoop)) {
     DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr
                  << "\n");
-    return {Ptr, Ptr};
+    // Ptr could be in the loop body. If so, expand a new one at the correct
+    // location.
+    Instruction *Inst = dyn_cast<Instruction>(Ptr);
+    Value *NewPtr = (Inst && TheLoop->contains(Inst))
+                        ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
+                        : Ptr;
+    return {NewPtr, NewPtr};
   } else {
-    unsigned AS = Ptr->getType()->getPointerAddressSpace();
-    LLVMContext &Ctx = Loc->getContext();
-
-    // Use this type for pointer arithmetic.
-    Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
     Value *Start = nullptr, *End = nullptr;
-
     DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
     Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
     End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index e7220f8..950a2fb 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -3345,11 +3345,11 @@
   if (const Argument *A = dyn_cast<Argument>(V))
     return A->hasByValOrInAllocaAttr() || A->hasNonNullAttr();
 
-  // A global variable in address space 0 is non null unless extern weak.
-  // Other address spaces may have null as a valid address for a global,
-  // so we can't assume anything.
+  // A global variable in address space 0 is non null unless extern weak
+  // or an absolute symbol reference. Other address spaces may have null as a
+  // valid address for a global, so we can't assume anything.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
-    return !GV->hasExternalWeakLinkage() &&
+    return !GV->isAbsoluteSymbolRef() && !GV->hasExternalWeakLinkage() &&
            GV->getType()->getAddressSpace() == 0;
 
   // A Load tagged with nonnull metadata is never null.
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 383a8f5..635e960 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3000,7 +3000,14 @@
       if (Record.size() < 1)
         return error("Invalid record");
 
-      IsDistinct = Record[0];
+      IsDistinct = Record[0] & 1;
+      bool HasOpFragment = Record[0] & 2;
+      auto Elts = MutableArrayRef<uint64_t>(Record).slice(1);
+      if (!HasOpFragment)
+        if (unsigned N = Elts.size())
+          if (N >= 3 && Elts[N - 3] == dwarf::DW_OP_bit_piece)
+            Elts[N-3] = dwarf::DW_OP_LLVM_fragment;
+
       MetadataList.assignValue(
           GET_OR_DISTINCT(DIExpression,
                           (Context, makeArrayRef(Record).slice(1))),
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index c324100..cff2fd0 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1727,7 +1727,8 @@
                                             unsigned Abbrev) {
   Record.reserve(N->getElements().size() + 1);
 
-  Record.push_back(N->isDistinct());
+  const uint64_t HasOpFragmentFlag = 1 << 1;
+  Record.push_back((uint64_t)N->isDistinct() | HasOpFragmentFlag);
   Record.append(N->elements_begin(), N->elements_end());
 
   Stream.EmitRecord(bitc::METADATA_EXPRESSION, Record, Abbrev);
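
Taken together, the reader and writer hunks above turn the leading record word of a METADATA_EXPRESSION into a small flag word: bit 0 keeps the existing distinct flag, and bit 1 records that the expression was written with DW_OP_LLVM_fragment, so old bitcode (bit 1 clear) gets its trailing DW_OP_bit_piece rewritten on load. The layout, spelled out as a standalone sketch:

  #include <cstdint>

  struct DIExpressionRecordFlags {
    bool IsDistinct;    // bit 0, as before
    bool HasOpFragment; // bit 1, new with this record version
  };

  static uint64_t encodeFlags(DIExpressionRecordFlags F) {
    return (uint64_t)F.IsDistinct | ((uint64_t)F.HasOpFragment << 1);
  }

  static DIExpressionRecordFlags decodeFlags(uint64_t Word) {
    return {(Word & 1) != 0, (Word & 2) != 0};
  }
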
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index eb70879..b43d739 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -713,9 +713,9 @@
   OS << V->getName();
 
   const DIExpression *Expr = MI->getDebugExpression();
-  if (Expr->isBitPiece())
-    OS << " [bit_piece offset=" << Expr->getBitPieceOffset()
-       << " size=" << Expr->getBitPieceSize() << "]";
+  if (Expr->isFragment())
+    OS << " [fragment offset=" << Expr->getFragmentOffsetInBits()
+       << " size=" << Expr->getFragmentSizeInBits() << "]";
   OS << " <- ";
 
   // The second operand is only an offset if it's an immediate.
@@ -724,7 +724,7 @@
 
   for (unsigned i = 0; i < Expr->getNumElements(); ++i) {
     uint64_t Op = Expr->getElement(i);
-    if (Op == dwarf::DW_OP_bit_piece) {
+    if (Op == dwarf::DW_OP_LLVM_fragment) {
       // There can't be any operands after this in a valid expression
       break;
     } else if (Deref) {
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 7efe74f..c104c3c 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -191,8 +191,8 @@
                          "nop (could not find a dwarf register number)");
 
     // Attempt to find a valid super- or sub-register.
-    if (!Expr.AddMachineRegPiece(*MF->getSubtarget().getRegisterInfo(),
-                                 MLoc.getReg()))
+    if (!Expr.AddMachineRegFragment(*MF->getSubtarget().getRegisterInfo(),
+                                    MLoc.getReg()))
       Expr.EmitOp(dwarf::DW_OP_nop,
                   "nop (could not find a dwarf register number)");
     return;
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 21b78ff..8e17032 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -943,10 +943,10 @@
       bool IsSubfield = false;
       unsigned StructOffset = 0;
 
-      // Handle bitpieces.
-      if (DIExpr && DIExpr->isBitPiece()) {
+      // Handle fragments.
+      if (DIExpr && DIExpr->isFragment()) {
         IsSubfield = true;
-        StructOffset = DIExpr->getBitPieceOffset() / 8;
+        StructOffset = DIExpr->getFragmentOffsetInBits() / 8;
       } else if (DIExpr && DIExpr->getNumElements() > 0) {
         continue; // Ignore unrecognized exprs.
       }
@@ -985,7 +985,8 @@
         // This range is valid until the next overlapping bitpiece. In the
         // common case, ranges will not be bitpieces, so they will overlap.
         auto J = std::next(I);
-        while (J != E && !piecesOverlap(DIExpr, J->first->getDebugExpression()))
+        while (J != E &&
+               !fragmentsOverlap(DIExpr, J->first->getDebugExpression()))
           ++J;
         if (J != E)
           End = getLabelBeforeInsn(J->first);
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 06023fc..3fbb52f 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -108,6 +108,51 @@
 LLVM_DUMP_METHOD
 void DIEAbbrev::dump() { print(dbgs()); }
 
+//===----------------------------------------------------------------------===//
+// DIEAbbrevSet Implementation
+//===----------------------------------------------------------------------===//
+
+DIEAbbrevSet::~DIEAbbrevSet() {
+  for (DIEAbbrev *Abbrev : Abbreviations)
+    Abbrev->~DIEAbbrev();
+}
+
+DIEAbbrev &DIEAbbrevSet::uniqueAbbreviation(DIE &Die) {
+
+  FoldingSetNodeID ID;
+  DIEAbbrev Abbrev = Die.generateAbbrev();
+  Abbrev.Profile(ID);
+
+  void *InsertPos;
+  if (DIEAbbrev *Existing =
+          AbbreviationsSet.FindNodeOrInsertPos(ID, InsertPos)) {
+    Die.setAbbrevNumber(Existing->getNumber());
+    return *Existing;
+  }
+
+  // Move the abbreviation to the heap and assign a number.
+  DIEAbbrev *New = new (Alloc) DIEAbbrev(std::move(Abbrev));
+  Abbreviations.push_back(New);
+  New->setNumber(Abbreviations.size());
+  Die.setAbbrevNumber(Abbreviations.size());
+
+  // Store it for lookup.
+  AbbreviationsSet.InsertNode(New, InsertPos);
+  return *New;
+}
+
+void DIEAbbrevSet::Emit(const AsmPrinter *AP, MCSection *Section) const {
+  if (!Abbreviations.empty()) {
+    // Start the debug abbrev section.
+    AP->OutStreamer->SwitchSection(Section);
+    AP->emitDwarfAbbrevs(Abbreviations);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// DIE Implementation
+//===----------------------------------------------------------------------===//
+
 DIE *DIE::getParent() const {
   return Owner.dyn_cast<DIE*>();
 }
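
For reference, the new DIEAbbrevSet above centralizes the abbreviation uniquing that DwarfFile used to do itself: profile the abbreviation, look it up, and only heap-allocate and number it on first use. A minimal standalone sketch of the same uniquing pattern (plain C++ with a hash map; `Abbrev`/`AbbrevSet` are illustrative names, not the LLVM types):

```cpp
#include <cstdio>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for DIEAbbrev: the key is whatever uniquely identifies
// the abbreviation (tag plus attribute/form pairs in real DWARF).
struct Abbrev {
  std::string Key;     // profile of the abbreviation
  unsigned Number = 0; // 1-based abbreviation code
};

class AbbrevSet {
  std::vector<std::unique_ptr<Abbrev>> Abbrevs;       // owned, in emission order
  std::unordered_map<std::string, Abbrev *> Uniquing; // profile -> canonical

public:
  // Returns the canonical abbreviation for Key, creating and numbering it on
  // first use -- the same idea as DIEAbbrevSet::uniqueAbbreviation().
  Abbrev &unique(const std::string &Key) {
    auto It = Uniquing.find(Key);
    if (It != Uniquing.end())
      return *It->second;
    auto New = std::make_unique<Abbrev>();
    New->Key = Key;
    New->Number = static_cast<unsigned>(Abbrevs.size()) + 1;
    Abbrev *Ptr = New.get();
    Abbrevs.push_back(std::move(New));
    Uniquing.emplace(Key, Ptr);
    return *Ptr;
  }

  void emit() const {
    for (const auto &A : Abbrevs)
      std::printf("abbrev %u: %s\n", A->Number, A->Key.c_str());
  }
};

int main() {
  AbbrevSet Set;
  // Two DIEs with the same shape share one abbreviation number.
  std::printf("%u\n", Set.unique("subprogram:name,low_pc").Number); // 1
  std::printf("%u\n", Set.unique("variable:name,location").Number); // 2
  std::printf("%u\n", Set.unique("subprogram:name,low_pc").Number); // 1 again
  Set.emit();
  return 0;
}
```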
@@ -198,6 +243,45 @@
   print(dbgs());
 }
 
+unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP,
+                                       DIEAbbrevSet &AbbrevSet,
+                                       unsigned CUOffset) {
+  // Unique the abbreviation and fill in the abbreviation number so this DIE
+  // can be emitted.
+  const DIEAbbrev &Abbrev = AbbrevSet.uniqueAbbreviation(*this);
+
+  // Set compile/type unit relative offset of this DIE.
+  setOffset(CUOffset);
+
+  // Add the byte size of the abbreviation code.
+  CUOffset += getULEB128Size(getAbbrevNumber());
+
+  // Add the byte size of all the DIE attribute values.
+  for (const auto &V : values())
+    CUOffset += V.SizeOf(AP);
+
+  // Let the children compute their offsets and abbreviation numbers.
+  if (hasChildren()) {
+    (void)Abbrev;
+    assert(Abbrev.hasChildren() && "Children flag not set");
+
+    for (auto &Child : children())
+      CUOffset = Child.computeOffsetsAndAbbrevs(AP, AbbrevSet, CUOffset);
+
+    // Each child chain is terminated with a zero byte, adjust the offset.
+    CUOffset += sizeof(int8_t);
+  }
+
+  // Compute the byte size of this DIE and all of its children correctly. This
+  // is needed so that top level DIE can help the compile unit set its length
+  // correctly.
+  setSize(CUOffset - getOffset());
+  return CUOffset;
+}
+
+//===----------------------------------------------------------------------===//
+// DIEUnit Implementation
+//===----------------------------------------------------------------------===//
 DIEUnit::DIEUnit(uint16_t V, uint8_t A, dwarf::Tag UnitTag)
     : Die(UnitTag), Section(nullptr), Offset(0), Length(0), Version(V),
       AddrSize(A)
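
DIE::computeOffsetsAndAbbrevs above lays out a DIE subtree in one pass: each node's unit-relative offset is the running offset, its size covers the abbreviation code, the attribute payload, and its children, and every sibling chain ends with a one-byte terminator. A simplified sketch of that accounting (toy types, a fixed one-byte abbreviation code assumed instead of a ULEB128):

```cpp
#include <cstdio>
#include <vector>

// Illustrative stand-in for a DIE: each node contributes an abbreviation code
// plus a payload, and a child list ends with a one-byte terminator.
struct Node {
  unsigned PayloadSize = 0;
  std::vector<Node> Children;
  unsigned Offset = 0; // filled in by layout()
  unsigned Size = 0;   // filled in by layout()
};

// Mirrors the shape of DIE::computeOffsetsAndAbbrevs: returns the offset just
// past this node and all of its children.
unsigned layout(Node &N, unsigned CUOffset) {
  N.Offset = CUOffset;
  CUOffset += 1;             // abbreviation code (assume one byte here)
  CUOffset += N.PayloadSize; // attribute values
  if (!N.Children.empty()) {
    for (Node &Child : N.Children)
      CUOffset = layout(Child, CUOffset);
    CUOffset += 1;           // end-of-children marker
  }
  N.Size = CUOffset - N.Offset;
  return CUOffset;
}

int main() {
  Node Root;
  Root.PayloadSize = 4;
  Root.Children.resize(2);
  Root.Children[0].PayloadSize = 2;
  Root.Children[1].PayloadSize = 3;
  unsigned End = layout(Root, /*CUOffset=*/11); // e.g. just past the unit header
  std::printf("root at %u, size %u, next free offset %u\n",
              Root.Offset, Root.Size, End);
  return 0;
}
```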
@@ -257,38 +341,65 @@
 /// EmitValue - Emit integer of appropriate size.
 ///
 void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
-  unsigned Size = ~0U;
   switch (Form) {
   case dwarf::DW_FORM_flag_present:
     // Emit something to keep the lines and comments in sync.
     // FIXME: Is there a better way to do this?
     Asm->OutStreamer->AddBlankLine();
     return;
-  case dwarf::DW_FORM_flag:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_ref1:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data1: Size = 1; break;
-  case dwarf::DW_FORM_ref2:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data2: Size = 2; break;
-  case dwarf::DW_FORM_sec_offset: LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_strp:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_ref4:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data4: Size = 4; break;
-  case dwarf::DW_FORM_ref8:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_ref_sig8:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data8: Size = 8; break;
-  case dwarf::DW_FORM_GNU_str_index: Asm->EmitULEB128(Integer); return;
-  case dwarf::DW_FORM_GNU_addr_index: Asm->EmitULEB128(Integer); return;
-  case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return;
-  case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return;
+  case dwarf::DW_FORM_flag:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref1:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_data1:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref2:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_data2:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_strp:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref4:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_data4:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref8:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref_sig8:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_data8:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_GNU_ref_alt:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_GNU_strp_alt:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_line_strp:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_sec_offset:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_strp_sup:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref_sup:
+    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_addr:
-    Size = Asm->getPointerSize();
-    break;
+    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref_addr:
-    Size = SizeOf(Asm, dwarf::DW_FORM_ref_addr);
-    break;
+    Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form));
+    return;
+  case dwarf::DW_FORM_GNU_str_index:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_GNU_addr_index:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref_udata:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_udata:
+    Asm->EmitULEB128(Integer);
+    return;
+  case dwarf::DW_FORM_sdata:
+    Asm->EmitSLEB128(Integer);
+    return;
   default: llvm_unreachable("DIE Value form not supported yet");
   }
-  Asm->OutStreamer->EmitIntValue(Integer, Size);
 }
 
 /// SizeOf - Determine size of integer value in bytes.
@@ -301,23 +412,47 @@
   case dwarf::DW_FORM_data1: return sizeof(int8_t);
   case dwarf::DW_FORM_ref2:  LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_data2: return sizeof(int16_t);
-  case dwarf::DW_FORM_sec_offset: LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_strp:  LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref4:  LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_data4: return sizeof(int32_t);
   case dwarf::DW_FORM_ref8:  LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref_sig8:  LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_data8: return sizeof(int64_t);
-  case dwarf::DW_FORM_GNU_str_index: return getULEB128Size(Integer);
-  case dwarf::DW_FORM_GNU_addr_index: return getULEB128Size(Integer);
-  case dwarf::DW_FORM_udata: return getULEB128Size(Integer);
-  case dwarf::DW_FORM_sdata: return getSLEB128Size(Integer);
-  case dwarf::DW_FORM_addr:
-    return AP->getPointerSize();
   case dwarf::DW_FORM_ref_addr:
     if (AP->getDwarfVersion() == 2)
       return AP->getPointerSize();
-    return sizeof(int32_t);
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_strp:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_GNU_ref_alt:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_GNU_strp_alt:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_line_strp:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_sec_offset:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_strp_sup:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref_sup:
+    switch (AP->OutStreamer->getContext().getDwarfFormat()) {
+    case dwarf::DWARF32:
+      return 4;
+    case dwarf::DWARF64:
+      return 8;
+    }
+    llvm_unreachable("Invalid DWARF format");
+  case dwarf::DW_FORM_GNU_str_index:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_GNU_addr_index:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref_udata:
+    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_udata:
+    return getULEB128Size(Integer);
+  case dwarf::DW_FORM_sdata:
+    return getSLEB128Size(Integer);
+  case dwarf::DW_FORM_addr:
+    return AP->getPointerSize();
   default: llvm_unreachable("DIE Value form not supported yet");
   }
 }
@@ -452,6 +587,29 @@
 }
 
 //===----------------------------------------------------------------------===//
+// DIEInlineString Implementation
+//===----------------------------------------------------------------------===//
+void DIEInlineString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+  if (Form == dwarf::DW_FORM_string) {
+    for (char ch : S)
+      AP->EmitInt8(ch);
+    AP->EmitInt8(0);
+    return;
+  }
+  llvm_unreachable("Expected valid string form");
+}
+
+unsigned DIEInlineString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+  // Size of the string bytes plus the null terminator.
+  return S.size() + 1;
+}
+
+LLVM_DUMP_METHOD
+void DIEInlineString::print(raw_ostream &O) const {
+  O << "InlineString: " << S.c_str();
+}
+
+//===----------------------------------------------------------------------===//
 // DIEEntry Implementation
 //===----------------------------------------------------------------------===//
 
@@ -459,33 +617,69 @@
 ///
 void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
 
-  if (Form == dwarf::DW_FORM_ref_addr) {
+  switch (Form) {
+  case dwarf::DW_FORM_ref1:
+  case dwarf::DW_FORM_ref2:
+  case dwarf::DW_FORM_ref4:
+  case dwarf::DW_FORM_ref8:
+    AP->OutStreamer->EmitIntValue(Entry->getOffset(), SizeOf(AP, Form));
+    return;
+
+  case dwarf::DW_FORM_ref_udata:
+    AP->EmitULEB128(Entry->getOffset());
+    return;
+
+  case dwarf::DW_FORM_ref_addr: {
     // Get the absolute offset for this DIE within the debug info/types section.
     unsigned Addr = Entry->getDebugSectionOffset();
     if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) {
       const DwarfDebug *DD = AP->getDwarfDebug();
       if (DD)
-        assert(!DD->useSplitDwarf() && "TODO: dwo files can't have relocations.");
+        assert(!DD->useSplitDwarf() &&
+               "TODO: dwo files can't have relocations.");
       const DIEUnit *Unit = Entry->getUnit();
       assert(Unit && "CUDie should belong to a CU.");
       MCSection *Section = Unit->getSection();
-      assert(Section && "Must have a section if we are doing relocations");
-      const MCSymbol *SectionSym = Section->getBeginSymbol();
-      AP->EmitLabelPlusOffset(SectionSym, Addr, DIEEntry::getRefAddrSize(AP));
-    } else
-      AP->OutStreamer->EmitIntValue(Addr, DIEEntry::getRefAddrSize(AP));
-  } else
-    AP->EmitInt32(Entry->getOffset());
+      if (Section) {
+        const MCSymbol *SectionSym = Section->getBeginSymbol();
+        AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form));
+        return;
+      }
+    }
+    AP->OutStreamer->EmitIntValue(Addr, SizeOf(AP, Form));
+    return;
+  }
+  default:
+    llvm_unreachable("Improper form for DIE reference");
+  }
 }
 
-unsigned DIEEntry::getRefAddrSize(const AsmPrinter *AP) {
-  // DWARF4: References that use the attribute form DW_FORM_ref_addr are
-  // specified to be four bytes in the DWARF 32-bit format and eight bytes
-  // in the DWARF 64-bit format, while DWARF Version 2 specifies that such
-  // references have the same size as an address on the target system.
-  if (AP->getDwarfVersion() == 2)
-    return AP->getPointerSize();
-  return sizeof(int32_t);
+unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+  switch (Form) {
+  case dwarf::DW_FORM_ref1:
+    return 1;
+  case dwarf::DW_FORM_ref2:
+    return 2;
+  case dwarf::DW_FORM_ref4:
+    return 4;
+  case dwarf::DW_FORM_ref8:
+    return 8;
+  case dwarf::DW_FORM_ref_udata:
+    return getULEB128Size(Entry->getOffset());
+  case dwarf::DW_FORM_ref_addr:
+    if (AP->getDwarfVersion() == 2)
+      return AP->getPointerSize();
+    switch (AP->OutStreamer->getContext().getDwarfFormat()) {
+    case dwarf::DWARF32:
+      return 4;
+    case dwarf::DWARF64:
+      return 8;
+    }
+    llvm_unreachable("Invalid DWARF format");
+
+  default:
+    llvm_unreachable("Improper form for DIE reference");
+  }
 }
 
 LLVM_DUMP_METHOD
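
DIEEntry::SizeOf above replaces the old getRefAddrSize helper: fixed widths for DW_FORM_ref1 through ref8, a ULEB128 for DW_FORM_ref_udata, and for DW_FORM_ref_addr either the pointer size (DWARF v2) or 4/8 bytes depending on the 32/64-bit DWARF format. A free-standing sketch of that decision table (local enums, not the llvm::dwarf constants):

```cpp
#include <cstdio>

enum class RefForm { Ref1, Ref2, Ref4, Ref8, RefAddr };
enum class DwarfFormat { Dwarf32, Dwarf64 };

// Mirrors the sizing rules DIEEntry::SizeOf applies to reference forms.
unsigned refSize(RefForm Form, unsigned DwarfVersion, DwarfFormat Format,
                 unsigned PointerSize) {
  switch (Form) {
  case RefForm::Ref1: return 1;
  case RefForm::Ref2: return 2;
  case RefForm::Ref4: return 4;
  case RefForm::Ref8: return 8;
  case RefForm::RefAddr:
    if (DwarfVersion == 2)                          // DWARF v2: address-sized
      return PointerSize;
    return Format == DwarfFormat::Dwarf32 ? 4 : 8;  // DWARF32 vs DWARF64
  }
  return 0; // unreachable for the forms modeled here
}

int main() {
  std::printf("%u\n", refSize(RefForm::RefAddr, 2, DwarfFormat::Dwarf32, 8)); // 8
  std::printf("%u\n", refSize(RefForm::RefAddr, 4, DwarfFormat::Dwarf32, 8)); // 4
  std::printf("%u\n", refSize(RefForm::RefAddr, 4, DwarfFormat::Dwarf64, 8)); // 8
  return 0;
}
```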
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
index 74c47d1..d8ecc7c 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -330,6 +330,12 @@
     addULEB128(dwarf::DW_FORM_string);
     addString(Value.getDIEString().getString());
     break;
+  case DIEValue::isInlineString:
+    addULEB128('A');
+    addULEB128(Attribute);
+    addULEB128(dwarf::DW_FORM_string);
+    addString(Value.getDIEInlineString().getString());
+    break;
   case DIEValue::isBlock:
   case DIEValue::isLoc:
   case DIEValue::isLocList:
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index d30f106..ce57f17 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -63,14 +63,12 @@
   return LabelsAfterInsn.lookup(MI);
 }
 
-// Determine the relative position of the pieces described by P1 and P2.
-// Returns  -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
-// 1 if P1 is entirely after P2.
-int DebugHandlerBase::pieceCmp(const DIExpression *P1, const DIExpression *P2) {
-  unsigned l1 = P1->getBitPieceOffset();
-  unsigned l2 = P2->getBitPieceOffset();
-  unsigned r1 = l1 + P1->getBitPieceSize();
-  unsigned r2 = l2 + P2->getBitPieceSize();
+int DebugHandlerBase::fragmentCmp(const DIExpression *P1,
+                                  const DIExpression *P2) {
+  unsigned l1 = P1->getFragmentOffsetInBits();
+  unsigned l2 = P2->getFragmentOffsetInBits();
+  unsigned r1 = l1 + P1->getFragmentSizeInBits();
+  unsigned r2 = l2 + P2->getFragmentSizeInBits();
   if (r1 <= l2)
     return -1;
   else if (r2 <= l1)
@@ -79,11 +77,11 @@
     return 0;
 }
 
-/// Determine whether two variable pieces overlap.
-bool DebugHandlerBase::piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
-  if (!P1->isBitPiece() || !P2->isBitPiece())
+bool DebugHandlerBase::fragmentsOverlap(const DIExpression *P1,
+                                        const DIExpression *P2) {
+  if (!P1->isFragment() || !P2->isFragment())
     return true;
-  return pieceCmp(P1, P2) == 0;
+  return fragmentCmp(P1, P2) == 0;
 }
 
 /// If this type is derived from a base type then return base type size.
@@ -142,14 +140,15 @@
     if (DIVar->isParameter() &&
         getDISubprogram(DIVar->getScope())->describes(MF->getFunction())) {
       LabelsBeforeInsn[Ranges.front().first] = Asm->getFunctionBegin();
-      if (Ranges.front().first->getDebugExpression()->isBitPiece()) {
-        // Mark all non-overlapping initial pieces.
+      if (Ranges.front().first->getDebugExpression()->isFragment()) {
+        // Mark all non-overlapping initial fragments.
         for (auto I = Ranges.begin(); I != Ranges.end(); ++I) {
-          const DIExpression *Piece = I->first->getDebugExpression();
+          const DIExpression *Fragment = I->first->getDebugExpression();
           if (std::all_of(Ranges.begin(), I,
                           [&](DbgValueHistoryMap::InstrRange Pred) {
-                return !piecesOverlap(Piece, Pred.first->getDebugExpression());
-              }))
+                            return !fragmentsOverlap(
+                                Fragment, Pred.first->getDebugExpression());
+                          }))
             LabelsBeforeInsn[I->first] = Asm->getFunctionBegin();
           else
             break;
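
fragmentCmp above treats each fragment as a half-open bit interval [offset, offset + size) and reports before/after/overlap; fragmentsOverlap conservatively assumes overlap whenever either expression is not a fragment at all. A standalone sketch of the interval logic (plain structs, not the DIExpression API):

```cpp
#include <cassert>
#include <optional>

// A variable fragment as a bit range within the source variable.
struct Fragment {
  unsigned OffsetInBits;
  unsigned SizeInBits;
};

// -1: A entirely before B, 1: A entirely after B, 0: the ranges overlap.
int fragmentCmp(const Fragment &A, const Fragment &B) {
  unsigned l1 = A.OffsetInBits, r1 = l1 + A.SizeInBits;
  unsigned l2 = B.OffsetInBits, r2 = l2 + B.SizeInBits;
  if (r1 <= l2) return -1;
  if (r2 <= l1) return 1;
  return 0;
}

// A non-fragment expression describes the whole variable, so it is assumed
// to overlap anything -- the same conservative rule as the patch.
bool fragmentsOverlap(const std::optional<Fragment> &A,
                      const std::optional<Fragment> &B) {
  if (!A || !B)
    return true;
  return fragmentCmp(*A, *B) == 0;
}

int main() {
  Fragment Lo{0, 32}, Hi{32, 32}, Whole{0, 64};
  assert(fragmentCmp(Lo, Hi) == -1);          // disjoint, Lo first
  assert(fragmentCmp(Whole, Hi) == 0);        // Whole covers Hi
  assert(fragmentsOverlap(Lo, std::nullopt)); // non-fragment: assume overlap
  return 0;
}
```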
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
index b8bbcec..7219b05 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -92,13 +92,13 @@
   /// Return Label immediately following the instruction.
   MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
 
-  /// Determine the relative position of the pieces described by P1 and P2.
-  /// Returns  -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
-  /// 1 if P1 is entirely after P2.
-  static int pieceCmp(const DIExpression *P1, const DIExpression *P2);
+  /// Determine the relative position of the fragments described by P1 and P2.
+  /// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap, 1 if P1 is
+  /// entirely after P2.
+  static int fragmentCmp(const DIExpression *P1, const DIExpression *P2);
 
-  /// Determine whether two variable pieces overlap.
-  static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2);
+  /// Determine whether two variable fragments overlap.
+  static bool fragmentsOverlap(const DIExpression *P1, const DIExpression *P2);
 
   /// If this type is derived from a base type then return base type size.
   static uint64_t getBaseTypeSize(const DITypeRef TyRef);
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index dd12c32..9444fad 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -72,7 +72,7 @@
     const ConstantFP *getConstantFP() const { return Constant.CFP; }
     const ConstantInt *getConstantInt() const { return Constant.CIP; }
     MachineLocation getLoc() const { return Loc; }
-    bool isBitPiece() const { return getExpression()->isBitPiece(); }
+    bool isFragment() const { return getExpression()->isFragment(); }
     const DIExpression *getExpression() const { return Expression; }
     friend bool operator==(const Value &, const Value &);
     friend bool operator<(const Value &, const Value &);
@@ -129,7 +129,7 @@
     Values.append(Vals.begin(), Vals.end());
     sortUniqueValues();
     assert(all_of(Values, [](DebugLocEntry::Value V) {
-          return V.isBitPiece();
+          return V.isFragment();
         }) && "value must be a piece");
   }
 
@@ -172,11 +172,11 @@
   llvm_unreachable("unhandled EntryKind");
 }
 
-/// \brief Compare two pieces based on their offset.
+/// Compare two fragments based on their offset.
 inline bool operator<(const DebugLocEntry::Value &A,
                       const DebugLocEntry::Value &B) {
-  return A.getExpression()->getBitPieceOffset() <
-         B.getExpression()->getBitPieceOffset();
+  return A.getExpression()->getFragmentOffsetInBits() <
+         B.getExpression()->getFragmentOffsetInBits();
 }
 
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 84981ac..c615cea 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -730,7 +730,7 @@
 
   bool validReg;
   if (Location.isReg())
-    validReg = addRegisterOpPiece(*Loc, Location.getReg());
+    validReg = addRegisterFragment(*Loc, Location.getReg());
   else
     validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index acb8ce0..91b30ba 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -776,7 +776,7 @@
   llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
 }
 
-/// \brief If this and Next are describing different pieces of the same
+/// \brief If this and Next are describing different fragments of the same
 /// variable, merge them by appending Next's values to the current
 /// list of values.
 /// Return true if the merge was successful.
@@ -784,15 +784,15 @@
   if (Begin == Next.Begin) {
     auto *FirstExpr = cast<DIExpression>(Values[0].Expression);
     auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression);
-    if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece())
+    if (!FirstExpr->isFragment() || !FirstNextExpr->isFragment())
       return false;
 
-    // We can only merge entries if none of the pieces overlap any others.
+    // We can only merge entries if none of the fragments overlap any others.
     // In doing so, we can take advantage of the fact that both lists are
     // sorted.
     for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
       for (; j < Next.Values.size(); ++j) {
-        int res = DebugHandlerBase::pieceCmp(
+        int res = DebugHandlerBase::fragmentCmp(
             cast<DIExpression>(Values[i].Expression),
             cast<DIExpression>(Next.Values[j].Expression));
         if (res == 0) // The two expressions overlap, we can't merge.
@@ -815,27 +815,27 @@
 
 /// Build the location list for all DBG_VALUEs in the function that
 /// describe the same variable.  If the ranges of several independent
-/// pieces of the same variable overlap partially, split them up and
+/// fragments of the same variable overlap partially, split them up and
 /// combine the ranges. The resulting DebugLocEntries will have
 /// strictly monotonically increasing begin addresses and will never
 /// overlap.
 //
 // Input:
 //
-//   Ranges History [var, loc, piece ofs size]
-// 0 |      [x, (reg0, piece 0, 32)]
-// 1 | |    [x, (reg1, piece 32, 32)] <- IsPieceOfPrevEntry
+//   Ranges History [var, loc, fragment ofs size]
+// 0 |      [x, (reg0, fragment 0, 32)]
+// 1 | |    [x, (reg1, fragment 32, 32)] <- IsFragmentOfPrevEntry
 // 2 | |    ...
 // 3   |    [clobber reg0]
-// 4        [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of
+// 4        [x, (mem, fragment 0, 64)] <- overlapping with both previous
+//                                        fragments of x.
 //
 // Output:
 //
-// [0-1]    [x, (reg0, piece  0, 32)]
-// [1-3]    [x, (reg0, piece  0, 32), (reg1, piece 32, 32)]
-// [3-4]    [x, (reg1, piece 32, 32)]
-// [4- ]    [x, (mem,  piece  0, 64)]
+// [0-1]    [x, (reg0, fragment  0, 32)]
+// [1-3]    [x, (reg0, fragment  0, 32), (reg1, fragment 32, 32)]
+// [3-4]    [x, (reg1, fragment 32, 32)]
+// [4- ]    [x, (mem,  fragment  0, 64)]
 void
 DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
                               const DbgValueHistoryMap::InstrRanges &Ranges) {
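
The Input/Output table above describes the sweep buildLocationList performs: each new fragment closes any open fragment it overlaps, and the surviving open fragments are carried into the next DebugLocEntry. A toy sketch of that sweep (clobber handling and label bookkeeping omitted; these are not the DwarfDebug data structures):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// One open piece of a variable: its bit range within the variable and how it
// is currently located (register, memory, ...).
struct OpenFragment {
  unsigned OffsetInBits, SizeInBits;
  std::string Loc;
};

static bool overlap(const OpenFragment &A, const OpenFragment &B) {
  return A.OffsetInBits < B.OffsetInBits + B.SizeInBits &&
         B.OffsetInBits < A.OffsetInBits + A.SizeInBits;
}

int main() {
  // DBG_VALUEs in program order, as in the Input table: reg0 holds bits
  // [0,32), then reg1 holds [32,64), then memory holds the whole 64 bits.
  std::vector<OpenFragment> Events = {
      {0, 32, "reg0"}, {32, 32, "reg1"}, {0, 64, "mem"}};

  std::vector<OpenFragment> Open;
  for (unsigned i = 0; i < Events.size(); ++i) {
    const OpenFragment &New = Events[i];
    // Close every open fragment the new one overlaps, keep the rest.
    std::vector<OpenFragment> Still;
    for (const OpenFragment &O : Open)
      if (!overlap(O, New))
        Still.push_back(O);
    Still.push_back(New);
    Open = Still;

    std::printf("entry %u:", i);
    for (const OpenFragment &O : Open)
      std::printf(" (%s, fragment %u, %u)", O.Loc.c_str(), O.OffsetInBits,
                  O.SizeInBits);
    std::printf("\n");
  }
  return 0;
}
```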
@@ -853,10 +853,10 @@
       continue;
     }
 
-    // If this piece overlaps with any open ranges, truncate them.
+    // If this fragment overlaps with any open ranges, truncate them.
     const DIExpression *DIExpr = Begin->getDebugExpression();
     auto Last = remove_if(OpenRanges, [&](DebugLocEntry::Value R) {
-      return piecesOverlap(DIExpr, R.getExpression());
+      return fragmentsOverlap(DIExpr, R.getExpression());
     });
     OpenRanges.erase(Last, OpenRanges.end());
 
@@ -878,12 +878,12 @@
     DebugLocEntry Loc(StartLabel, EndLabel, Value);
     bool couldMerge = false;
 
-    // If this is a piece, it may belong to the current DebugLocEntry.
-    if (DIExpr->isBitPiece()) {
+    // If this is a fragment, it may belong to the current DebugLocEntry.
+    if (DIExpr->isFragment()) {
       // Add this value to the list of open ranges.
       OpenRanges.push_back(Value);
 
-      // Attempt to add the piece to the last entry.
+      // Attempt to add the fragment to the last entry.
       if (!DebugLoc.empty())
         if (DebugLoc.back().MergeValues(Loc))
           couldMerge = true;
@@ -891,7 +891,7 @@
 
     if (!couldMerge) {
       // Need to add a new DebugLocEntry. Add all values from still
-      // valid non-overlapping pieces.
+      // valid non-overlapping fragments.
       if (OpenRanges.size())
         Loc.addValues(OpenRanges);
 
@@ -1413,7 +1413,7 @@
 static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
                               ByteStreamer &Streamer,
                               const DebugLocEntry::Value &Value,
-                              unsigned PieceOffsetInBits) {
+                              unsigned FragmentOffsetInBits) {
   DIExpressionCursor ExprCursor(Value.getExpression());
   DebugLocDwarfExpression DwarfExpr(AP.getDwarfVersion(), Streamer);
   // Regular entry.
@@ -1435,13 +1435,13 @@
         DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset());
       else
         DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg(),
-                                          PieceOffsetInBits);
+                                          FragmentOffsetInBits);
     }
   } else if (Value.isConstantFP()) {
     APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
     DwarfExpr.AddUnsignedConstant(RawBytes);
   }
-  DwarfExpr.AddExpression(std::move(ExprCursor), PieceOffsetInBits);
+  DwarfExpr.AddExpression(std::move(ExprCursor), FragmentOffsetInBits);
 }
 
 void DebugLocEntry::finalize(const AsmPrinter &AP,
@@ -1450,32 +1450,32 @@
   DebugLocStream::EntryBuilder Entry(List, Begin, End);
   BufferByteStreamer Streamer = Entry.getStreamer();
   const DebugLocEntry::Value &Value = Values[0];
-  if (Value.isBitPiece()) {
-    // Emit all pieces that belong to the same variable and range.
+  if (Value.isFragment()) {
+    // Emit all fragments that belong to the same variable and range.
     assert(all_of(Values, [](DebugLocEntry::Value P) {
-          return P.isBitPiece();
-        }) && "all values are expected to be pieces");
+          return P.isFragment();
+        }) && "all values are expected to be fragments");
     assert(std::is_sorted(Values.begin(), Values.end()) &&
-           "pieces are expected to be sorted");
+           "fragments are expected to be sorted");
    
     unsigned Offset = 0;
-    for (auto Piece : Values) {
-      const DIExpression *Expr = Piece.getExpression();
-      unsigned PieceOffset = Expr->getBitPieceOffset();
-      unsigned PieceSize = Expr->getBitPieceSize();
-      assert(Offset <= PieceOffset && "overlapping or duplicate pieces");
-      if (Offset < PieceOffset) {
-        // The DWARF spec seriously mandates pieces with no locations for gaps.
+    for (auto Fragment : Values) {
+      const DIExpression *Expr = Fragment.getExpression();
+      unsigned FragmentOffset = Expr->getFragmentOffsetInBits();
+      unsigned FragmentSize = Expr->getFragmentSizeInBits();
+      assert(Offset <= FragmentOffset && "overlapping or duplicate fragments");
+      if (Offset < FragmentOffset) {
+        // DWARF represents gaps as pieces with no locations.
         DebugLocDwarfExpression Expr(AP.getDwarfVersion(), Streamer);
-        Expr.AddOpPiece(PieceOffset-Offset, 0);
-        Offset += PieceOffset-Offset;
+        Expr.AddOpPiece(FragmentOffset-Offset, 0);
+        Offset += FragmentOffset-Offset;
       }
-      Offset += PieceSize;
+      Offset += FragmentSize;
 
-      emitDebugLocValue(AP, BT, Streamer, Piece, PieceOffset);
+      emitDebugLocValue(AP, BT, Streamer, Fragment, FragmentOffset);
     }
   } else {
-    assert(Values.size() == 1 && "only pieces may have >1 value");
+    assert(Values.size() == 1 && "only fragments may have >1 value");
     emitDebugLocValue(AP, BT, Streamer, Value, 0);
   }
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 3a9fa8b..51eca07 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -135,7 +135,7 @@
     Expr.append(V.Expr.begin(), V.Expr.end());
     FrameIndex.append(V.FrameIndex.begin(), V.FrameIndex.end());
     assert(all_of(Expr, [](const DIExpression *E) {
-             return E && E->isBitPiece();
+             return E && E->isFragment();
            }) && "conflicting locations for variable");
   }
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 12273aa..fe999ef 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -82,10 +82,10 @@
   return true;
 }
 
-bool DwarfExpression::AddMachineRegPiece(const TargetRegisterInfo &TRI,
-                                         unsigned MachineReg,
-                                         unsigned PieceSizeInBits,
-                                         unsigned PieceOffsetInBits) {
+bool DwarfExpression::AddMachineRegFragment(const TargetRegisterInfo &TRI,
+                                            unsigned MachineReg,
+                                            unsigned FragmentSizeInBits,
+                                            unsigned FragmentOffsetInBits) {
   if (!TRI.isPhysicalRegister(MachineReg))
     return false;
 
@@ -94,13 +94,13 @@
   // If this is a valid register number, emit it.
   if (Reg >= 0) {
     AddReg(Reg);
-    if (PieceSizeInBits)
-      AddOpPiece(PieceSizeInBits, PieceOffsetInBits);
+    if (FragmentSizeInBits)
+      AddOpPiece(FragmentSizeInBits, FragmentOffsetInBits);
     return true;
   }
 
   // Walk up the super-register chain until we find a valid number.
-  // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0.
+  // For example, EAX on x86_64 is a 32-bit fragment of RAX with offset 0.
   for (MCSuperRegIterator SR(MachineReg, &TRI); SR.isValid(); ++SR) {
     Reg = TRI.getDwarfRegNum(*SR, false);
     if (Reg >= 0) {
@@ -108,16 +108,16 @@
       unsigned Size = TRI.getSubRegIdxSize(Idx);
       unsigned RegOffset = TRI.getSubRegIdxOffset(Idx);
       AddReg(Reg, "super-register");
-      if (PieceOffsetInBits == RegOffset) {
+      if (FragmentOffsetInBits == RegOffset) {
         AddOpPiece(Size, RegOffset);
       } else {
-        // If this is part of a variable in a sub-register at a
-        // non-zero offset, we need to manually shift the value into
-        // place, since the DW_OP_piece describes the part of the
-        // variable, not the position of the subregister.
+        // If this is part of a variable in a sub-register at a non-zero offset,
+        // we need to manually shift the value into place, since the
+        // DW_OP_LLVM_fragment describes the part of the variable, not the
+        // position of the subregister.
         if (RegOffset)
           AddShr(RegOffset);
-        AddOpPiece(Size, PieceOffsetInBits);
+        AddOpPiece(Size, FragmentOffsetInBits);
       }
       return true;
     }
@@ -125,10 +125,7 @@
 
   // Otherwise, attempt to find a covering set of sub-register numbers.
   // For example, Q0 on ARM is a composition of D0+D1.
-  //
-  // Keep track of the current position so we can emit the more
-  // efficient DW_OP_piece.
-  unsigned CurPos = PieceOffsetInBits;
+  unsigned CurPos = FragmentOffsetInBits;
   // The size of the register in bits, assuming 8 bits per byte.
   unsigned RegSize = TRI.getMinimalPhysRegClass(MachineReg)->getSize() * 8;
   // Keep track of the bits in the register we already emitted, so we
@@ -158,7 +155,7 @@
     }
   }
 
-  return CurPos > PieceOffsetInBits;
+  return CurPos > FragmentOffsetInBits;
 }
 
 void DwarfExpression::AddStackValue() {
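
AddMachineRegFragment above tries the register's own DWARF number first, then walks super-registers, emitting a piece of the super-register and shifting the value into place when the fragment sits at a different offset, before falling back to composing sub-registers. A simplified sketch of the direct and super-register cases only (toy register tables, not TargetRegisterInfo):

```cpp
#include <cstdio>
#include <map>
#include <string>

// Toy register table entry (illustrative only): how a sub-register maps into
// its super-register as a bit offset and size.
struct SubRegInfo {
  std::string Super;
  unsigned OffsetInBits;
  unsigned SizeInBits;
};

// Describe Reg as a fragment of the source variable, mirroring the
// "walk up to a super-register" step of AddMachineRegFragment.
bool addRegFragment(const std::map<std::string, int> &DwarfNum,
                    const std::map<std::string, SubRegInfo> &SuperOf,
                    const std::string &Reg, unsigned FragSize,
                    unsigned FragOffset) {
  auto D = DwarfNum.find(Reg);
  if (D != DwarfNum.end()) { // the register has its own DWARF number
    std::printf("DW_OP_reg %d\n", D->second);
    if (FragSize)
      std::printf("DW_OP_piece size=%u offset=%u\n", FragSize, FragOffset);
    return true;
  }
  auto S = SuperOf.find(Reg);
  if (S == SuperOf.end())
    return false; // would fall back to composing sub-registers
  const SubRegInfo &SR = S->second;
  std::printf("DW_OP_reg %d  (super-register %s)\n", DwarfNum.at(SR.Super),
              SR.Super.c_str());
  if (FragOffset == SR.OffsetInBits) {
    std::printf("DW_OP_piece size=%u offset=%u\n", SR.SizeInBits,
                SR.OffsetInBits);
  } else {
    if (SR.OffsetInBits) // shift the value into place first
      std::printf("DW_OP_shr %u\n", SR.OffsetInBits);
    std::printf("DW_OP_piece size=%u offset=%u\n", SR.SizeInBits, FragOffset);
  }
  return true;
}

int main() {
  std::map<std::string, int> DwarfNum;
  DwarfNum["RAX"] = 0; // EAX deliberately has no DWARF number in this toy table
  std::map<std::string, SubRegInfo> SuperOf;
  SuperOf["EAX"] = {"RAX", 0, 32};
  addRegFragment(DwarfNum, SuperOf, "EAX", /*FragSize=*/32, /*FragOffset=*/0);
  return 0;
}
```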
@@ -195,31 +192,31 @@
 }
 
 static unsigned getOffsetOrZero(unsigned OffsetInBits,
-                                unsigned PieceOffsetInBits) {
-  if (OffsetInBits == PieceOffsetInBits)
+                                unsigned FragmentOffsetInBits) {
+  if (OffsetInBits == FragmentOffsetInBits)
     return 0;
-  assert(OffsetInBits >= PieceOffsetInBits && "overlapping pieces");
+  assert(OffsetInBits >= FragmentOffsetInBits && "overlapping fragments");
   return OffsetInBits;
 }
 
 bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
                                               DIExpressionCursor &ExprCursor,
                                               unsigned MachineReg,
-                                              unsigned PieceOffsetInBits) {
+                                              unsigned FragmentOffsetInBits) {
   if (!ExprCursor)
-    return AddMachineRegPiece(TRI, MachineReg);
+    return AddMachineRegFragment(TRI, MachineReg);
 
   // Pattern-match combinations for which more efficient representations exist
   // first.
   bool ValidReg = false;
   auto Op = ExprCursor.peek();
   switch (Op->getOp()) {
-  case dwarf::DW_OP_bit_piece: {
+  case dwarf::DW_OP_LLVM_fragment: {
     unsigned OffsetInBits = Op->getArg(0);
     unsigned SizeInBits = Op->getArg(1);
     // Piece always comes at the end of the expression.
-    AddMachineRegPiece(TRI, MachineReg, SizeInBits,
-                       getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+    AddMachineRegFragment(TRI, MachineReg, SizeInBits,
+                          getOffsetOrZero(OffsetInBits, FragmentOffsetInBits));
     ExprCursor.take();
     break;
   }
@@ -234,7 +231,7 @@
           TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset);
       ExprCursor.consume(2);
     } else
-      ValidReg = AddMachineRegPiece(TRI, MachineReg);
+      ValidReg = AddMachineRegFragment(TRI, MachineReg);
     break;
   }
   case dwarf::DW_OP_deref:
@@ -248,14 +245,15 @@
 }
 
 void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
-                                    unsigned PieceOffsetInBits) {
+                                    unsigned FragmentOffsetInBits) {
   while (ExprCursor) {
     auto Op = ExprCursor.take();
     switch (Op->getOp()) {
-    case dwarf::DW_OP_bit_piece: {
+    case dwarf::DW_OP_LLVM_fragment: {
       unsigned OffsetInBits = Op->getArg(0);
       unsigned SizeInBits   = Op->getArg(1);
-      AddOpPiece(SizeInBits, getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+      AddOpPiece(SizeInBits,
+                 getOffsetOrZero(OffsetInBits, FragmentOffsetInBits));
       break;
     }
     case dwarf::DW_OP_plus:
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 06fa59b..b24074b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -76,7 +76,6 @@
 /// entry.
 class DwarfExpression {
 protected:
-  // Various convenience accessors that extract things out of AsmPrinter.
   unsigned DwarfVersion;
 
 public:
@@ -98,10 +97,14 @@
   /// Emit an (double-)indirect dwarf register operation.
   void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false);
 
-  /// Emit DW_OP_piece operation.
+  /// Emit a DW_OP_piece operation for a variable fragment.
+  /// \param OffsetInBits    This is the offset where the fragment appears
+  ///                        inside the *source variable*.
   void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+
   /// Emit a shift-right dwarf expression.
   void AddShr(unsigned ShiftBy);
+
   /// Emit a DW_OP_stack_value, if supported.
   ///
   /// The proper way to describe a constant value is DW_OP_constu <const>,
@@ -121,22 +124,22 @@
                              int Offset = 0);
 
   /// Emit a partial DWARF register operation.
-  /// \param MachineReg        the register
-  /// \param PieceSizeInBits   size and
-  /// \param PieceOffsetInBits offset of the piece in bits, if this is one
-  ///                          piece of an aggregate value.
   ///
-  /// If size and offset is zero an operation for the entire
-  /// register is emitted: Some targets do not provide a DWARF
-  /// register number for every register.  If this is the case, this
-  /// function will attempt to emit a DWARF register by emitting a
-  /// piece of a super-register or by piecing together multiple
-  /// subregisters that alias the register.
+  /// \param MachineReg           the register,
+  /// \param FragmentSizeInBits   size and
+  /// \param FragmentOffsetInBits offset of the fragment in bits, if this is
+  ///                             a fragment of an aggregate value.
+  ///
+  /// If size and offset are zero, an operation for the entire register is
+  /// emitted: Some targets do not provide a DWARF register number for every
+  /// register.  If this is the case, this function will attempt to emit a DWARF
+  /// register by emitting a fragment of a super-register or by piecing together
+  /// multiple subregisters that alias the register.
   ///
   /// \return false if no DWARF register exists for MachineReg.
-  bool AddMachineRegPiece(const TargetRegisterInfo &TRI, unsigned MachineReg,
-                          unsigned PieceSizeInBits = 0,
-                          unsigned PieceOffsetInBits = 0);
+  bool AddMachineRegFragment(const TargetRegisterInfo &TRI, unsigned MachineReg,
+                             unsigned FragmentSizeInBits = 0,
+                             unsigned FragmentOffsetInBits = 0);
 
   /// Emit a signed constant.
   void AddSignedConstant(int64_t Value);
@@ -149,17 +152,21 @@
   /// the prefix of a DwarfExpression if a more efficient representation for
   /// combining the register location and the first operation exists.
   ///
-  /// \param PieceOffsetInBits     If this is one piece out of a fragmented
-  /// location, this is the offset of the piece inside the entire variable.
-  /// \return false if no DWARF register exists for MachineReg.
+  /// \param FragmentOffsetInBits     If this is one fragment out of a fragmented
+  ///                                 location, this is the offset of the
+  ///                                 fragment inside the entire variable.
+  /// \return                         false if no DWARF register exists
+  ///                                 for MachineReg.
   bool AddMachineRegExpression(const TargetRegisterInfo &TRI,
                                DIExpressionCursor &Expr, unsigned MachineReg,
-                               unsigned PieceOffsetInBits = 0);
+                               unsigned FragmentOffsetInBits = 0);
   /// Emit all remaining operations in the DIExpressionCursor.
-  /// \param PieceOffsetInBits     If this is one piece out of a fragmented
-  /// location, this is the offset of the piece inside the entire variable.
+  ///
+  /// \param FragmentOffsetInBits     If this is one fragment out of multiple
+  ///                                 locations, this is the offset of the
+  ///                                 fragment inside the entire variable.
   void AddExpression(DIExpressionCursor &&Expr,
-                     unsigned PieceOffsetInBits = 0);
+                     unsigned FragmentOffsetInBits = 0);
 };
 
 /// DwarfExpression implementation for .debug_loc entries.
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 3179ab1..595f1d9 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -19,37 +19,7 @@
 
 namespace llvm {
 DwarfFile::DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA)
-    : Asm(AP), StrPool(DA, *Asm, Pref) {}
-
-DwarfFile::~DwarfFile() {
-  for (DIEAbbrev *Abbrev : Abbreviations)
-    Abbrev->~DIEAbbrev();
-}
-
-// Define a unique number for the abbreviation.
-//
-DIEAbbrev &DwarfFile::assignAbbrevNumber(DIE &Die) {
-  FoldingSetNodeID ID;
-  DIEAbbrev Abbrev = Die.generateAbbrev();
-  Abbrev.Profile(ID);
-
-  void *InsertPos;
-  if (DIEAbbrev *Existing =
-          AbbreviationsSet.FindNodeOrInsertPos(ID, InsertPos)) {
-    Die.setAbbrevNumber(Existing->getNumber());
-    return *Existing;
-  }
-
-  // Move the abbreviation to the heap and assign a number.
-  DIEAbbrev *New = new (AbbrevAllocator) DIEAbbrev(std::move(Abbrev));
-  Abbreviations.push_back(New);
-  New->setNumber(Abbreviations.size());
-  Die.setAbbrevNumber(Abbreviations.size());
-
-  // Store it for lookup.
-  AbbreviationsSet.InsertNode(New, InsertPos);
-  return *New;
-}
+    : Asm(AP), Abbrevs(AbbrevAllocator), StrPool(DA, *Asm, Pref) {}
 
 void DwarfFile::addUnit(std::unique_ptr<DwarfCompileUnit> U) {
   CUs.push_back(std::move(U));
@@ -98,44 +68,10 @@
 // Compute the size and offset of a DIE. The offset is relative to start of the
 // CU. It returns the offset after laying out the DIE.
 unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) {
-  // Record the abbreviation.
-  const DIEAbbrev &Abbrev = assignAbbrevNumber(Die);
-
-  // Set DIE offset
-  Die.setOffset(Offset);
-
-  // Start the size with the size of abbreviation code.
-  Offset += getULEB128Size(Die.getAbbrevNumber());
-
-  // Size the DIE attribute values.
-  for (const auto &V : Die.values())
-    // Size attribute value.
-    Offset += V.SizeOf(Asm);
-
-  // Size the DIE children if any.
-  if (Die.hasChildren()) {
-    (void)Abbrev;
-    assert(Abbrev.hasChildren() && "Children flag not set");
-
-    for (auto &Child : Die.children())
-      Offset = computeSizeAndOffset(Child, Offset);
-
-    // End of children marker.
-    Offset += sizeof(int8_t);
-  }
-
-  Die.setSize(Offset - Die.getOffset());
-  return Offset;
+  return Die.computeOffsetsAndAbbrevs(Asm, Abbrevs, Offset);
 }
 
-void DwarfFile::emitAbbrevs(MCSection *Section) {
-  // Check to see if it is worth the effort.
-  if (!Abbreviations.empty()) {
-    // Start the debug abbrev section.
-    Asm->OutStreamer->SwitchSection(Section);
-    Asm->emitDwarfAbbrevs(Abbreviations);
-  }
-}
+void DwarfFile::emitAbbrevs(MCSection *Section) { Abbrevs.Emit(Asm, Section); }
 
 // Emit strings into a string section.
 void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index b73d89b..d4d2ed2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -16,10 +16,10 @@
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/Allocator.h"
 #include <memory>
-#include <vector>
 
 namespace llvm {
 class AsmPrinter;
@@ -41,10 +41,7 @@
   BumpPtrAllocator AbbrevAllocator;
 
   // Used to uniquely define abbreviations.
-  FoldingSet<DIEAbbrev> AbbreviationsSet;
-
-  // A list of all the unique abbreviations in use.
-  std::vector<DIEAbbrev *> Abbreviations;
+  DIEAbbrevSet Abbrevs;
 
   // A pointer to all units in the section.
   SmallVector<std::unique_ptr<DwarfCompileUnit>, 1> CUs;
@@ -65,8 +62,6 @@
 public:
   DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA);
 
-  ~DwarfFile();
-
   const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
     return CUs;
   }
@@ -81,12 +76,6 @@
   /// \returns The size of the root DIE.
   unsigned computeSizeAndOffsetsForUnit(DwarfUnit *TheU);
 
-  /// Define a unique number for the abbreviation.
-  ///
-  /// Compute the abbreviation for \c Die, look up its unique number, and
-  /// return a reference to it in the uniquing table.
-  DIEAbbrev &assignAbbrevNumber(DIE &Die);
-
   /// \brief Add a unit to the list of CUs.
   void addUnit(std::unique_ptr<DwarfCompileUnit> U);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index b6b72b9..68fb5c9 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -371,11 +371,12 @@
   addSourceLine(Die, NS->getLine(), NS->getFilename(), NS->getDirectory());
 }
 
-bool DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
-                                   unsigned SizeInBits, unsigned OffsetInBits) {
+bool DwarfUnit::addRegisterFragment(DIELoc &TheDie, unsigned Reg,
+                                    unsigned SizeInBits,
+                                    unsigned OffsetInBits) {
   DIEDwarfExpression Expr(*Asm, *this, TheDie);
-  Expr.AddMachineRegPiece(*Asm->MF->getSubtarget().getRegisterInfo(), Reg,
-                          SizeInBits, OffsetInBits);
+  Expr.AddMachineRegFragment(*Asm->MF->getSubtarget().getRegisterInfo(), Reg,
+                             SizeInBits, OffsetInBits);
   return true;
 }
 
@@ -481,7 +482,7 @@
 
   bool validReg;
   if (Location.isReg())
-    validReg = addRegisterOpPiece(*Loc, Location.getReg());
+    validReg = addRegisterFragment(*Loc, Location.getReg());
   else
     validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index f365930..ed975cc 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -235,11 +235,13 @@
   /// Add template parameters in buffer.
   void addTemplateParams(DIE &Buffer, DINodeArray TParams);
 
-  /// Add register operand.
+  /// Add register operand for a source variable fragment of the specified size
+  /// and offset.
+  ///
   /// \returns false if the register does not exist, e.g., because it was never
-  /// materialized.
-  bool addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
-                          unsigned SizeInBits = 0, unsigned OffsetInBits = 0);
+  ///          materialized.
+  bool addRegisterFragment(DIELoc &TheDie, unsigned Reg,
+                           unsigned SizeInBits = 0, unsigned OffsetInBits = 0);
 
   /// Add register offset.
   /// \returns false if the register does not exist, e.g., because it was never
diff --git a/lib/CodeGen/AsmPrinter/LLVMBuild.txt b/lib/CodeGen/AsmPrinter/LLVMBuild.txt
index c40c5e7..2bb66d1 100644
--- a/lib/CodeGen/AsmPrinter/LLVMBuild.txt
+++ b/lib/CodeGen/AsmPrinter/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Library
 name = AsmPrinter
 parent = Libraries
-required_libraries = Analysis CodeGen Core DebugInfoCodeView DebugInfoMSF MC MCParser Support Target TransformUtils
+required_libraries = Analysis CodeGen Core DebugInfoCodeView DebugInfoMSF MC MCParser Support Target
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 75cb43d..919fc48 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -221,12 +221,12 @@
     const MCSymbol *PersHandlerSym =
         TLOF.getCFIPersonalitySymbol(PerFn, Asm->TM, MMI);
 
-    // Classify the personality routine so that we may reason about it.
-    EHPersonality Per = classifyEHPersonality(PerFn);
-
-    // Do not emit a .seh_handler directive if it is a C++ cleanup funclet.
-    if (Per != EHPersonality::MSVC_CXX ||
-        !CurrentFuncletEntry->isCleanupFuncletEntry())
+    // Do not emit a .seh_handler directive for cleanup funclets.
+    // FIXME: This means cleanup funclets cannot handle exceptions. Given that
+    // Clang doesn't produce EH constructs inside cleanup funclets and LLVM's
+    // inliner doesn't allow inlining them, this isn't a major problem in
+    // practice.
+    if (!CurrentFuncletEntry->isCleanupFuncletEntry())
       Asm->OutStreamer->EmitWinEHHandler(PersHandlerSym, true, true);
   }
 }
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 20c36a9..20dc880 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -353,7 +353,7 @@
   if (!UpdateLiveIns)
     return;
 
-  LiveRegs.init(TRI);
+  LiveRegs.init(*TRI);
   LiveRegs.addLiveOutsNoPristines(MBB);
   for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend()))
     LiveRegs.stepBackward(MI);
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 54164c9..e7c6b03 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -612,7 +612,7 @@
     return;
 
   // Collect this block's live out register units.
-  LiveRegSet.init(TRI);
+  LiveRegSet.init(*TRI);
   // We do not need to care about pristine registers as they are just preserved
   // but not actually used in the function.
   LiveRegSet.addLiveOutsNoPristines(*MBB);
diff --git a/lib/CodeGen/GlobalISel/CallLowering.cpp b/lib/CodeGen/GlobalISel/CallLowering.cpp
index 0e37ee6..7f17d34 100644
--- a/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -107,6 +107,7 @@
                                      ValueHandler &Handler) const {
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = *MF.getFunction();
+  const DataLayout &DL = F.getParent()->getDataLayout();
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -124,7 +125,9 @@
     if (VA.isRegLoc())
       Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA);
     else if (VA.isMemLoc()) {
-      unsigned Size = VA.getValVT().getSizeInBits() / 8;
+      unsigned Size = VA.getValVT() == MVT::iPTR
+                          ? DL.getPointerSize()
+                          : alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
       unsigned Offset = VA.getLocMemOffset();
       MachinePointerInfo MPO;
       unsigned StackAddr = Handler.getStackAddress(Size, Offset, MPO);
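
The new stack-size computation above rounds the value type's bit width up to whole bytes with alignTo(bits, 8) / 8, and falls back to DataLayout's pointer size for iPTR, which has no fixed bit width. A tiny sketch of the rounding step (equivalent arithmetic assumed, not the LLVM alignTo helper):

```cpp
#include <cstdio>

// Round a bit width up to whole bytes -- the same result as
// alignTo(SizeInBits, 8) / 8 in the patch.
unsigned byteSize(unsigned SizeInBits) { return (SizeInBits + 7) / 8; }

int main() {
  std::printf("%u %u %u\n", byteSize(1), byteSize(32), byteSize(33)); // 1 4 5
  return 0;
}
```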
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index e8d1ce0..936e1b8 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -72,9 +72,9 @@
       bool Success = translate(*CV, VReg);
       if (!Success) {
         if (!TPC->isGlobalISelAbortEnabled()) {
-          MIRBuilder.getMF().getProperties().set(
+          MF->getProperties().set(
               MachineFunctionProperties::Property::FailedISel);
-          return 0;
+          return VReg;
         }
         reportTranslationError(Val, "unable to translate constant");
       }
@@ -87,7 +87,6 @@
   if (FrameIndices.find(&AI) != FrameIndices.end())
     return FrameIndices[&AI];
 
-  MachineFunction &MF = MIRBuilder.getMF();
   unsigned ElementSize = DL->getTypeStoreSize(AI.getAllocatedType());
   unsigned Size =
       ElementSize * cast<ConstantInt>(AI.getArraySize())->getZExtValue();
@@ -100,7 +99,7 @@
     Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
 
   int &FI = FrameIndices[&AI];
-  FI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false, &AI);
+  FI = MF->getFrameInfo().CreateStackObject(Size, Alignment, false, &AI);
   return FI;
 }
 
@@ -114,7 +113,7 @@
     Alignment = LI->getAlignment();
     ValTy = LI->getType();
   } else if (!TPC->isGlobalISelAbortEnabled()) {
-    MIRBuilder.getMF().getProperties().set(
+    MF->getProperties().set(
         MachineFunctionProperties::Property::FailedISel);
     return 1;
   } else
@@ -126,14 +125,14 @@
 MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) {
   MachineBasicBlock *&MBB = BBToMBB[&BB];
   if (!MBB) {
-    MachineFunction &MF = MIRBuilder.getMF();
-    MBB = MF.CreateMachineBasicBlock();
-    MF.push_back(MBB);
+    MBB = MF->CreateMachineBasicBlock();
+    MF->push_back(MBB);
   }
   return *MBB;
 }
 
-bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U) {
+bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
+                                     MachineIRBuilder &MIRBuilder) {
   // FIXME: handle signed/unsigned wrapping flags.
 
   // Get or create a virtual register for each value.
@@ -147,7 +146,8 @@
   return true;
 }
 
-bool IRTranslator::translateCompare(const User &U) {
+bool IRTranslator::translateCompare(const User &U,
+                                    MachineIRBuilder &MIRBuilder) {
   const CmpInst *CI = dyn_cast<CmpInst>(&U);
   unsigned Op0 = getOrCreateVReg(*U.getOperand(0));
   unsigned Op1 = getOrCreateVReg(*U.getOperand(1));
@@ -164,7 +164,7 @@
   return true;
 }
 
-bool IRTranslator::translateRet(const User &U) {
+bool IRTranslator::translateRet(const User &U, MachineIRBuilder &MIRBuilder) {
   const ReturnInst &RI = cast<ReturnInst>(U);
   const Value *Ret = RI.getReturnValue();
   // The target may mess up with the insertion point, but
@@ -173,7 +173,7 @@
   return CLI->lowerReturn(MIRBuilder, Ret, !Ret ? 0 : getOrCreateVReg(*Ret));
 }
 
-bool IRTranslator::translateBr(const User &U) {
+bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
   const BranchInst &BrInst = cast<BranchInst>(U);
   unsigned Succ = 0;
   if (!BrInst.isUnconditional()) {
@@ -195,7 +195,7 @@
   return true;
 }
 
-bool IRTranslator::translateLoad(const User &U) {
+bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
   const LoadInst &LI = cast<LoadInst>(U);
 
   if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic())
@@ -206,19 +206,18 @@
                                : MachineMemOperand::MONone;
   Flags |= MachineMemOperand::MOLoad;
 
-  MachineFunction &MF = MIRBuilder.getMF();
   unsigned Res = getOrCreateVReg(LI);
   unsigned Addr = getOrCreateVReg(*LI.getPointerOperand());
   LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL};
   MIRBuilder.buildLoad(
       Res, Addr,
-      *MF.getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()),
-                               Flags, DL->getTypeStoreSize(LI.getType()),
-                               getMemOpAlignment(LI)));
+      *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()),
+                                Flags, DL->getTypeStoreSize(LI.getType()),
+                                getMemOpAlignment(LI)));
   return true;
 }
 
-bool IRTranslator::translateStore(const User &U) {
+bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
   const StoreInst &SI = cast<StoreInst>(U);
 
   if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic())
@@ -229,21 +228,22 @@
                                : MachineMemOperand::MONone;
   Flags |= MachineMemOperand::MOStore;
 
-  MachineFunction &MF = MIRBuilder.getMF();
   unsigned Val = getOrCreateVReg(*SI.getValueOperand());
   unsigned Addr = getOrCreateVReg(*SI.getPointerOperand());
   LLT VTy{*SI.getValueOperand()->getType(), *DL},
       PTy{*SI.getPointerOperand()->getType(), *DL};
 
   MIRBuilder.buildStore(
-      Val, Addr, *MF.getMachineMemOperand(
-                     MachinePointerInfo(SI.getPointerOperand()), Flags,
-                     DL->getTypeStoreSize(SI.getValueOperand()->getType()),
-                     getMemOpAlignment(SI)));
+      Val, Addr,
+      *MF->getMachineMemOperand(
+          MachinePointerInfo(SI.getPointerOperand()), Flags,
+          DL->getTypeStoreSize(SI.getValueOperand()->getType()),
+          getMemOpAlignment(SI)));
   return true;
 }
 
-bool IRTranslator::translateExtractValue(const User &U) {
+bool IRTranslator::translateExtractValue(const User &U,
+                                         MachineIRBuilder &MIRBuilder) {
   const Value *Src = U.getOperand(0);
   Type *Int32Ty = Type::getInt32Ty(U.getContext());
   SmallVector<Value *, 1> Indices;
@@ -268,7 +268,8 @@
   return true;
 }
 
-bool IRTranslator::translateInsertValue(const User &U) {
+bool IRTranslator::translateInsertValue(const User &U,
+                                        MachineIRBuilder &MIRBuilder) {
   const Value *Src = U.getOperand(0);
   Type *Int32Ty = Type::getInt32Ty(U.getContext());
   SmallVector<Value *, 1> Indices;
@@ -295,14 +296,16 @@
   return true;
 }
 
-bool IRTranslator::translateSelect(const User &U) {
+bool IRTranslator::translateSelect(const User &U,
+                                   MachineIRBuilder &MIRBuilder) {
   MIRBuilder.buildSelect(getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)),
                          getOrCreateVReg(*U.getOperand(1)),
                          getOrCreateVReg(*U.getOperand(2)));
   return true;
 }
 
-bool IRTranslator::translateBitCast(const User &U) {
+bool IRTranslator::translateBitCast(const User &U,
+                                    MachineIRBuilder &MIRBuilder) {
   if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) {
     unsigned &Reg = ValToVReg[&U];
     if (Reg)
@@ -311,17 +314,19 @@
       Reg = getOrCreateVReg(*U.getOperand(0));
     return true;
   }
-  return translateCast(TargetOpcode::G_BITCAST, U);
+  return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
 }
 
-bool IRTranslator::translateCast(unsigned Opcode, const User &U) {
+bool IRTranslator::translateCast(unsigned Opcode, const User &U,
+                                 MachineIRBuilder &MIRBuilder) {
   unsigned Op = getOrCreateVReg(*U.getOperand(0));
   unsigned Res = getOrCreateVReg(U);
   MIRBuilder.buildInstr(Opcode).addDef(Res).addUse(Op);
   return true;
 }
 
-bool IRTranslator::translateGetElementPtr(const User &U) {
+bool IRTranslator::translateGetElementPtr(const User &U,
+                                          MachineIRBuilder &MIRBuilder) {
   // FIXME: support vector GEPs.
   if (U.getType()->isVectorTy())
     return false;
@@ -391,7 +396,8 @@
   return true;
 }
 
-bool IRTranslator::translateMemcpy(const CallInst &CI) {
+bool IRTranslator::translateMemcpy(const CallInst &CI,
+                                   MachineIRBuilder &MIRBuilder) {
   LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL};
   if (cast<PointerType>(CI.getArgOperand(0)->getType())->getAddressSpace() !=
           0 ||
@@ -412,73 +418,28 @@
                         CallLowering::ArgInfo(0, CI.getType()), Args);
 }
 
-void IRTranslator::getStackGuard(unsigned DstReg) {
+void IRTranslator::getStackGuard(unsigned DstReg,
+                                 MachineIRBuilder &MIRBuilder) {
   auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD);
   MIB.addDef(DstReg);
 
-  auto &MF = MIRBuilder.getMF();
-  auto &TLI = *MF.getSubtarget().getTargetLowering();
-  Value *Global = TLI.getSDagStackGuard(*MF.getFunction()->getParent());
+  auto &TLI = *MF->getSubtarget().getTargetLowering();
+  Value *Global = TLI.getSDagStackGuard(*MF->getFunction()->getParent());
   if (!Global)
     return;
 
   MachinePointerInfo MPInfo(Global);
-  MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+  MachineInstr::mmo_iterator MemRefs = MF->allocateMemRefsArray(1);
   auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable;
   *MemRefs =
-      MF.getMachineMemOperand(MPInfo, Flags, DL->getPointerSizeInBits() / 8,
-                              DL->getPointerABIAlignment());
+      MF->getMachineMemOperand(MPInfo, Flags, DL->getPointerSizeInBits() / 8,
+                               DL->getPointerABIAlignment());
   MIB.setMemRefs(MemRefs, MemRefs + 1);
 }
 
-bool IRTranslator::translateKnownIntrinsic(const CallInst &CI,
-                                           Intrinsic::ID ID) {
-  unsigned Op = 0;
-  switch (ID) {
-  default: return false;
-  case Intrinsic::uadd_with_overflow: Op = TargetOpcode::G_UADDE; break;
-  case Intrinsic::sadd_with_overflow: Op = TargetOpcode::G_SADDO; break;
-  case Intrinsic::usub_with_overflow: Op = TargetOpcode::G_USUBE; break;
-  case Intrinsic::ssub_with_overflow: Op = TargetOpcode::G_SSUBO; break;
-  case Intrinsic::umul_with_overflow: Op = TargetOpcode::G_UMULO; break;
-  case Intrinsic::smul_with_overflow: Op = TargetOpcode::G_SMULO; break;
-  case Intrinsic::memcpy:
-    return translateMemcpy(CI);
-  case Intrinsic::eh_typeid_for: {
-    GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));
-    unsigned Reg = getOrCreateVReg(CI);
-    unsigned TypeID = MIRBuilder.getMF().getTypeIDFor(GV);
-    MIRBuilder.buildConstant(Reg, TypeID);
-    return true;
-  }
-  case Intrinsic::objectsize: {
-    // If we don't know by now, we're never going to know.
-    const ConstantInt *Min = cast<ConstantInt>(CI.getArgOperand(1));
-
-    MIRBuilder.buildConstant(getOrCreateVReg(CI), Min->isZero() ? -1ULL : 0);
-    return true;
-  }
-  case Intrinsic::stackguard:
-    getStackGuard(getOrCreateVReg(CI));
-    return true;
-  case Intrinsic::stackprotector: {
-    MachineFunction &MF = MIRBuilder.getMF();
-    LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL};
-    unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy);
-    getStackGuard(GuardVal);
-
-    AllocaInst *Slot = cast<AllocaInst>(CI.getArgOperand(1));
-    MIRBuilder.buildStore(
-        GuardVal, getOrCreateVReg(*Slot),
-        *MF.getMachineMemOperand(
-            MachinePointerInfo::getFixedStack(MF, getOrCreateFrameIndex(*Slot)),
-            MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
-            PtrTy.getSizeInBits() / 8, 8));
-    return true;
-  }
-  }
-
+bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
+                                              MachineIRBuilder &MIRBuilder) {
   LLT Ty{*CI.getOperand(0)->getType(), *DL};
   LLT s1 = LLT::scalar(1);
   unsigned Width = Ty.getSizeInBits();
@@ -500,9 +461,70 @@
   return true;
 }
 
-bool IRTranslator::translateCall(const User &U) {
+bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
+                                           MachineIRBuilder &MIRBuilder) {
+  switch (ID) {
+  default:
+    break;
+  case Intrinsic::dbg_declare:
+  case Intrinsic::dbg_value:
+    // FIXME: these obviously need to be supported properly.
+    MF->getProperties().set(
+          MachineFunctionProperties::Property::FailedISel);
+    return true;
+  case Intrinsic::uadd_with_overflow:
+    return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder);
+  case Intrinsic::sadd_with_overflow:
+    return translateOverflowIntrinsic(CI, TargetOpcode::G_SADDO, MIRBuilder);
+  case Intrinsic::usub_with_overflow:
+    return translateOverflowIntrinsic(CI, TargetOpcode::G_USUBE, MIRBuilder);
+  case Intrinsic::ssub_with_overflow:
+    return translateOverflowIntrinsic(CI, TargetOpcode::G_SSUBO, MIRBuilder);
+  case Intrinsic::umul_with_overflow:
+    return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
+  case Intrinsic::smul_with_overflow:
+    return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
+  case Intrinsic::memcpy:
+    return translateMemcpy(CI, MIRBuilder);
+  case Intrinsic::eh_typeid_for: {
+    GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));
+    unsigned Reg = getOrCreateVReg(CI);
+    unsigned TypeID = MF->getTypeIDFor(GV);
+    MIRBuilder.buildConstant(Reg, TypeID);
+    return true;
+  }
+  case Intrinsic::objectsize: {
+    // If we don't know by now, we're never going to know.
+    const ConstantInt *Min = cast<ConstantInt>(CI.getArgOperand(1));
+
+    MIRBuilder.buildConstant(getOrCreateVReg(CI), Min->isZero() ? -1ULL : 0);
+    return true;
+  }
+  case Intrinsic::stackguard:
+    getStackGuard(getOrCreateVReg(CI), MIRBuilder);
+    return true;
+  case Intrinsic::stackprotector: {
+    LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL};
+    unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy);
+    getStackGuard(GuardVal, MIRBuilder);
+
+    AllocaInst *Slot = cast<AllocaInst>(CI.getArgOperand(1));
+    MIRBuilder.buildStore(
+        GuardVal, getOrCreateVReg(*Slot),
+        *MF->getMachineMemOperand(
+            MachinePointerInfo::getFixedStack(*MF,
+                                              getOrCreateFrameIndex(*Slot)),
+            MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+            PtrTy.getSizeInBits() / 8, 8));
+    return true;
+  }
+  }
+  return false;
+}
+
+bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
   const CallInst &CI = cast<CallInst>(U);
-  auto TII = MIRBuilder.getMF().getTarget().getIntrinsicInfo();
+  auto TII = MF->getTarget().getIntrinsicInfo();
   const Function *F = CI.getCalledFunction();
 
   if (!F || !F->isIntrinsic()) {
@@ -522,7 +544,7 @@
 
   assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");
 
-  if (translateKnownIntrinsic(CI, ID))
+  if (translateKnownIntrinsic(CI, ID, MIRBuilder))
     return true;
 
   unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI);
@@ -538,10 +560,10 @@
   return true;
 }
 
-bool IRTranslator::translateInvoke(const User &U) {
+bool IRTranslator::translateInvoke(const User &U,
+                                   MachineIRBuilder &MIRBuilder) {
   const InvokeInst &I = cast<InvokeInst>(U);
-  MachineFunction &MF = MIRBuilder.getMF();
-  MCContext &Context = MF.getContext();
+  MCContext &Context = MF->getContext();
 
   const BasicBlock *ReturnBB = I.getSuccessor(0);
   const BasicBlock *EHPadBB = I.getSuccessor(1);
@@ -584,26 +606,26 @@
   // FIXME: track probabilities.
   MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB),
                     &ReturnMBB = getOrCreateBB(*ReturnBB);
-  MF.addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
+  MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
   MIRBuilder.getMBB().addSuccessor(&ReturnMBB);
   MIRBuilder.getMBB().addSuccessor(&EHPadMBB);
 
   return true;
 }
 
-bool IRTranslator::translateLandingPad(const User &U) {
+bool IRTranslator::translateLandingPad(const User &U,
+                                       MachineIRBuilder &MIRBuilder) {
   const LandingPadInst &LP = cast<LandingPadInst>(U);
 
   MachineBasicBlock &MBB = MIRBuilder.getMBB();
-  MachineFunction &MF = MIRBuilder.getMF();
   addLandingPadInfo(LP, MBB);
 
   MBB.setIsEHPad();
 
   // If there aren't registers to copy the values into (e.g., during SjLj
   // exceptions), then don't bother.
-  auto &TLI = *MF.getSubtarget().getTargetLowering();
-  const Constant *PersonalityFn = MF.getFunction()->getPersonalityFn();
+  auto &TLI = *MF->getSubtarget().getTargetLowering();
+  const Constant *PersonalityFn = MF->getFunction()->getPersonalityFn();
   if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
       TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
     return true;
@@ -618,7 +640,7 @@
   // Add a label to mark the beginning of the landing pad.  Deletion of the
   // landing pad can thus be detected via the MachineModuleInfo.
   MIRBuilder.buildInstr(TargetOpcode::EH_LABEL)
-    .addSym(MF.addLandingPad(&MBB));
+    .addSym(MF->addLandingPad(&MBB));
 
   // Mark exception register as live in.
   SmallVector<unsigned, 2> Regs;
@@ -642,7 +664,8 @@
   return true;
 }
 
-bool IRTranslator::translateStaticAlloca(const AllocaInst &AI) {
+bool IRTranslator::translateStaticAlloca(const AllocaInst &AI,
+                                         MachineIRBuilder &MIRBuilder) {
   if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca())
     return false;
 
@@ -653,7 +676,7 @@
   return true;
 }
 
-bool IRTranslator::translatePHI(const User &U) {
+bool IRTranslator::translatePHI(const User &U, MachineIRBuilder &MIRBuilder) {
   const PHINode &PI = cast<PHINode>(U);
   auto MIB = MIRBuilder.buildInstr(TargetOpcode::PHI);
   MIB.addDef(getOrCreateVReg(PI));
@@ -665,7 +688,7 @@
 void IRTranslator::finishPendingPhis() {
   for (std::pair<const PHINode *, MachineInstr *> &Phi : PendingPHIs) {
     const PHINode *PI = Phi.first;
-    MachineInstrBuilder MIB(MIRBuilder.getMF(), Phi.second);
+    MachineInstrBuilder MIB(*MF, Phi.second);
 
     // All MachineBasicBlocks exist, add them to the PHI. We assume IRTranslator
     // won't create extra control flow here, otherwise we need to find the
@@ -678,15 +701,13 @@
       MIB.addMBB(BBToMBB[PI->getIncomingBlock(i)]);
     }
   }
-
-  PendingPHIs.clear();
 }
 
 bool IRTranslator::translate(const Instruction &Inst) {
-  MIRBuilder.setDebugLoc(Inst.getDebugLoc());
+  CurBuilder.setDebugLoc(Inst.getDebugLoc());
   switch(Inst.getOpcode()) {
 #define HANDLE_INST(NUM, OPCODE, CLASS) \
-    case Instruction::OPCODE: return translate##OPCODE(Inst);
+    case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder);
 #include "llvm/IR/Instruction.def"
   default:
     if (!TPC->isGlobalISelAbortEnabled())
@@ -697,21 +718,19 @@
 
 bool IRTranslator::translate(const Constant &C, unsigned Reg) {
   if (auto CI = dyn_cast<ConstantInt>(&C))
-    EntryBuilder.buildConstant(Reg, CI->getZExtValue());
+    EntryBuilder.buildConstant(Reg, *CI);
   else if (auto CF = dyn_cast<ConstantFP>(&C))
     EntryBuilder.buildFConstant(Reg, *CF);
   else if (isa<UndefValue>(C))
     EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg);
   else if (isa<ConstantPointerNull>(C))
-    EntryBuilder.buildInstr(TargetOpcode::G_CONSTANT)
-        .addDef(Reg)
-        .addImm(0);
+    EntryBuilder.buildConstant(Reg, 0);
   else if (auto GV = dyn_cast<GlobalValue>(&C))
     EntryBuilder.buildGlobalValue(Reg, GV);
   else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
     switch(CE->getOpcode()) {
 #define HANDLE_INST(NUM, OPCODE, CLASS)                         \
-      case Instruction::OPCODE: return translate##OPCODE(*CE);
+      case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder);
 #include "llvm/IR/Instruction.def"
     default:
       if (!TPC->isGlobalISelAbortEnabled())
@@ -727,73 +746,78 @@
 }
 
 void IRTranslator::finalizeFunction() {
-  finishPendingPhis();
-
   // Release the memory used by the different maps we
   // needed during the translation.
+  PendingPHIs.clear();
   ValToVReg.clear();
   FrameIndices.clear();
   Constants.clear();
 }
 
-bool IRTranslator::runOnMachineFunction(MachineFunction &MF) {
-  const Function &F = *MF.getFunction();
+bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
+  MF = &CurMF;
+  const Function &F = *MF->getFunction();
   if (F.empty())
     return false;
-  CLI = MF.getSubtarget().getCallLowering();
-  MIRBuilder.setMF(MF);
-  EntryBuilder.setMF(MF);
-  MRI = &MF.getRegInfo();
+  CLI = MF->getSubtarget().getCallLowering();
+  CurBuilder.setMF(*MF);
+  EntryBuilder.setMF(*MF);
+  MRI = &MF->getRegInfo();
   DL = &F.getParent()->getDataLayout();
   TPC = &getAnalysis<TargetPassConfig>();
 
   assert(PendingPHIs.empty() && "stale PHIs");
 
-  // Setup the arguments.
-  MachineBasicBlock &MBB = getOrCreateBB(F.front());
-  MIRBuilder.setMBB(MBB);
+  // Set up a separate basic block for the arguments and constants, falling
+  // through to the IR-level Function's entry block.
+  MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock();
+  MF->push_back(EntryBB);
+  EntryBB->addSuccessor(&getOrCreateBB(F.front()));
+  EntryBuilder.setMBB(*EntryBB);
+
+  // Lower the actual args into this basic block.
   SmallVector<unsigned, 8> VRegArgs;
   for (const Argument &Arg: F.args())
     VRegArgs.push_back(getOrCreateVReg(Arg));
-  bool Succeeded = CLI->lowerFormalArguments(MIRBuilder, F, VRegArgs);
+  bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs);
   if (!Succeeded) {
     if (!TPC->isGlobalISelAbortEnabled()) {
-      MIRBuilder.getMF().getProperties().set(
+      MF->getProperties().set(
           MachineFunctionProperties::Property::FailedISel);
+      finalizeFunction();
       return false;
     }
     report_fatal_error("Unable to lower arguments");
   }
 
-  // Now that we've got the ABI handling code, it's safe to set a location for
-  // any Constants we find in the IR.
-  if (MBB.empty())
-    EntryBuilder.setMBB(MBB);
-  else
-    EntryBuilder.setInstr(MBB.back(), /* Before */ false);
-
+  // And translate the function!
   for (const BasicBlock &BB: F) {
     MachineBasicBlock &MBB = getOrCreateBB(BB);
     // Set the insertion point of all the following translations to
     // the end of this basic block.
-    MIRBuilder.setMBB(MBB);
+    CurBuilder.setMBB(MBB);
 
     for (const Instruction &Inst: BB) {
-      bool Succeeded = translate(Inst);
+      Succeeded &= translate(Inst);
       if (!Succeeded) {
         if (TPC->isGlobalISelAbortEnabled())
           reportTranslationError(Inst, "unable to translate instruction");
-        MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+        MF->getProperties().set(
+            MachineFunctionProperties::Property::FailedISel);
         break;
       }
     }
   }
 
-  finalizeFunction();
+  if (Succeeded) {
+    finishPendingPhis();
 
-  // Now that the MachineFrameInfo has been configured, no further changes to
-  // the reserved registers are possible.
-  MRI->freezeReservedRegs(MF);
+    // Now that the MachineFrameInfo has been configured, no further changes to
+    // the reserved registers are possible.
+    MRI->freezeReservedRegs(*MF);
+  }
+
+  finalizeFunction();
 
   return false;
 }
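
Worth noting for readers of the hunk above: every translate* helper now receives the MachineIRBuilder explicitly, so argument lowering and constants can target the new dedicated entry block while ordinary instructions follow CurBuilder. Below is a minimal standalone sketch of that threading pattern, using hypothetical Builder/Translator stand-ins rather than the real LLVM classes:

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for MachineIRBuilder: remembers which block it
    // emits into and appends rendered instructions to a shared log.
    struct Builder {
      std::string Block;
      std::vector<std::string> *Out;
      void emit(const std::string &Instr) { Out->push_back(Block + ": " + Instr); }
    };

    // Hypothetical stand-in for IRTranslator: one builder pinned to the entry
    // block (arguments/constants) and one that follows the current block.
    struct Translator {
      std::vector<std::string> MIR;
      Builder EntryBuilder{"entry", &MIR};
      Builder CurBuilder{"bb.1", &MIR};

      // Helpers take the builder explicitly, mirroring the refactor above.
      void translateConstant(int C, Builder &B) {
        B.emit("G_CONSTANT " + std::to_string(C));
      }
      void translateAdd(Builder &B) { B.emit("G_ADD"); }
    };

    int main() {
      Translator T;
      T.translateConstant(42, T.EntryBuilder); // constants land in the entry block
      T.translateAdd(T.CurBuilder);            // instructions land in the current block
      for (const auto &Line : T.MIR)
        std::cout << Line << "\n";
    }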
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ffb22b2..eb25b6c 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -244,7 +244,7 @@
   }
   case TargetOpcode::G_CONSTANT: {
     unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
-    MIRBuilder.buildConstant(DstExt, MI.getOperand(1).getImm());
+    MIRBuilder.buildConstant(DstExt, *MI.getOperand(1).getCImm());
     MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
     MI.eraseFromParent();
     return Legalized;
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index ebbed12..e496620 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -71,6 +71,11 @@
   // These *have* to be implemented for now, they're the fundamental basis of
   // how everything else is transformed.
 
+  // Nothing is going to go well with types that aren't a power of 2 yet, so
+  // don't even try because we might make things worse.
+  if (!isPowerOf2_64(Aspect.Type.getSizeInBits()))
+    return std::make_pair(Unsupported, LLT());
+
   // FIXME: the long-term plan calls for expansion in terms of load/store (if
   // they're not legal).
   if (Aspect.Opcode == TargetOpcode::G_SEQUENCE ||
@@ -88,7 +93,9 @@
     if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal)
       return std::make_pair(Legal, Ty);
 
-    assert(DefaultAction->second == NarrowScalar && "unexpected default");
+    if (DefaultAction == DefaultActions.end() ||
+        DefaultAction->second != NarrowScalar)
+      return std::make_pair(Unsupported, LLT());
     return findLegalAction(Aspect, NarrowScalar);
   }
 
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 22f8d98..c04f6e4a 100644
--- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -27,34 +27,29 @@
   this->MRI = &MF.getRegInfo();
   this->TII = MF.getSubtarget().getInstrInfo();
   this->DL = DebugLoc();
-  this->MI = nullptr;
+  this->II = MachineBasicBlock::iterator();
   this->InsertedInstr = nullptr;
 }
 
-void MachineIRBuilder::setMBB(MachineBasicBlock &MBB, bool Beginning) {
+void MachineIRBuilder::setMBB(MachineBasicBlock &MBB) {
   this->MBB = &MBB;
-  this->MI = nullptr;
-  Before = Beginning;
+  this->II = MBB.end();
   assert(&getMF() == MBB.getParent() &&
          "Basic block is in a different function");
 }
 
-void MachineIRBuilder::setInstr(MachineInstr &MI, bool Before) {
+void MachineIRBuilder::setInstr(MachineInstr &MI) {
   assert(MI.getParent() && "Instruction is not part of a basic block");
   setMBB(*MI.getParent());
-  this->MI = &MI;
-  this->Before = Before;
+  this->II = MI.getIterator();
 }
 
-MachineBasicBlock::iterator MachineIRBuilder::getInsertPt() {
-  if (MI) {
-    if (Before)
-      return MI;
-    if (!MI->getNextNode())
-      return getMBB().end();
-    return MI->getNextNode();
-  }
-  return Before ? getMBB().begin() : getMBB().end();
+void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator II) {
+  assert(MBB.getParent() == &getMF() &&
+         "Basic block is in a different function");
+  this->MBB = &MBB;
+  this->II = II;
 }
 
 void MachineIRBuilder::recordInsertions(
@@ -165,10 +160,26 @@
   return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op);
 }
 
-MachineInstrBuilder MachineIRBuilder::buildConstant(unsigned Res, int64_t Val) {
-  assert(MRI->getType(Res).isScalar() && "invalid operand type");
+MachineInstrBuilder MachineIRBuilder::buildConstant(unsigned Res,
+                                                    const ConstantInt &Val) {
+  LLT Ty = MRI->getType(Res);
 
-  return buildInstr(TargetOpcode::G_CONSTANT).addDef(Res).addImm(Val);
+  assert((Ty.isScalar() || Ty.isPointer()) && "invalid operand type");
+
+  const ConstantInt *NewVal = &Val;
+  if (Ty.getSizeInBits() != Val.getBitWidth())
+    NewVal = ConstantInt::get(MF->getFunction()->getContext(),
+                              Val.getValue().sextOrTrunc(Ty.getSizeInBits()));
+
+  return buildInstr(TargetOpcode::G_CONSTANT).addDef(Res).addCImm(NewVal);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildConstant(unsigned Res,
+                                                    int64_t Val) {
+  auto IntN = IntegerType::get(MF->getFunction()->getContext(),
+                               MRI->getType(Res).getSizeInBits());
+  ConstantInt *CI = ConstantInt::get(IntN, Val, true);
+  return buildConstant(Res, *CI);
 }
 
 MachineInstrBuilder MachineIRBuilder::buildFConstant(unsigned Res,
@@ -376,11 +387,12 @@
 MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
                                                   unsigned Op0, unsigned Op1) {
 #ifndef NDEBUG
-  assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
+  LLT ResTy = MRI->getType(Res);
+  assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) &&
          "invalid operand type");
-  assert(MRI->getType(Res) == MRI->getType(Op0) &&
-         MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
-  if (MRI->getType(Res).isScalar())
+  assert(ResTy == MRI->getType(Op0) && ResTy == MRI->getType(Op1) &&
+         "type mismatch");
+  if (ResTy.isScalar() || ResTy.isPointer())
     assert(MRI->getType(Tst).isScalar() && "type mismatch");
   else
     assert(MRI->getType(Tst).isVector() &&
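
The buildConstant overloads above now accept a ConstantInt and use APInt::sextOrTrunc to match the destination register's width. A small plain-integer sketch of that width adjustment (a hypothetical helper, not the APInt API):

    #include <cstdint>
    #include <iostream>

    // Hypothetical helper mirroring what buildConstant now does via
    // APInt::sextOrTrunc: adjust a signed value to the destination width.
    int64_t sextOrTrunc(int64_t Val, unsigned FromBits, unsigned ToBits) {
      if (ToBits < FromBits) { // truncate, then re-sign-extend from bit ToBits-1
        uint64_t Mask = (1ULL << ToBits) - 1; // ToBits < 64 here
        uint64_t Bits = static_cast<uint64_t>(Val) & Mask;
        uint64_t Sign = 1ULL << (ToBits - 1);
        return static_cast<int64_t>(Bits ^ Sign) - static_cast<int64_t>(Sign);
      }
      return Val; // widening a signed 64-bit host value needs no work here
    }

    int main() {
      std::cout << sextOrTrunc(-1, 32, 8) << "\n";  // -1 stays -1 at any width
      std::cout << sextOrTrunc(255, 32, 8) << "\n"; // 0xff reinterpreted as i8 is -1
    }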
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 2a20e43..04bb7ca 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -367,6 +367,9 @@
     const RegBankSelect::MappingCost *BestCost) {
   assert((MBFI || !BestCost) && "Costs comparison require MBFI");
 
+  if (!InstrMapping.isValid())
+    return MappingCost::ImpossibleCost();
+
   // If mapped with InstrMapping, MI will have the recorded cost.
   MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent()) : 1);
   bool Saturated = Cost.addLocalCost(InstrMapping.getCost());
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index e81af66..56a25e0 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -1517,13 +1517,13 @@
 
   // Initialize liveins to the first BB. These are potentially redefined by
   // predicated instructions.
-  Redefs.init(TRI);
+  Redefs.init(*TRI);
   Redefs.addLiveIns(CvtMBB);
   Redefs.addLiveIns(NextMBB);
 
   // Compute a set of registers which must not be killed by instructions in
   // BB1: This is everything live-in to BB2.
-  DontKill.init(TRI);
+  DontKill.init(*TRI);
   DontKill.addLiveIns(NextMBB);
 
   if (CvtMBB.pred_size() > 1) {
@@ -1621,7 +1621,7 @@
 
   // Initialize liveins to the first BB. These are potentially redefined by
   // predicated instructions.
-  Redefs.init(TRI);
+  Redefs.init(*TRI);
   Redefs.addLiveIns(CvtMBB);
   Redefs.addLiveIns(NextMBB);
 
@@ -1785,7 +1785,7 @@
   // - BB1 live-out regs need implicit uses before being redefined by BB2
   //   instructions. We start with BB1 live-ins so we have the live-out regs
   //   after tracking the BB1 instructions.
-  Redefs.init(TRI);
+  Redefs.init(*TRI);
   Redefs.addLiveIns(MBB1);
   Redefs.addLiveIns(MBB2);
 
@@ -1811,7 +1811,7 @@
   // Compute a set of registers which must not be killed by instructions in BB1:
   // This is everything used+live in BB2 after the duplicated instructions. We
   // can compute this set by simulating liveness backwards from the end of BB2.
-  DontKill.init(TRI);
+  DontKill.init(*TRI);
   for (const MachineInstr &MI : make_range(MBB2.rbegin(), ++DI2.getReverse()))
     DontKill.stepBackward(MI);
 
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 3e5ae5f..422f2dc 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -768,6 +768,11 @@
       FoldOps.push_back(Idx);
   }
 
+  // If we only have implicit uses, we won't be able to fold that.
+  // Moreover, TargetInstrInfo::foldMemoryOperand will assert if we try!
+  if (FoldOps.empty())
+    return false;
+
   MachineInstrSpan MIS(MI);
 
   MachineInstr *FoldMI =
diff --git a/lib/CodeGen/LLVMBuild.txt b/lib/CodeGen/LLVMBuild.txt
index 36d6cc1..86d3624 100644
--- a/lib/CodeGen/LLVMBuild.txt
+++ b/lib/CodeGen/LLVMBuild.txt
@@ -22,4 +22,4 @@
 type = Library
 name = CodeGen
 parent = Libraries
-required_libraries = Analysis BitReader BitWriter Core Instrumentation MC ProfileData Scalar Support Target TransformUtils
+required_libraries = Analysis BitReader BitWriter Core MC Scalar Support Target TransformUtils
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 590b10b..0fa0b91 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7137,6 +7137,15 @@
       return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
   }
 
+  // fold (sext_in_reg (zext x)) -> (sext x)
+  // iff we are extending the source sign bit.
+  if (N0.getOpcode() == ISD::ZERO_EXTEND) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getScalarValueSizeInBits() == EVTBits &&
+        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
+      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
+  }
+
   // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
   if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
     return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType());
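
A quick numeric sanity check of the new (sext_in_reg (zext x)) -> (sext x) fold, in plain C++ with no SelectionDAG types; it only holds because the zext source width equals EVTBits, as the guard above requires:

    #include <cstdint>
    #include <iostream>

    // sext_in_reg from bit 7, written with fully defined arithmetic.
    int32_t sextInReg8(int32_t V) { return ((V & 0xff) ^ 0x80) - 0x80; }

    int main() {
      int8_t X = -42;                              // 0xd6
      int32_t ZExt = static_cast<uint8_t>(X);      // (zext x) -> 0x000000d6
      int32_t SExt = static_cast<int32_t>(X);      // (sext x) -> 0xffffffd6
      std::cout << (sextInReg8(ZExt) == SExt) << "\n"; // prints 1
    }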
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d970ff4..3485e35 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3312,17 +3312,49 @@
   }
   case ISD::MULHU:
   case ISD::MULHS: {
-    unsigned ExpandOpcode = Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI :
-                                                              ISD::SMUL_LOHI;
+    unsigned ExpandOpcode =
+        Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI : ISD::SMUL_LOHI;
     EVT VT = Node->getValueType(0);
     SDVTList VTs = DAG.getVTList(VT, VT);
-    assert(TLI.isOperationLegalOrCustom(ExpandOpcode, VT) &&
-           "If this wasn't legal, it shouldn't have been created!");
+
     Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
                        Node->getOperand(1));
     Results.push_back(Tmp1.getValue(1));
     break;
   }
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI: {
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+    MVT VT = LHS.getSimpleValueType();
+    unsigned MULHOpcode =
+        Node->getOpcode() == ISD::UMUL_LOHI ? ISD::MULHU : ISD::MULHS;
+
+    if (TLI.isOperationLegalOrCustom(MULHOpcode, VT)) {
+      Results.push_back(DAG.getNode(ISD::MUL, dl, VT, LHS, RHS));
+      Results.push_back(DAG.getNode(MULHOpcode, dl, VT, LHS, RHS));
+      break;
+    }
+
+    SmallVector<SDValue, 4> Halves;
+    EVT HalfType = EVT(VT).getHalfSizedIntegerVT(*DAG.getContext());
+    assert(TLI.isTypeLegal(HalfType));
+    if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, Node, LHS, RHS, Halves,
+                           HalfType, DAG,
+                           TargetLowering::MulExpansionKind::Always)) {
+      for (unsigned i = 0; i < 2; ++i) {
+        SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Halves[2 * i]);
+        SDValue Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Halves[2 * i + 1]);
+        SDValue Shift = DAG.getConstant(
+            HalfType.getScalarSizeInBits(), dl,
+            TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
+        Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
+        Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
+      }
+      break;
+    }
+    break;
+  }
   case ISD::MUL: {
     EVT VT = Node->getValueType(0);
     SDVTList VTs = DAG.getVTList(VT, VT);
@@ -3357,7 +3389,8 @@
         TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND, VT) &&
         TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
         TLI.isOperationLegalOrCustom(ISD::OR, VT) &&
-        TLI.expandMUL(Node, Lo, Hi, HalfType, DAG)) {
+        TLI.expandMUL(Node, Lo, Hi, HalfType, DAG,
+                      TargetLowering::MulExpansionKind::OnlyLegalOrCustom)) {
       Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
       Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi);
       SDValue Shift =
@@ -3515,6 +3548,15 @@
       TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf,
                              DAG.getConstant(0, dl, VT), ISD::SETNE);
     }
+
+    // Truncate the result if SetCC returns a larger type than needed.
+    EVT RType = Node->getValueType(1);
+    if (RType.getSizeInBits() < TopHalf.getValueSizeInBits())
+      TopHalf = DAG.getNode(ISD::TRUNCATE, dl, RType, TopHalf);
+
+    assert(RType.getSizeInBits() == TopHalf.getValueSizeInBits() &&
+           "Unexpected result type for S/UMULO legalization");
+
     Results.push_back(BottomHalf);
     Results.push_back(TopHalf);
     break;
@@ -4185,6 +4227,24 @@
     Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1));
     break;
   }
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI: {
+    // Promote to a multiply in a wider integer type.
+    unsigned ExtOp = Node->getOpcode() == ISD::UMUL_LOHI ? ISD::ZERO_EXTEND
+                                                         : ISD::SIGN_EXTEND;
+    Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+    Tmp1 = DAG.getNode(ISD::MUL, dl, NVT, Tmp1, Tmp2);
+
+    auto &DL = DAG.getDataLayout();
+    unsigned OriginalSize = OVT.getScalarSizeInBits();
+    Tmp2 = DAG.getNode(
+        ISD::SRL, dl, NVT, Tmp1,
+        DAG.getConstant(OriginalSize, dl, TLI.getScalarShiftAmountTy(DL, NVT)));
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2));
+    break;
+  }
   case ISD::SELECT: {
     unsigned ExtOp, TruncOp;
     if (Node->getValueType(0).isVector() ||
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 12fe168..9d07371 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2189,7 +2189,9 @@
   GetExpandedInteger(N->getOperand(0), LL, LH);
   GetExpandedInteger(N->getOperand(1), RL, RH);
 
-  if (TLI.expandMUL(N, Lo, Hi, NVT, DAG, LL, LH, RL, RH))
+  if (TLI.expandMUL(N, Lo, Hi, NVT, DAG,
+                    TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
+                    LL, LH, RL, RH))
     return;
 
   // If nothing else, we can make a libcall.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8eba6a3..d4fa20f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -333,6 +333,8 @@
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI:
     QueryType = Node->getValueType(0);
     break;
   case ISD::FP_ROUND_INREG:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index fc0cdee..cbe5a24 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2039,7 +2039,8 @@
   if (!DemandedElts)
     return;  // No demanded elts, better to assume we don't know anything.
 
-  switch (Op.getOpcode()) {
+  unsigned Opcode = Op.getOpcode();
+  switch (Opcode) {
   case ISD::Constant:
     // We know all of the bits for a constant!
     KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
@@ -2082,16 +2083,16 @@
     const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
     assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
     for (unsigned i = 0; i != NumElts; ++i) {
+      if (!DemandedElts[i])
+        continue;
+
       int M = SVN->getMaskElt(i);
       if (M < 0) {
         // For UNDEF elements, we don't know anything about the common state of
         // the shuffle result.
-        // FIXME: Is this too pessimistic?
         KnownZero = KnownOne = APInt(BitWidth, 0);
         break;
       }
-      if (!DemandedElts[i])
-        continue;
 
       if ((unsigned)M < NumElts)
         DemandedLHS.setBit((unsigned)M % NumElts);
@@ -2147,6 +2148,50 @@
     }
     break;
   }
+  case ISD::BITCAST: {
+    SDValue N0 = Op.getOperand(0);
+    unsigned SubBitWidth = N0.getScalarValueSizeInBits();
+
+    // Ignore bitcasts from floating point.
+    if (!N0.getValueType().isInteger())
+      break;
+
+    // Fast handling of 'identity' bitcasts.
+    if (BitWidth == SubBitWidth) {
+      computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
+      break;
+    }
+
+    // TODO: Support big-endian targets when it becomes useful.
+    bool IsLE = getDataLayout().isLittleEndian();
+    if (!IsLE)
+      break;
+
+    // Bitcast 'small element' vector to 'large element' scalar/vector.
+    if ((BitWidth % SubBitWidth) == 0) {
+      assert(N0.getValueType().isVector() && "Expected bitcast from vector");
+
+      // Collect known bits for the (larger) output by collecting the known
+      // bits from each set of sub elements and shift these into place.
+      // We need to separately call computeKnownBits for each set of
+      // sub elements as the knownbits for each is likely to be different.
+      unsigned SubScale = BitWidth / SubBitWidth;
+      APInt SubDemandedElts(NumElts * SubScale, 0);
+      for (unsigned i = 0; i != NumElts; ++i)
+        if (DemandedElts[i])
+          SubDemandedElts.setBit(i * SubScale);
+
+      for (unsigned i = 0; i != SubScale; ++i) {
+        computeKnownBits(N0, KnownZero2, KnownOne2, SubDemandedElts.shl(i),
+                         Depth + 1);
+        KnownOne |= KnownOne2.zext(BitWidth).shl(SubBitWidth * i);
+        KnownZero |= KnownZero2.zext(BitWidth).shl(SubBitWidth * i);
+      }
+    }
+
+    // TODO - support ((SubBitWidth % BitWidth) == 0) when it becomes useful.
+    break;
+  }
   case ISD::AND:
     // If either the LHS or the RHS are Zero, the result is zero.
     computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts,
@@ -2478,7 +2523,7 @@
     KnownZeroLow = std::min(KnownZeroLow,
                             KnownZero2.countTrailingOnes());
 
-    if (Op.getOpcode() == ISD::ADD) {
+    if (Opcode == ISD::ADD) {
       KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow);
       if (KnownZeroHigh > 1)
         KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1);
@@ -2561,9 +2606,6 @@
     break;
   }
   case ISD::EXTRACT_VECTOR_ELT: {
-    // At the moment we keep this simple and skip tracking the specific
-    // element. This way we get the lowest common denominator for all elements
-    // of the vector.
     SDValue InVec = Op.getOperand(0);
     SDValue EltNo = Op.getOperand(1);
     EVT VecVT = InVec.getValueType();
@@ -2605,8 +2647,10 @@
   case ISD::UMAX: {
     APInt Op0Zero, Op0One;
     APInt Op1Zero, Op1One;
-    computeKnownBits(Op.getOperand(0), Op0Zero, Op0One, Depth+1);
-    computeKnownBits(Op.getOperand(1), Op1Zero, Op1One, Depth+1);
+    computeKnownBits(Op.getOperand(0), Op0Zero, Op0One, DemandedElts,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(1), Op1Zero, Op1One, DemandedElts,
+                     Depth + 1);
 
     KnownZero = Op0Zero & Op1Zero;
     KnownOne = Op0One & Op1One;
@@ -2622,7 +2666,7 @@
     break;
 
   default:
-    if (Op.getOpcode() < ISD::BUILTIN_OP_END)
+    if (Opcode < ISD::BUILTIN_OP_END)
       break;
     LLVM_FALLTHROUGH;
   case ISD::INTRINSIC_WO_CHAIN:
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 08373e0..4cc04f3 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -554,16 +554,30 @@
     // simplify the LHS, here we're using information from the LHS to simplify
     // the RHS.
     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      SDValue Op0 = Op.getOperand(0);
       APInt LHSZero, LHSOne;
       // Do not increment Depth here; that can cause an infinite loop.
-      TLO.DAG.computeKnownBits(Op.getOperand(0), LHSZero, LHSOne, Depth);
+      TLO.DAG.computeKnownBits(Op0, LHSZero, LHSOne, Depth);
       // If the LHS already has zeros where RHSC does, this and is dead.
       if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
-        return TLO.CombineTo(Op, Op.getOperand(0));
+        return TLO.CombineTo(Op, Op0);
+
       // If any of the set bits in the RHS are known zero on the LHS, shrink
       // the constant.
       if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
         return true;
+
+      // Bitwise-not (xor X, -1) is a special case: we don't usually shrink its
+      // constant, but if this 'and' is only clearing bits that were just set by
+      // the xor, then this 'and' can be eliminated by shrinking the mask of
+      // the xor. For example, for a 32-bit X:
+      // and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1
+      if (isBitwiseNot(Op0) && Op0.hasOneUse() &&
+          LHSOne == ~RHSC->getAPIntValue()) {
+        SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, Op.getValueType(),
+                                      Op0.getOperand(0), Op.getOperand(1));
+        return TLO.CombineTo(Op, Xor);
+      }
     }
 
     if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
@@ -679,10 +693,10 @@
 
     // If the RHS is a constant, see if we can simplify it.
     // for XOR, we prefer to force bits to 1 if they will make a -1.
-    // if we can't force bits, try to shrink constant
+    // If we can't force bits, try to shrink the constant.
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
       APInt Expanded = C->getAPIntValue() | (~NewMask);
-      // if we can expand it to have all bits set, do it
+      // If we can expand it to have all bits set, do it.
       if (Expanded.isAllOnesValue()) {
         if (Expanded != C->getAPIntValue()) {
           EVT VT = Op.getValueType();
@@ -690,7 +704,7 @@
                                         TLO.DAG.getConstant(Expanded, dl, VT));
           return TLO.CombineTo(Op, New);
         }
-        // if it already has all the bits set, nothing to change
+        // If it already has all the bits set, nothing to change
         // but don't shrink either!
       } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
         return true;
@@ -3079,24 +3093,29 @@
 // Legalization Utilities
 //===----------------------------------------------------------------------===//
 
-bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
-                               SelectionDAG &DAG, SDValue LL, SDValue LH,
-                               SDValue RL, SDValue RH) const {
-  EVT VT = N->getValueType(0);
-  SDLoc dl(N);
+bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl,
+                                    SDValue LHS, SDValue RHS,
+                                    SmallVectorImpl<SDValue> &Result,
+                                    EVT HiLoVT, SelectionDAG &DAG,
+                                    MulExpansionKind Kind, SDValue LL,
+                                    SDValue LH, SDValue RL, SDValue RH) const {
+  assert(Opcode == ISD::MUL || Opcode == ISD::UMUL_LOHI ||
+         Opcode == ISD::SMUL_LOHI);
 
-  bool HasMULHS = isOperationLegalOrCustom(ISD::MULHS, HiLoVT);
-  bool HasMULHU = isOperationLegalOrCustom(ISD::MULHU, HiLoVT);
-  bool HasSMUL_LOHI = isOperationLegalOrCustom(ISD::SMUL_LOHI, HiLoVT);
-  bool HasUMUL_LOHI = isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT);
+  bool HasMULHS = (Kind == MulExpansionKind::Always) ||
+                  isOperationLegalOrCustom(ISD::MULHS, HiLoVT);
+  bool HasMULHU = (Kind == MulExpansionKind::Always) ||
+                  isOperationLegalOrCustom(ISD::MULHU, HiLoVT);
+  bool HasSMUL_LOHI = (Kind == MulExpansionKind::Always) ||
+                      isOperationLegalOrCustom(ISD::SMUL_LOHI, HiLoVT);
+  bool HasUMUL_LOHI = (Kind == MulExpansionKind::Always) ||
+                      isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT);
 
   if (!HasMULHU && !HasMULHS && !HasUMUL_LOHI && !HasSMUL_LOHI)
     return false;
 
   unsigned OuterBitSize = VT.getScalarSizeInBits();
   unsigned InnerBitSize = HiLoVT.getScalarSizeInBits();
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
   unsigned LHSSB = DAG.ComputeNumSignBits(LHS);
   unsigned RHSSB = DAG.ComputeNumSignBits(RHS);
 
@@ -3120,6 +3139,8 @@
     return false;
   };
 
+  SDValue Lo, Hi;
+
   if (!LL.getNode() && !RL.getNode() &&
       isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
     LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LHS);
@@ -3130,20 +3151,41 @@
     return false;
 
   APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
-  if (DAG.MaskedValueIsZero(N->getOperand(0), HighMask) &&
-      DAG.MaskedValueIsZero(N->getOperand(1), HighMask)) {
+  if (DAG.MaskedValueIsZero(LHS, HighMask) &&
+      DAG.MaskedValueIsZero(RHS, HighMask)) {
     // The inputs are both zero-extended.
-    if (MakeMUL_LOHI(LL, RL, Lo, Hi, false))
+    if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {
+      Result.push_back(Lo);
+      Result.push_back(Hi);
+      if (Opcode != ISD::MUL) {
+        SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
+        Result.push_back(Zero);
+        Result.push_back(Zero);
+      }
       return true;
-  }
-  if (LHSSB > InnerBitSize && RHSSB > InnerBitSize) {
-    // The input values are both sign-extended.
-    if (MakeMUL_LOHI(LL, RL, Lo, Hi, true))
-      return true;
+    }
   }
 
-  SDValue Shift = DAG.getConstant(OuterBitSize - InnerBitSize, dl,
-                                  getShiftAmountTy(VT, DAG.getDataLayout()));
+  if (!VT.isVector() && Opcode == ISD::MUL && LHSSB > InnerBitSize &&
+      RHSSB > InnerBitSize) {
+    // The input values are both sign-extended.
+    // TODO non-MUL case?
+    if (MakeMUL_LOHI(LL, RL, Lo, Hi, true)) {
+      Result.push_back(Lo);
+      Result.push_back(Hi);
+      return true;
+    }
+  }
+
+  unsigned ShiftAmount = OuterBitSize - InnerBitSize;
+  EVT ShiftAmountTy = getShiftAmountTy(VT, DAG.getDataLayout());
+  if (APInt::getMaxValue(ShiftAmountTy.getSizeInBits()).ult(ShiftAmount)) {
+    // FIXME getShiftAmountTy does not always return a sensible result when VT
+    // is an illegal type, and so the type may be too small to fit the shift
+    // amount. Override it with i32. The shift will have to be legalized.
+    ShiftAmountTy = MVT::i32;
+  }
+  SDValue Shift = DAG.getConstant(ShiftAmount, dl, ShiftAmountTy);
 
   if (!LH.getNode() && !RH.getNode() &&
       isOperationLegalOrCustom(ISD::SRL, VT) &&
@@ -3157,15 +3199,84 @@
   if (!LH.getNode())
     return false;
 
-  if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {
+  if (!MakeMUL_LOHI(LL, RL, Lo, Hi, false))
+    return false;
+
+  Result.push_back(Lo);
+
+  if (Opcode == ISD::MUL) {
     RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH);
     LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL);
     Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH);
     Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH);
+    Result.push_back(Hi);
     return true;
   }
 
-  return false;
+  // Compute the full width result.
+  auto Merge = [&](SDValue Lo, SDValue Hi) -> SDValue {
+    Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
+    Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
+    Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
+    return DAG.getNode(ISD::OR, dl, VT, Lo, Hi);
+  };
+
+  SDValue Next = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
+  if (!MakeMUL_LOHI(LL, RH, Lo, Hi, false))
+    return false;
+
+  // This is effectively the add part of a multiply-add of half-sized operands,
+  // so it cannot overflow.
+  Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));
+
+  if (!MakeMUL_LOHI(LH, RL, Lo, Hi, false))
+    return false;
+
+  Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next,
+                     Merge(Lo, Hi));
+
+  SDValue Carry = Next.getValue(1);
+  Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
+  Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);
+
+  if (!MakeMUL_LOHI(LH, RH, Lo, Hi, Opcode == ISD::SMUL_LOHI))
+    return false;
+
+  SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
+  Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero,
+                   Carry);
+  Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));
+
+  if (Opcode == ISD::SMUL_LOHI) {
+    SDValue NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
+                                  DAG.getNode(ISD::ZERO_EXTEND, dl, VT, RL));
+    Next = DAG.getSelectCC(dl, LH, Zero, NextSub, Next, ISD::SETLT);
+
+    NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
+                          DAG.getNode(ISD::ZERO_EXTEND, dl, VT, LL));
+    Next = DAG.getSelectCC(dl, RH, Zero, NextSub, Next, ISD::SETLT);
+  }
+
+  Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
+  Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);
+  Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
+  return true;
+}
+
+bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
+                               SelectionDAG &DAG, MulExpansionKind Kind,
+                               SDValue LL, SDValue LH, SDValue RL,
+                               SDValue RH) const {
+  SmallVector<SDValue, 2> Result;
+  bool Ok = expandMUL_LOHI(N->getOpcode(), N->getValueType(0), N,
+                           N->getOperand(0), N->getOperand(1), Result, HiLoVT,
+                           DAG, Kind, LL, LH, RL, RH);
+  if (Ok) {
+    assert(Result.size() == 2);
+    Lo = Result[0];
+    Hi = Result[1];
+  }
+  return Ok;
 }
 
 bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
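
For reference, the expandMUL_LOHI path above is the classic schoolbook decomposition (LH*2^n + LL) * (RH*2^n + RL), with the carry handled by the ADDC/ADDE chain. A standalone plain-integer illustration of the same decomposition follows; it is not the DAG lowering itself, and the __int128 reference check is a GCC/Clang extension:

    #include <cstdint>
    #include <cstdio>

    // Plain-integer split into 32-bit halves; the DAG code instead builds
    // MULHU/UMUL_LOHI, SHL, OR and an ADDC/ADDE carry chain.
    void mul64x64(uint64_t A, uint64_t B, uint64_t &Lo, uint64_t &Hi) {
      uint64_t LL = A & 0xffffffffu, LH = A >> 32;
      uint64_t RL = B & 0xffffffffu, RH = B >> 32;

      uint64_t LoLo = LL * RL; // bits [0,63] of the product
      uint64_t Mid1 = LL * RH; // bits [32,95]
      uint64_t Mid2 = LH * RL; // bits [32,95]
      uint64_t HiHi = LH * RH; // bits [64,127]

      uint64_t Mid = (LoLo >> 32) + (Mid1 & 0xffffffffu) + (Mid2 & 0xffffffffu);
      Lo = (LoLo & 0xffffffffu) | (Mid << 32);
      Hi = HiHi + (Mid1 >> 32) + (Mid2 >> 32) + (Mid >> 32);
    }

    int main() {
      uint64_t A = 0xdeadbeefcafef00dULL, B = 0x0123456789abcdefULL, Lo, Hi;
      mul64x64(A, B, Lo, Hi);
      unsigned __int128 Ref = (unsigned __int128)A * B; // GCC/Clang reference check
      std::printf("match: %d\n", Hi == (uint64_t)(Ref >> 64) && Lo == (uint64_t)Ref);
    }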
diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 546e632..a5ef7c8 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -125,7 +125,7 @@
   // For all basic blocks in the function.
   for (auto &MBB : MF) {
     DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
-    LiveRegs.init(TRI);
+    LiveRegs.init(*TRI);
     // FIXME: This should probably be addLiveOuts().
     LiveRegs.addLiveOutsNoPristines(MBB);
     bool HasStackMap = false;
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 2798036..e7ea2b4 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -38,8 +38,8 @@
 
 using namespace llvm;
 
-static cl::opt<bool> DisablePostRA("disable-post-ra", cl::Hidden,
-    cl::desc("Disable Post Regalloc"));
+static cl::opt<bool> DisablePostRASched("disable-post-ra", cl::Hidden,
+    cl::desc("Disable Post Regalloc Scheduler"));
 static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
     cl::desc("Disable branch folding"));
 static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden,
@@ -157,7 +157,7 @@
 static IdentifyingPassPtr overridePass(AnalysisID StandardID,
                                        IdentifyingPassPtr TargetID) {
   if (StandardID == &PostRASchedulerID)
-    return applyDisable(TargetID, DisablePostRA);
+    return applyDisable(TargetID, DisablePostRASched);
 
   if (StandardID == &BranchFolderPassID)
     return applyDisable(TargetID, DisableBranchFold);
diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
index d11c5ae..c666f29 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
@@ -269,6 +269,15 @@
   return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
+int64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSignedConstant(
+    const DWARFUnit *U, dwarf::Attribute Attr, int64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<int64_t> Result = FormValue.getAsSignedConstant();
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
 uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsignedConstant(
     const DWARFUnit *U, dwarf::Attribute Attr, 
     uint64_t FailValue) const {
diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp
index 403c02b..e52c88a 100644
--- a/lib/DebugInfo/MSF/MappedBlockStream.cpp
+++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp
@@ -63,6 +63,7 @@
 MappedBlockStream::createIndexedStream(const MSFLayout &Layout,
                                        const ReadableStream &MsfData,
                                        uint32_t StreamIndex) {
+  assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
   MSFStreamLayout SL;
   SL.Blocks = Layout.StreamMap[StreamIndex];
   SL.Length = Layout.StreamSizes[StreamIndex];
@@ -334,6 +335,7 @@
 WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout,
                                                const WritableStream &MsfData,
                                                uint32_t StreamIndex) {
+  assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
   MSFStreamLayout SL;
   SL.Blocks = Layout.StreamMap[StreamIndex];
   SL.Length = Layout.StreamSizes[StreamIndex];
diff --git a/lib/DebugInfo/PDB/Raw/PDBFile.cpp b/lib/DebugInfo/PDB/Raw/PDBFile.cpp
index b429d21..5349151 100644
--- a/lib/DebugInfo/PDB/Raw/PDBFile.cpp
+++ b/lib/DebugInfo/PDB/Raw/PDBFile.cpp
@@ -227,9 +227,10 @@
     if (!DbiS)
       return DbiS.takeError();
 
-    auto GlobalS = MappedBlockStream::createIndexedStream(
+    auto GlobalS = safelyCreateIndexedStream(
         ContainerLayout, *Buffer, DbiS->getGlobalSymbolStreamIndex());
-    auto TempGlobals = llvm::make_unique<GlobalsStream>(std::move(GlobalS));
+    if (!GlobalS) return GlobalS.takeError();
+    auto TempGlobals = llvm::make_unique<GlobalsStream>(std::move(*GlobalS));
     if (auto EC = TempGlobals->reload())
       return std::move(EC);
     Globals = std::move(TempGlobals);
@@ -239,9 +240,9 @@
 
 Expected<InfoStream &> PDBFile::getPDBInfoStream() {
   if (!Info) {
-    auto InfoS = MappedBlockStream::createIndexedStream(ContainerLayout,
-                                                        *Buffer, StreamPDB);
-    auto TempInfo = llvm::make_unique<InfoStream>(std::move(InfoS));
+    auto InfoS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamPDB);
+    if (!InfoS) return InfoS.takeError();
+    auto TempInfo = llvm::make_unique<InfoStream>(std::move(*InfoS));
     if (auto EC = TempInfo->reload())
       return std::move(EC);
     Info = std::move(TempInfo);
@@ -251,9 +252,9 @@
 
 Expected<DbiStream &> PDBFile::getPDBDbiStream() {
   if (!Dbi) {
-    auto DbiS = MappedBlockStream::createIndexedStream(ContainerLayout, *Buffer,
-                                                       StreamDBI);
-    auto TempDbi = llvm::make_unique<DbiStream>(*this, std::move(DbiS));
+    auto DbiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamDBI);
+    if (!DbiS) return DbiS.takeError();
+    auto TempDbi = llvm::make_unique<DbiStream>(*this, std::move(*DbiS));
     if (auto EC = TempDbi->reload())
       return std::move(EC);
     Dbi = std::move(TempDbi);
@@ -263,9 +264,9 @@
 
 Expected<TpiStream &> PDBFile::getPDBTpiStream() {
   if (!Tpi) {
-    auto TpiS = MappedBlockStream::createIndexedStream(ContainerLayout, *Buffer,
-                                                       StreamTPI);
-    auto TempTpi = llvm::make_unique<TpiStream>(*this, std::move(TpiS));
+    auto TpiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamTPI);
+    if (!TpiS) return TpiS.takeError();
+    auto TempTpi = llvm::make_unique<TpiStream>(*this, std::move(*TpiS));
     if (auto EC = TempTpi->reload())
       return std::move(EC);
     Tpi = std::move(TempTpi);
@@ -275,9 +276,9 @@
 
 Expected<TpiStream &> PDBFile::getPDBIpiStream() {
   if (!Ipi) {
-    auto IpiS = MappedBlockStream::createIndexedStream(ContainerLayout, *Buffer,
-                                                       StreamIPI);
-    auto TempIpi = llvm::make_unique<TpiStream>(*this, std::move(IpiS));
+    auto IpiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamIPI);
+    if (!IpiS) return IpiS.takeError();
+    auto TempIpi = llvm::make_unique<TpiStream>(*this, std::move(*IpiS));
     if (auto EC = TempIpi->reload())
       return std::move(EC);
     Ipi = std::move(TempIpi);
@@ -291,12 +292,11 @@
     if (!DbiS)
       return DbiS.takeError();
 
-    uint32_t PublicsStreamNum = DbiS->getPublicSymbolStreamIndex();
-
-    auto PublicS = MappedBlockStream::createIndexedStream(
-        ContainerLayout, *Buffer, PublicsStreamNum);
+    auto PublicS = safelyCreateIndexedStream(
+        ContainerLayout, *Buffer, DbiS->getPublicSymbolStreamIndex());
+    if (!PublicS) return PublicS.takeError();
     auto TempPublics =
-        llvm::make_unique<PublicsStream>(*this, std::move(PublicS));
+        llvm::make_unique<PublicsStream>(*this, std::move(*PublicS));
     if (auto EC = TempPublics->reload())
       return std::move(EC);
     Publics = std::move(TempPublics);
@@ -311,10 +311,11 @@
       return DbiS.takeError();
 
     uint32_t SymbolStreamNum = DbiS->getSymRecordStreamIndex();
-    auto SymbolS = MappedBlockStream::createIndexedStream(
-        ContainerLayout, *Buffer, SymbolStreamNum);
+    auto SymbolS =
+        safelyCreateIndexedStream(ContainerLayout, *Buffer, SymbolStreamNum);
+    if (!SymbolS) return SymbolS.takeError();
 
-    auto TempSymbols = llvm::make_unique<SymbolStream>(std::move(SymbolS));
+    auto TempSymbols = llvm::make_unique<SymbolStream>(std::move(*SymbolS));
     if (auto EC = TempSymbols->reload())
       return std::move(EC);
     Symbols = std::move(TempSymbols);
@@ -330,19 +331,61 @@
 
     uint32_t NameStreamIndex = IS->getNamedStreamIndex("/names");
 
-    if (NameStreamIndex == 0)
-      return make_error<RawError>(raw_error_code::no_stream);
-    if (NameStreamIndex >= getNumStreams())
-      return make_error<RawError>(raw_error_code::no_stream);
-    auto NS = MappedBlockStream::createIndexedStream(ContainerLayout, *Buffer,
-                                                     NameStreamIndex);
+    auto NS =
+        safelyCreateIndexedStream(ContainerLayout, *Buffer, NameStreamIndex);
+    if (!NS) return NS.takeError();
 
-    StreamReader Reader(*NS);
+    StreamReader Reader(**NS);
     auto N = llvm::make_unique<NameHashTable>();
     if (auto EC = N->load(Reader))
       return std::move(EC);
     StringTable = std::move(N);
-    StringTableStream = std::move(NS);
+    StringTableStream = std::move(*NS);
   }
   return *StringTable;
 }
+
+bool PDBFile::hasPDBDbiStream() const { return StreamDBI < getNumStreams(); }
+
+bool PDBFile::hasPDBGlobalsStream() {
+  auto DbiS = getPDBDbiStream();
+  if (!DbiS) return false;
+  return DbiS->getGlobalSymbolStreamIndex() < getNumStreams();
+}
+
+bool PDBFile::hasPDBInfoStream() { return StreamPDB < getNumStreams(); }
+
+bool PDBFile::hasPDBIpiStream() const { return StreamIPI < getNumStreams(); }
+
+bool PDBFile::hasPDBPublicsStream() {
+  auto DbiS = getPDBDbiStream();
+  if (!DbiS) return false;
+  return DbiS->getPublicSymbolStreamIndex() < getNumStreams();
+}
+
+bool PDBFile::hasPDBSymbolStream() {
+  auto DbiS = getPDBDbiStream();
+  if (!DbiS) return false;
+  return DbiS->getSymRecordStreamIndex() < getNumStreams();
+}
+
+bool PDBFile::hasPDBTpiStream() const { return StreamTPI < getNumStreams(); }
+
+bool PDBFile::hasStringTable() {
+  auto IS = getPDBInfoStream();
+  if (!IS) return false;
+  return IS->getNamedStreamIndex("/names") < getNumStreams();
+}
+
+/// Wrapper around MappedBlockStream::createIndexedStream()
+/// that checks if a stream with that index actually exists.
+/// If it does not, the return value will have a RawError with
+/// code raw_error_code::no_stream. Otherwise, the return value will
+/// contain the stream returned by createIndexedStream().
+Expected<std::unique_ptr<MappedBlockStream>> PDBFile::safelyCreateIndexedStream(
+    const MSFLayout &Layout, const ReadableStream &MsfData,
+    uint32_t StreamIndex) const {
+  if (StreamIndex >= getNumStreams())
+    return make_error<RawError>(raw_error_code::no_stream);
+  return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex);
+}
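
The PDBFile getters above all follow the same check-then-dereference discipline on the Expected<> returned by safelyCreateIndexedStream. A tiny self-contained sketch of that shape, using a hypothetical ExpectedLite wrapper instead of llvm::Expected:

    #include <iostream>
    #include <memory>
    #include <string>

    // Minimal stand-in for the Expected<T> flow: either a value or an error
    // string, and callers must check before dereferencing.
    template <typename T> struct ExpectedLite {
      std::unique_ptr<T> Val;
      std::string Err;
      explicit operator bool() const { return Val != nullptr; }
      T &operator*() { return *Val; }
    };

    ExpectedLite<unsigned> safelyCreateIndexedStream(unsigned Index, unsigned NumStreams) {
      if (Index >= NumStreams)
        return {nullptr, "no stream"}; // mirrors raw_error_code::no_stream
      return {std::make_unique<unsigned>(Index), ""};
    }

    int main() {
      auto S = safelyCreateIndexedStream(7, 4);
      if (!S) { // check first, exactly like the `if (!GlobalS) return ...` guards above
        std::cout << "error: " << S.Err << "\n";
        return 1;
      }
      std::cout << "stream " << *S << "\n";
    }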
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 974a603..3b3a03f 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -511,79 +511,9 @@
                << " Type: " << format("%x", Type)
                << " Addend: " << format("%x", Addend) << "\n");
 
-  uint32_t Insn = readBytesUnaligned(TargetPtr, 4);
+  Value = evaluateMIPS32Relocation(Section, Offset, Value, Type);
 
-  switch (Type) {
-  default:
-    llvm_unreachable("Not implemented relocation type!");
-    break;
-  case ELF::R_MIPS_32:
-    writeBytesUnaligned(Value, TargetPtr, 4);
-    break;
-  case ELF::R_MIPS_26:
-    Insn &= 0xfc000000;
-    Insn |= (Value & 0x0fffffff) >> 2;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  case ELF::R_MIPS_HI16:
-    // Get the higher 16-bits. Also add 1 if bit 15 is 1.
-    Insn &= 0xffff0000;
-    Insn |= ((Value + 0x8000) >> 16) & 0xffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  case ELF::R_MIPS_LO16:
-    Insn &= 0xffff0000;
-    Insn |= Value & 0xffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  case ELF::R_MIPS_PC32: {
-    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
-    writeBytesUnaligned(Value - FinalAddress, (uint8_t *)TargetPtr, 4);
-    break;
-  }
-  case ELF::R_MIPS_PC16: {
-    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
-    Insn &= 0xffff0000;
-    Insn |= ((Value - FinalAddress) >> 2) & 0xffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  }
-  case ELF::R_MIPS_PC19_S2: {
-    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
-    Insn &= 0xfff80000;
-    Insn |= ((Value - (FinalAddress & ~0x3)) >> 2) & 0x7ffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  }
-  case ELF::R_MIPS_PC21_S2: {
-    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
-    Insn &= 0xffe00000;
-    Insn |= ((Value - FinalAddress) >> 2) & 0x1fffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  }
-  case ELF::R_MIPS_PC26_S2: {
-    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
-    Insn &= 0xfc000000;
-    Insn |= ((Value - FinalAddress) >> 2) & 0x3ffffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  }
-  case ELF::R_MIPS_PCHI16: {
-    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
-    Insn &= 0xffff0000;
-    Insn |= ((Value - FinalAddress + 0x8000) >> 16) & 0xffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  }
-  case ELF::R_MIPS_PCLO16: {
-    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
-    Insn &= 0xffff0000;
-    Insn |= (Value - FinalAddress) & 0xffff;
-    writeBytesUnaligned(Insn, TargetPtr, 4);
-    break;
-  }
-  }
+  applyMIPSRelocation(TargetPtr, Value, Type);
 }
 
 void RuntimeDyldELF::setMipsABI(const ObjectFile &Obj) {
@@ -608,8 +538,8 @@
                                               SID SectionID) {
   int64_t CalculatedValue = evaluateMIPS64Relocation(
       Section, Offset, Value, Type, Addend, SymOffset, SectionID);
-  applyMIPS64Relocation(Section.getAddressWithOffset(Offset), CalculatedValue,
-                        Type);
+  applyMIPSRelocation(Section.getAddressWithOffset(Offset), CalculatedValue,
+                      Type);
 }
 
 void RuntimeDyldELF::resolveMIPSN64Relocation(const SectionEntry &Section,
@@ -639,8 +569,64 @@
                                                CalculatedValue, SymOffset,
                                                SectionID);
   }
-  applyMIPS64Relocation(Section.getAddressWithOffset(Offset), CalculatedValue,
-                        RelType);
+  applyMIPSRelocation(Section.getAddressWithOffset(Offset), CalculatedValue,
+                      RelType);
+}
+
+int64_t RuntimeDyldELF::evaluateMIPS32Relocation(const SectionEntry &Section,
+                                                 uint64_t Offset,
+                                                 uint64_t Value,
+                                                 uint32_t Type) {
+
+  DEBUG(dbgs() << "evaluateMIPS32Relocation, LocalAddress: 0x"
+               << format("%llx", Section.getAddressWithOffset(Offset))
+               << " FinalAddress: 0x"
+               << format("%llx", Section.getLoadAddressWithOffset(Offset))
+               << " Value: 0x" << format("%llx", Value) << " Type: 0x"
+               << format("%x", Type) << "\n");
+
+  switch (Type) {
+  default:
+    llvm_unreachable("Unknown relocation type!");
+    return Value;
+  case ELF::R_MIPS_32:
+    return Value;
+  case ELF::R_MIPS_26:
+    return Value >> 2;
+  case ELF::R_MIPS_HI16:
+    // Get the higher 16-bits. Also add 1 if bit 15 is 1.
+    return (Value + 0x8000) >> 16;
+  case ELF::R_MIPS_LO16:
+    return Value;
+  case ELF::R_MIPS_PC32: {
+    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
+    return Value - FinalAddress;
+  }
+  case ELF::R_MIPS_PC16: {
+    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
+    return (Value - FinalAddress) >> 2;
+  }
+  case ELF::R_MIPS_PC19_S2: {
+    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
+    return (Value - (FinalAddress & ~0x3)) >> 2;
+  }
+  case ELF::R_MIPS_PC21_S2: {
+    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
+    return (Value - FinalAddress) >> 2;
+  }
+  case ELF::R_MIPS_PC26_S2: {
+    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
+    return (Value - FinalAddress) >> 2;
+  }
+  case ELF::R_MIPS_PCHI16: {
+    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
+    return (Value - FinalAddress + 0x8000) >> 16;
+  }
+  case ELF::R_MIPS_PCLO16: {
+    uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
+    return Value - FinalAddress;
+  }
+  }
 }
 
 int64_t
@@ -743,57 +729,54 @@
   return 0;
 }
 
-void RuntimeDyldELF::applyMIPS64Relocation(uint8_t *TargetPtr,
-                                           int64_t CalculatedValue,
-                                           uint32_t Type) {
+void RuntimeDyldELF::applyMIPSRelocation(uint8_t *TargetPtr, int64_t Value,
+                                         uint32_t Type) {
   uint32_t Insn = readBytesUnaligned(TargetPtr, 4);
 
   switch (Type) {
-    default:
-      break;
-    case ELF::R_MIPS_32:
-    case ELF::R_MIPS_GPREL32:
-    case ELF::R_MIPS_PC32:
-      writeBytesUnaligned(CalculatedValue & 0xffffffff, TargetPtr, 4);
-      break;
-    case ELF::R_MIPS_64:
-    case ELF::R_MIPS_SUB:
-      writeBytesUnaligned(CalculatedValue, TargetPtr, 8);
-      break;
-    case ELF::R_MIPS_26:
-    case ELF::R_MIPS_PC26_S2:
-      Insn = (Insn & 0xfc000000) | CalculatedValue;
-      writeBytesUnaligned(Insn, TargetPtr, 4);
-      break;
-    case ELF::R_MIPS_GPREL16:
-      Insn = (Insn & 0xffff0000) | (CalculatedValue & 0xffff);
-      writeBytesUnaligned(Insn, TargetPtr, 4);
-      break;
-    case ELF::R_MIPS_HI16:
-    case ELF::R_MIPS_LO16:
-    case ELF::R_MIPS_PCHI16:
-    case ELF::R_MIPS_PCLO16:
-    case ELF::R_MIPS_PC16:
-    case ELF::R_MIPS_CALL16:
-    case ELF::R_MIPS_GOT_DISP:
-    case ELF::R_MIPS_GOT_PAGE:
-    case ELF::R_MIPS_GOT_OFST:
-      Insn = (Insn & 0xffff0000) | CalculatedValue;
-      writeBytesUnaligned(Insn, TargetPtr, 4);
-      break;
-    case ELF::R_MIPS_PC18_S3:
-      Insn = (Insn & 0xfffc0000) | CalculatedValue;
-      writeBytesUnaligned(Insn, TargetPtr, 4);
-      break;
-    case ELF::R_MIPS_PC19_S2:
-      Insn = (Insn & 0xfff80000) | CalculatedValue;
-      writeBytesUnaligned(Insn, TargetPtr, 4);
-      break;
-    case ELF::R_MIPS_PC21_S2:
-      Insn = (Insn & 0xffe00000) | CalculatedValue;
-      writeBytesUnaligned(Insn, TargetPtr, 4);
-      break;
-    }
+  default:
+    llvm_unreachable("Unknown relocation type!");
+    break;
+  case ELF::R_MIPS_GPREL16:
+  case ELF::R_MIPS_HI16:
+  case ELF::R_MIPS_LO16:
+  case ELF::R_MIPS_PC16:
+  case ELF::R_MIPS_PCHI16:
+  case ELF::R_MIPS_PCLO16:
+  case ELF::R_MIPS_CALL16:
+  case ELF::R_MIPS_GOT_DISP:
+  case ELF::R_MIPS_GOT_PAGE:
+  case ELF::R_MIPS_GOT_OFST:
+    Insn = (Insn & 0xffff0000) | (Value & 0x0000ffff);
+    writeBytesUnaligned(Insn, TargetPtr, 4);
+    break;
+  case ELF::R_MIPS_PC18_S3:
+    Insn = (Insn & 0xfffc0000) | (Value & 0x0003ffff);
+    writeBytesUnaligned(Insn, TargetPtr, 4);
+    break;
+  case ELF::R_MIPS_PC19_S2:
+    Insn = (Insn & 0xfff80000) | (Value & 0x0007ffff);
+    writeBytesUnaligned(Insn, TargetPtr, 4);
+    break;
+  case ELF::R_MIPS_PC21_S2:
+    Insn = (Insn & 0xffe00000) | (Value & 0x001fffff);
+    writeBytesUnaligned(Insn, TargetPtr, 4);
+    break;
+  case ELF::R_MIPS_26:
+  case ELF::R_MIPS_PC26_S2:
+    Insn = (Insn & 0xfc000000) | (Value & 0x03ffffff);
+    writeBytesUnaligned(Insn, TargetPtr, 4);
+    break;
+  case ELF::R_MIPS_32:
+  case ELF::R_MIPS_GPREL32:
+  case ELF::R_MIPS_PC32:
+    writeBytesUnaligned(Value & 0xffffffff, TargetPtr, 4);
+    break;
+  case ELF::R_MIPS_64:
+  case ELF::R_MIPS_SUB:
+    writeBytesUnaligned(Value, TargetPtr, 8);
+    break;
+  }
 }
 
 // Return the .TOC. section and offset.
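To make the evaluate/apply split concrete, here is a small stand-alone sketch of the R_MIPS_HI16 path with made-up numbers; it mirrors evaluateMIPS32Relocation and applyMIPSRelocation above and is not part of the patch.

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Value = 0x12348000;                       // hypothetical target address
      int64_t Hi16 = (Value + 0x8000) >> 16;             // evaluate step: 0x1235
      uint32_t Insn = 0x3c040000;                        // lui $a0, 0 (placeholder)
      Insn = (Insn & 0xffff0000) | (Hi16 & 0x0000ffff);  // apply step: patch the low half
      std::printf("patched insn: 0x%08x\n", Insn);       // prints 0x3c041235
      return 0;
    }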
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 85648eb..b192b32 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -62,13 +62,15 @@
                                 uint64_t Value, uint32_t Type, int64_t Addend,
                                 uint64_t SymOffset, SID SectionID);
 
+  int64_t evaluateMIPS32Relocation(const SectionEntry &Section, uint64_t Offset,
+                                   uint64_t Value, uint32_t Type);
   int64_t evaluateMIPS64Relocation(const SectionEntry &Section,
                                    uint64_t Offset, uint64_t Value,
                                    uint32_t Type,  int64_t Addend,
                                    uint64_t SymOffset, SID SectionID);
 
-  void applyMIPS64Relocation(uint8_t *TargetPtr, int64_t CalculatedValue,
-                             uint32_t Type);
+  void applyMIPSRelocation(uint8_t *TargetPtr, int64_t CalculatedValue,
+                           uint32_t Type);
 
   unsigned getMaxStubSize() override {
     if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be)
diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt
index 0276db5..b7b75a4 100644
--- a/lib/Fuzzer/CMakeLists.txt
+++ b/lib/Fuzzer/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(LIBFUZZER_FLAGS_BASE "${CMAKE_CXX_FLAGS}")
 # Disable the coverage and sanitizer instrumentation for the fuzzer itself.
-set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fno-sanitize=all -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters -Werror")
+set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fno-sanitize=all -fno-sanitize-coverage=trace-pc-guard,edge,trace-cmp,indirect-calls,8bit-counters -Werror")
 if( LLVM_USE_SANITIZE_COVERAGE )
   if(NOT "${LLVM_USE_SANITIZER}" STREQUAL "Address")
     message(FATAL_ERROR
@@ -18,6 +18,7 @@
     FuzzerIOPosix.cpp
     FuzzerIOWindows.cpp
     FuzzerLoop.cpp
+    FuzzerMerge.cpp
     FuzzerMutate.cpp
     FuzzerSHA1.cpp
     FuzzerTracePC.cpp
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index d432fe8..4051bf6 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -219,8 +219,8 @@
   }
 }
 
-static std::string CloneArgsWithoutX(const std::vector<std::string> &Args,
-                                     const char *X1, const char *X2) {
+std::string CloneArgsWithoutX(const std::vector<std::string> &Args,
+                              const char *X1, const char *X2) {
   std::string Cmd;
   for (auto &S : Args) {
     if (FlagValue(S.c_str(), X1) || FlagValue(S.c_str(), X2))
@@ -230,11 +230,6 @@
   return Cmd;
 }
 
-static std::string CloneArgsWithoutX(const std::vector<std::string> &Args,
-                                     const char *X) {
-  return CloneArgsWithoutX(Args, X, X);
-}
-
 static int RunInMultipleProcesses(const std::vector<std::string> &Args,
                                   int NumWorkers, int NumJobs) {
   std::atomic<int> Counter(0);
@@ -499,6 +494,16 @@
     exit(0);
   }
 
+  if (Flags.experimental_merge) {
+    if (Options.MaxLen == 0)
+      F->SetMaxInputLen(kMaxSaneLen);
+    if (Flags.merge_control_file)
+      F->CrashResistantMergeInternalStep(Flags.merge_control_file);
+    else
+      F->CrashResistantMerge(Args, *Inputs);
+    exit(0);
+  }
+
   size_t TemporaryMaxLen = Options.MaxLen ? Options.MaxLen : kMaxSaneLen;
 
   UnitVector InitialCorpus;
diff --git a/lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp b/lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp
index 78d8de7..65b0458 100644
--- a/lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp
+++ b/lib/Fuzzer/FuzzerExtFunctionsDlsym.cpp
@@ -15,6 +15,7 @@
 #if LIBFUZZER_APPLE
 
 #include "FuzzerExtFunctions.h"
+#include "FuzzerIO.h"
 #include <dlfcn.h>
 
 using namespace fuzzer;
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index 681b73b..6a27ca3 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -37,6 +37,9 @@
 FUZZER_FLAG_INT(merge, 0, "If 1, the 2-nd, 3-rd, etc corpora will be "
   "merged into the 1-st corpus. Only interesting units will be taken. "
   "This flag can be used to minimize a corpus.")
+FUZZER_FLAG_INT(experimental_merge, 0, "Experimental crash-resistant merge, "
+                "may eventually replace -merge.")
+FUZZER_FLAG_STRING(merge_control_file, "internal flag")
 FUZZER_FLAG_INT(minimize_crash, 0, "If 1, minimizes the provided"
   " crash input. Use with -runs=N or -max_total_time=N to limit "
   "the number attempts")
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index ad42d7f..e3a1801 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -89,6 +89,9 @@
 
   // Merge Corpora[1:] into Corpora[0].
   void Merge(const std::vector<std::string> &Corpora);
+  void CrashResistantMerge(const std::vector<std::string> &Args,
+                           const std::vector<std::string> &Corpora);
+  void CrashResistantMergeInternalStep(const std::string &ControlFilePath);
   // Returns a subset of 'Extra' that adds coverage to 'Initial'.
   UnitVector FindExtraUnits(const UnitVector &Initial, const UnitVector &Extra);
   MutationDispatcher &GetMD() { return MD; }
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index e5f3af9..3d90401 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -193,6 +193,8 @@
     EpochOfLastReadOfOutputCorpus = GetEpoch(Options.OutputCorpus);
   MaxInputLen = MaxMutationLen = Options.MaxLen;
   AllocateCurrentUnitData();
+  CurrentUnitSize = 0;
+  memset(BaseSha1, 0, sizeof(BaseSha1));
 }
 
 Fuzzer::~Fuzzer() { }
@@ -486,7 +488,9 @@
   ExecuteCallback(Data, Size);
 
   size_t Res = 0;
-  if (size_t NumFeatures = TPC.FinalizeTrace(&Corpus, Size, Options.Shrink))
+  if (size_t NumFeatures = TPC.CollectFeatures([&](size_t Feature) -> bool {
+        return Corpus.AddFeature(Feature, Size, Options.Shrink);
+      }))
     Res = NumFeatures;
 
   if (!TPC.UsingTracePcGuard()) {
diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp
new file mode 100644
index 0000000..d3f1ab1
--- /dev/null
+++ b/lib/Fuzzer/FuzzerMerge.cpp
@@ -0,0 +1,255 @@
+//===- FuzzerMerge.cpp - merging corpora ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Merging corpora.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInternal.h"
+#include "FuzzerIO.h"
+#include "FuzzerMerge.h"
+#include "FuzzerTracePC.h"
+#include "FuzzerUtil.h"
+
+#include <fstream>
+#include <sstream>
+
+namespace fuzzer {
+
+bool Merger::Parse(const std::string &Str, bool ParseCoverage) {
+  std::istringstream SS(Str);
+  return Parse(SS, ParseCoverage);
+}
+
+void Merger::ParseOrExit(std::istream &IS, bool ParseCoverage) {
+  if (!Parse(IS, ParseCoverage)) {
+    Printf("MERGE: failed to parse the control file (unexpected error)\n");
+    exit(1);
+  }
+}
+
+// The control file example:
+//
+// 3 # The number of inputs
+// 1 # The number of inputs in the first corpus, <= the previous number
+// file0
+// file1
+// file2  # One file name per line.
+// STARTED 0 123  # FileID, file size
+// DONE 0 1 4 6 8  # FileID COV1 COV2 ...
+// STARTED 1 456  # If DONE is missing, the input crashed while processing.
+// STARTED 2 567
+// DONE 2 8 9
+bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
+  LastFailure.clear();
+  std::string Line;
+
+  // Parse NumFiles.
+  if (!std::getline(IS, Line, '\n')) return false;
+  std::istringstream L1(Line);
+  size_t NumFiles = 0;
+  L1 >> NumFiles;
+  if (NumFiles == 0 || NumFiles > 10000000) return false;
+
+  // Parse NumFilesInFirstCorpus.
+  if (!std::getline(IS, Line, '\n')) return false;
+  std::istringstream L2(Line);
+  NumFilesInFirstCorpus = NumFiles + 1;
+  L2 >> NumFilesInFirstCorpus;
+  if (NumFilesInFirstCorpus > NumFiles) return false;
+
+  // Parse file names.
+  Files.resize(NumFiles);
+  for (size_t i = 0; i < NumFiles; i++)
+    if (!std::getline(IS, Files[i].Name, '\n'))
+      return false;
+
+  // Parse STARTED and DONE lines.
+  size_t ExpectedStartMarker = 0;
+  const size_t kInvalidStartMarker = -1;
+  size_t LastSeenStartMarker = kInvalidStartMarker;
+  while (std::getline(IS, Line, '\n')) {
+    std::istringstream ISS1(Line);
+    std::string Marker;
+    size_t N;
+    ISS1 >> Marker;
+    ISS1 >> N;
+    if (Marker == "STARTED") {
+      // STARTED FILE_ID FILE_SIZE
+      if (ExpectedStartMarker != N)
+        return false;
+      ISS1 >> Files[ExpectedStartMarker].Size;
+      LastSeenStartMarker = ExpectedStartMarker;
+      assert(ExpectedStartMarker < Files.size());
+      ExpectedStartMarker++;
+    } else if (Marker == "DONE") {
+      // DONE FILE_ID COV1 COV2 COV3 ...
+      size_t CurrentFileIdx = N;
+      if (CurrentFileIdx != LastSeenStartMarker)
+        return false;
+      LastSeenStartMarker = kInvalidStartMarker;
+      if (ParseCoverage) {
+        while (!ISS1.rdstate()) {
+          ISS1 >> std::hex >> N;
+          Files[CurrentFileIdx].Features.insert(N);
+        }
+      }
+    } else {
+      return false;
+    }
+  }
+  if (LastSeenStartMarker != kInvalidStartMarker)
+    LastFailure = Files[LastSeenStartMarker].Name;
+
+  FirstNotProcessedFile = ExpectedStartMarker;
+  return true;
+}
+
+// Decides which files need to be merged (adds those to NewFiles).
+// Returns the number of new features added.
+size_t Merger::Merge(std::vector<std::string> *NewFiles) {
+  NewFiles->clear();
+  assert(NumFilesInFirstCorpus <= Files.size());
+  std::set<size_t> AllFeatures;
+
+  // What features are in the initial corpus?
+  for (size_t i = 0; i < NumFilesInFirstCorpus; i++) {
+    auto &Cur = Files[i].Features;
+    AllFeatures.insert(Cur.begin(), Cur.end());
+  }
+  size_t InitialNumFeatures = AllFeatures.size();
+
+  // Remove all features that we already know from all other inputs.
+  for (size_t i = NumFilesInFirstCorpus; i < Files.size(); i++) {
+    auto &Cur = Files[i].Features;
+    std::set<size_t> Tmp;
+    std::set_difference(Cur.begin(), Cur.end(), AllFeatures.begin(),
+                        AllFeatures.end(), std::inserter(Tmp, Tmp.begin()));
+    Cur.swap(Tmp);
+  }
+
+  // Sort. Give preference to
+  //   * smaller files
+  //   * files with more features.
+  std::sort(Files.begin() + NumFilesInFirstCorpus, Files.end(),
+            [&](const MergeFileInfo &a, const MergeFileInfo &b) -> bool {
+              if (a.Size != b.Size)
+                return a.Size < b.Size;
+              return a.Features.size() > b.Features.size();
+            });
+
+  // One greedy pass: add the file's features to AllFeatures.
+  // If new features were added, add this file to NewFiles.
+  for (size_t i = NumFilesInFirstCorpus; i < Files.size(); i++) {
+    auto &Cur = Files[i].Features;
+    // Printf("%s -> sz %zd ft %zd\n", Files[i].Name.c_str(),
+    //       Files[i].Size, Cur.size());
+    size_t OldSize = AllFeatures.size();
+    AllFeatures.insert(Cur.begin(), Cur.end());
+    if (AllFeatures.size() > OldSize)
+      NewFiles->push_back(Files[i].Name);
+  }
+  return AllFeatures.size() - InitialNumFeatures;
+}
+
+// Inner process. May crash if the target crashes.
+void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) {
+  Printf("MERGE-INNER: using the control file '%s'\n", CFPath.c_str());
+  Merger M;
+  std::ifstream IF(CFPath);
+  M.ParseOrExit(IF, false);
+  IF.close();
+  if (!M.LastFailure.empty())
+    Printf("MERGE-INNER: '%s' caused a failure at the previous merge step\n",
+           M.LastFailure.c_str());
+
+  Printf("MERGE-INNER: %zd total files;"
+         " %zd processed earlier; will process %zd files now\n",
+         M.Files.size(), M.FirstNotProcessedFile,
+         M.Files.size() - M.FirstNotProcessedFile);
+
+  std::ofstream OF(CFPath, std::ofstream::out | std::ofstream::app);
+  for (size_t i = M.FirstNotProcessedFile; i < M.Files.size(); i++) {
+    auto U = FileToVector(M.Files[i].Name);
+    std::ostringstream StartedLine;
+    // Write the pre-run marker.
+    OF << "STARTED " << std::dec << i << " " << U.size() << "\n";
+    OF.flush();  // Flush is important since ExecuteCallback below may crash.
+    // Run.
+    TPC.ResetMaps();
+    ExecuteCallback(U.data(), U.size());
+    // Collect coverage.
+    std::set<size_t> Features;
+    TPC.CollectFeatures([&](size_t Feature) -> bool {
+      Features.insert(Feature);
+      return true;
+    });
+    // Show stats.
+    TotalNumberOfRuns++;
+    if (!(TotalNumberOfRuns & (TotalNumberOfRuns - 1)))
+      PrintStats("pulse ");
+    // Write the post-run marker and the coverage.
+    OF << "DONE " << i;
+    for (size_t F : Features)
+      OF << " " << std::hex << F;
+    OF << "\n";
+  }
+}
+
+// Outer process. Does not call the target code and thus should not fail.
+void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args,
+                                 const std::vector<std::string> &Corpora) {
+  if (Corpora.size() <= 1) {
+    Printf("Merge requires two or more corpus dirs\n");
+    return;
+  }
+  std::vector<std::string> AllFiles;
+  ListFilesInDirRecursive(Corpora[0], nullptr, &AllFiles, /*TopDir*/true);
+  size_t NumFilesInFirstCorpus = AllFiles.size();
+  for (size_t i = 1; i < Corpora.size(); i++)
+    ListFilesInDirRecursive(Corpora[i], nullptr, &AllFiles, /*TopDir*/true);
+  Printf("MERGE-OUTER: %zd files, %zd in the initial corpus\n",
+         AllFiles.size(), NumFilesInFirstCorpus);
+  std::string CFPath =
+      "libFuzzerTemp." + std::to_string(GetPid()) + ".txt";
+  // Write the control file.
+  DeleteFile(CFPath);
+  std::ofstream ControlFile(CFPath);
+  ControlFile << AllFiles.size() << "\n";
+  ControlFile << NumFilesInFirstCorpus << "\n";
+  for (auto &Path: AllFiles)
+    ControlFile << Path << "\n";
+  ControlFile.close();
+
+  // Execute the inner process until it passes.
+  // Every inner process should execute at least one input.
+  std::string BaseCmd = CloneArgsWithoutX(Args, "keep-all-flags");
+  for (size_t i = 1; i <= AllFiles.size(); i++) {
+    Printf("MERGE-OUTER: attempt %zd\n", i);
+    auto ExitCode =
+        ExecuteCommand(BaseCmd + " -merge_control_file=" + CFPath);
+    if (!ExitCode) {
+      Printf("MERGE-OUTER: succesfull in %zd attempt(s)\n", i);
+      break;
+    }
+  }
+  // Read the control file and do the merge.
+  Merger M;
+  std::ifstream IF(CFPath);
+  M.ParseOrExit(IF, true);
+  IF.close();
+  std::vector<std::string> NewFiles;
+  size_t NumNewFeatures = M.Merge(&NewFiles);
+  Printf("MERGE-OUTER: %zd new files with %zd new features added\n",
+         NewFiles.size(), NumNewFeatures);
+  for (auto &F: NewFiles)
+    WriteToOutputCorpus(FileToVector(F));
+  // We are done, delete the control file.
+  DeleteFile(CFPath);
+}
+
+} // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerMerge.h b/lib/Fuzzer/FuzzerMerge.h
new file mode 100644
index 0000000..d00349a
--- /dev/null
+++ b/lib/Fuzzer/FuzzerMerge.h
@@ -0,0 +1,70 @@
+//===- FuzzerMerge.h - merging corpora --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Merging Corpora.
+//
+// The task:
+//   Take the existing corpus (possibly empty) and merge new inputs into
+//   it so that only inputs with new coverage ('features') are added.
+//   The process should tolerate crashes, OOMs, leaks, etc.
+//
+// Algorithm:
+//   The outer process collects the set of files and writes their names
+//   into a temporary "control" file, then repeatedly launches the inner
+//   process until all inputs are processed.
+//   The outer process does not actually execute the target code.
+//
+//   The inner process reads the control file and sees a) the list of all the
+//   inputs and b) the last processed input. Then it starts processing the
+//   inputs one by one. Before processing every input it writes one line to
+//   the control file:
+//   STARTED INPUT_ID INPUT_SIZE
+//   After processing an input it writes another line:
+//   DONE INPUT_ID Feature1 Feature2 Feature3 ...
+//   If a crash happens while processing an input the last line in the control
+//   file will be "STARTED INPUT_ID" and so the next process will know
+//   where to resume.
+//
+//   Once all inputs are processed by the inner process(es) the outer process
+//   reads the control file and does the merge based entirely on its contents.
+//   It uses a single-pass greedy algorithm: prefer the smallest inputs and,
+//   among inputs of the same size, those that add more new features.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FUZZER_MERGE_H
+#define LLVM_FUZZER_MERGE_H
+
+#include "FuzzerDefs.h"
+
+#include <istream>
+#include <set>
+
+namespace fuzzer {
+
+struct MergeFileInfo {
+  std::string Name;
+  size_t Size = 0;
+  std::set<size_t> Features;
+};
+
+struct Merger {
+  std::vector<MergeFileInfo> Files;
+  size_t NumFilesInFirstCorpus = 0;
+  size_t FirstNotProcessedFile = 0;
+  std::string LastFailure;
+
+  bool Parse(std::istream &IS, bool ParseCoverage);
+  bool Parse(const std::string &Str, bool ParseCoverage);
+  void ParseOrExit(std::istream &IS, bool ParseCoverage);
+  size_t Merge(std::vector<std::string> *NewFiles);
+};
+
+}  // namespace fuzzer
+
+#endif  // LLVM_FUZZER_MERGE_H
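A minimal sketch of driving this interface directly (the FuzzerUnittest.cpp changes further below exercise the same path); the control-file string and file names are illustrative.

    #include "FuzzerMerge.h"
    #include <cassert>
    #include <string>
    #include <vector>

    void MergerSketch() {
      fuzzer::Merger M;
      // Two files, the first already in the corpus; "new" contributes features 4 and 5.
      bool Ok = M.Parse("2\n1\nold\nnew\n"
                        "STARTED 0 10\nDONE 0 1 2\n"
                        "STARTED 1 20\nDONE 1 2 4 5\n",
                        /*ParseCoverage=*/true);
      assert(Ok);
      std::vector<std::string> NewFiles;
      size_t NumNewFeatures = M.Merge(&NewFiles);  // NewFiles == {"new"}, 2 new features
      (void)NumNewFeatures;
    }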
diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp
index 8d58a6d..d8036ed 100644
--- a/lib/Fuzzer/FuzzerTracePC.cpp
+++ b/lib/Fuzzer/FuzzerTracePC.cpp
@@ -59,41 +59,6 @@
   Printf("\n");
 }
 
-size_t TracePC::FinalizeTrace(InputCorpus *C, size_t InputSize, bool Shrink) {
-  if (!UsingTracePcGuard()) return 0;
-  size_t Res = 0;
-  const size_t Step = 8;
-  assert(reinterpret_cast<uintptr_t>(Counters) % Step == 0);
-  size_t N = Min(kNumCounters, NumGuards + 1);
-  N = (N + Step - 1) & ~(Step - 1);  // Round up.
-  for (size_t Idx = 0; Idx < N; Idx += Step) {
-    uint64_t Bundle = *reinterpret_cast<uint64_t*>(&Counters[Idx]);
-    if (!Bundle) continue;
-    for (size_t i = Idx; i < Idx + Step; i++) {
-      uint8_t Counter = (Bundle >> (i * 8)) & 0xff;
-      if (!Counter) continue;
-      Counters[i] = 0;
-      unsigned Bit = 0;
-      /**/ if (Counter >= 128) Bit = 7;
-      else if (Counter >= 32) Bit = 6;
-      else if (Counter >= 16) Bit = 5;
-      else if (Counter >= 8) Bit = 4;
-      else if (Counter >= 4) Bit = 3;
-      else if (Counter >= 3) Bit = 2;
-      else if (Counter >= 2) Bit = 1;
-      size_t Feature = (i * 8 + Bit);
-      if (C->AddFeature(Feature, InputSize, Shrink))
-        Res++;
-    }
-  }
-  if (UseValueProfile)
-    ValueProfileMap.ForEach([&](size_t Idx) {
-      if (C->AddFeature(NumGuards + Idx, InputSize, Shrink))
-        Res++;
-    });
-  return Res;
-}
-
 void TracePC::HandleCallerCallee(uintptr_t Caller, uintptr_t Callee) {
   const uintptr_t kBits = 12;
   const uintptr_t kMask = (1 << kBits) - 1;
diff --git a/lib/Fuzzer/FuzzerTracePC.h b/lib/Fuzzer/FuzzerTracePC.h
index 9c7f563..acff27f 100644
--- a/lib/Fuzzer/FuzzerTracePC.h
+++ b/lib/Fuzzer/FuzzerTracePC.h
@@ -56,7 +56,7 @@
   void SetUseCounters(bool UC) { UseCounters = UC; }
   void SetUseValueProfile(bool VP) { UseValueProfile = VP; }
   void SetPrintNewPCs(bool P) { DoPrintNewPCs = P; }
-  size_t FinalizeTrace(InputCorpus *C, size_t InputSize, bool Shrink);
+  template <class Callback> size_t CollectFeatures(Callback CB);
   bool UpdateValueProfileMap(ValueBitMap *MaxValueProfileMap) {
     return UseValueProfile && MaxValueProfileMap->MergeFrom(ValueProfileMap);
   }
@@ -115,6 +115,42 @@
   ValueBitMap ValueProfileMap;
 };
 
+template <class Callback>
+size_t TracePC::CollectFeatures(Callback CB) {
+  if (!UsingTracePcGuard()) return 0;
+  size_t Res = 0;
+  const size_t Step = 8;
+  assert(reinterpret_cast<uintptr_t>(Counters) % Step == 0);
+  size_t N = Min(kNumCounters, NumGuards + 1);
+  N = (N + Step - 1) & ~(Step - 1);  // Round up.
+  for (size_t Idx = 0; Idx < N; Idx += Step) {
+    uint64_t Bundle = *reinterpret_cast<uint64_t*>(&Counters[Idx]);
+    if (!Bundle) continue;
+    for (size_t i = Idx; i < Idx + Step; i++) {
+      uint8_t Counter = (Bundle >> (i * 8)) & 0xff;
+      if (!Counter) continue;
+      Counters[i] = 0;
+      unsigned Bit = 0;
+      /**/ if (Counter >= 128) Bit = 7;
+      else if (Counter >= 32) Bit = 6;
+      else if (Counter >= 16) Bit = 5;
+      else if (Counter >= 8) Bit = 4;
+      else if (Counter >= 4) Bit = 3;
+      else if (Counter >= 3) Bit = 2;
+      else if (Counter >= 2) Bit = 1;
+      size_t Feature = (i * 8 + Bit);
+      if (CB(Feature))
+        Res++;
+    }
+  }
+  if (UseValueProfile)
+    ValueProfileMap.ForEach([&](size_t Idx) {
+      if (CB(NumGuards + Idx))
+        Res++;
+    });
+  return Res;
+}
+
 extern TracePC TPC;
 
 }  // namespace fuzzer
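One more usage sketch of the new template (FuzzerLoop.cpp and FuzzerMerge.cpp above are the real call sites); it assumes FuzzerTracePC.h and <set> are included.

    static std::set<size_t> Seen;
    static size_t CountNewFeatures() {
      return fuzzer::TPC.CollectFeatures([](size_t Feature) -> bool {
        return Seen.insert(Feature).second;  // counted only the first time it is seen
      });
    }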
diff --git a/lib/Fuzzer/FuzzerUtil.h b/lib/Fuzzer/FuzzerUtil.h
index c9de11f..eb42537 100644
--- a/lib/Fuzzer/FuzzerUtil.h
+++ b/lib/Fuzzer/FuzzerUtil.h
@@ -66,5 +66,13 @@
 const void *SearchMemory(const void *haystack, size_t haystacklen,
                          const void *needle, size_t needlelen);
 
+std::string CloneArgsWithoutX(const std::vector<std::string> &Args,
+                              const char *X1, const char *X2);
+
+inline std::string CloneArgsWithoutX(const std::vector<std::string> &Args,
+                                     const char *X) {
+  return CloneArgsWithoutX(Args, X, X);
+}
+
 }  // namespace fuzzer
 #endif  // LLVM_FUZZER_UTIL_H
diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp
index 79b8c28..aab3262 100644
--- a/lib/Fuzzer/test/FuzzerUnittest.cpp
+++ b/lib/Fuzzer/test/FuzzerUnittest.cpp
@@ -8,6 +8,7 @@
 #include "FuzzerCorpus.h"
 #include "FuzzerInternal.h"
 #include "FuzzerDictionary.h"
+#include "FuzzerMerge.h"
 #include "FuzzerMutate.h"
 #include "FuzzerRandom.h"
 #include "gtest/gtest.h"
@@ -598,3 +599,137 @@
     EXPECT_GT(Hist[i], TriesPerUnit / N / 3);
   }
 }
+
+TEST(Merge, Bad) {
+  const char *kInvalidInputs[] = {
+    "",
+    "x",
+    "3\nx",
+    "2\n3",
+    "2\n2",
+    "2\n2\nA\n",
+    "2\n2\nA\nB\nC\n",
+    "0\n0\n",
+    "1\n1\nA\nDONE 0",
+    "1\n1\nA\nSTARTED 1",
+  };
+  Merger M;
+  for (auto S : kInvalidInputs) {
+    // fprintf(stderr, "TESTING:\n%s\n", S);
+    EXPECT_FALSE(M.Parse(S, false));
+  }
+}
+
+void EQ(const std::set<size_t> &A, const std::set<size_t> &B) {
+  EXPECT_EQ(A, B);
+}
+
+void EQ(const std::vector<std::string> &A, const std::vector<std::string> &B) {
+  std::set<std::string> a(A.begin(), A.end());
+  std::set<std::string> b(B.begin(), B.end());
+  EXPECT_EQ(a, b);
+}
+
+static void Merge(const std::string &Input,
+                  const std::vector<std::string> Result,
+                  size_t NumNewFeatures) {
+  Merger M;
+  std::vector<std::string> NewFiles;
+  EXPECT_TRUE(M.Parse(Input, true));
+  EXPECT_EQ(NumNewFeatures, M.Merge(&NewFiles));
+  EQ(NewFiles, Result);
+}
+
+TEST(Merge, Good) {
+  Merger M;
+
+  EXPECT_TRUE(M.Parse("1\n0\nAA\n", false));
+  EXPECT_EQ(M.Files.size(), 1U);
+  EXPECT_EQ(M.NumFilesInFirstCorpus, 0U);
+  EXPECT_EQ(M.Files[0].Name, "AA");
+  EXPECT_TRUE(M.LastFailure.empty());
+  EXPECT_EQ(M.FirstNotProcessedFile, 0U);
+
+  EXPECT_TRUE(M.Parse("2\n1\nAA\nBB\nSTARTED 0 42\n", false));
+  EXPECT_EQ(M.Files.size(), 2U);
+  EXPECT_EQ(M.NumFilesInFirstCorpus, 1U);
+  EXPECT_EQ(M.Files[0].Name, "AA");
+  EXPECT_EQ(M.Files[1].Name, "BB");
+  EXPECT_EQ(M.LastFailure, "AA");
+  EXPECT_EQ(M.FirstNotProcessedFile, 1U);
+
+  EXPECT_TRUE(M.Parse("3\n1\nAA\nBB\nC\n"
+                        "STARTED 0 1000\n"
+                        "DONE 0 1 2 3\n"
+                        "STARTED 1 1001\n"
+                        "DONE 1 4 5 6 \n"
+                        "STARTED 2 1002\n"
+                        "", true));
+  EXPECT_EQ(M.Files.size(), 3U);
+  EXPECT_EQ(M.NumFilesInFirstCorpus, 1U);
+  EXPECT_EQ(M.Files[0].Name, "AA");
+  EXPECT_EQ(M.Files[0].Size, 1000U);
+  EXPECT_EQ(M.Files[1].Name, "BB");
+  EXPECT_EQ(M.Files[1].Size, 1001U);
+  EXPECT_EQ(M.Files[2].Name, "C");
+  EXPECT_EQ(M.Files[2].Size, 1002U);
+  EXPECT_EQ(M.LastFailure, "C");
+  EXPECT_EQ(M.FirstNotProcessedFile, 3U);
+  EQ(M.Files[0].Features, {1, 2, 3});
+  EQ(M.Files[1].Features, {4, 5, 6});
+
+
+  std::vector<std::string> NewFiles;
+
+  EXPECT_TRUE(M.Parse("3\n2\nAA\nBB\nC\n"
+                        "STARTED 0 1000\nDONE 0 1 2 3\n"
+                        "STARTED 1 1001\nDONE 1 4 5 6 \n"
+                        "STARTED 2 1002\nDONE 2 6 1 3 \n"
+                        "", true));
+  EXPECT_EQ(M.Files.size(), 3U);
+  EXPECT_EQ(M.NumFilesInFirstCorpus, 2U);
+  EXPECT_TRUE(M.LastFailure.empty());
+  EXPECT_EQ(M.FirstNotProcessedFile, 3U);
+  EQ(M.Files[0].Features, {1, 2, 3});
+  EQ(M.Files[1].Features, {4, 5, 6});
+  EQ(M.Files[2].Features, {1, 3, 6});
+  EXPECT_EQ(0U, M.Merge(&NewFiles));
+  EQ(NewFiles, {});
+
+  EXPECT_TRUE(M.Parse("3\n1\nA\nB\nC\n"
+                        "STARTED 0 1000\nDONE 0 1 2 3\n"
+                        "STARTED 1 1001\nDONE 1 4 5 6 \n"
+                        "STARTED 2 1002\nDONE 2 6 1 3 \n"
+                        "", true));
+  EXPECT_EQ(3U, M.Merge(&NewFiles));
+  EQ(NewFiles, {"B"});
+}
+
+TEST(Merge, Merge) {
+
+  Merge("3\n1\nA\nB\nC\n"
+        "STARTED 0 1000\nDONE 0 1 2 3\n"
+        "STARTED 1 1001\nDONE 1 4 5 6 \n"
+        "STARTED 2 1002\nDONE 2 6 1 3 \n",
+        {"B"}, 3);
+
+  Merge("3\n0\nA\nB\nC\n"
+        "STARTED 0 2000\nDONE 0 1 2 3\n"
+        "STARTED 1 1001\nDONE 1 4 5 6 \n"
+        "STARTED 2 1002\nDONE 2 6 1 3 \n",
+        {"A", "B", "C"}, 6);
+
+  Merge("4\n0\nA\nB\nC\nD\n"
+        "STARTED 0 2000\nDONE 0 1 2 3\n"
+        "STARTED 1 1101\nDONE 1 4 5 6 \n"
+        "STARTED 2 1102\nDONE 2 6 1 3 100 \n"
+        "STARTED 3 1000\nDONE 3 1  \n",
+        {"A", "B", "C", "D"}, 7);
+
+  Merge("4\n1\nA\nB\nC\nD\n"
+        "STARTED 0 2000\nDONE 0 4 5 6 7 8\n"
+        "STARTED 1 1100\nDONE 1 1 2 3 \n"
+        "STARTED 2 1100\nDONE 2 2 3 \n"
+        "STARTED 3 1000\nDONE 3 1  \n",
+        {"B", "D"}, 3);
+}
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 1341937..d0d2710 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -16,17 +16,22 @@
 #ifndef LLVM_LIB_IR_ATTRIBUTEIMPL_H
 #define LLVM_LIB_IR_ATTRIBUTEIMPL_H
 
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/IR/Attributes.h"
 #include "AttributeSetNode.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/Support/TrailingObjects.h"
+#include <algorithm>
+#include <cassert>
 #include <climits>
+#include <cstddef>
+#include <cstdint>
 #include <string>
+#include <utility>
 
 namespace llvm {
 
-class Constant;
 class LLVMContext;
 
 //===----------------------------------------------------------------------===//
@@ -36,10 +41,6 @@
 class AttributeImpl : public FoldingSetNode {
   unsigned char KindID; ///< Holds the AttrEntryKind of the attribute
 
-  // AttributesImpl is uniqued, these should not be publicly available.
-  void operator=(const AttributeImpl &) = delete;
-  AttributeImpl(const AttributeImpl &) = delete;
-
 protected:
   enum AttrEntryKind {
     EnumAttrEntry,
@@ -50,6 +51,10 @@
   AttributeImpl(AttrEntryKind KindID) : KindID(KindID) {}
 
 public:
+  // AttributeImpl is uniqued, these should not be available.
+  AttributeImpl(const AttributeImpl &) = delete;
+  AttributeImpl &operator=(const AttributeImpl &) = delete;
+
   virtual ~AttributeImpl();
 
   bool isEnumAttribute() const { return KindID == EnumAttrEntry; }
@@ -165,12 +170,9 @@
     return getTrailingObjects<IndexAttrPair>() + Slot;
   }
 
-  // AttributesSet is uniqued, these should not be publicly available.
-  void operator=(const AttributeSetImpl &) = delete;
-  AttributeSetImpl(const AttributeSetImpl &) = delete;
 public:
   AttributeSetImpl(LLVMContext &C,
-                   ArrayRef<std::pair<unsigned, AttributeSetNode *> > Slots)
+                   ArrayRef<std::pair<unsigned, AttributeSetNode *>> Slots)
       : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) {
     static_assert(Attribute::EndAttrKinds <=
                       sizeof(AvailableFunctionAttrs) * CHAR_BIT,
@@ -203,6 +205,10 @@
     }
   }
 
+  // AttributeSetImpl is uniqued, these should not be available.
+  AttributeSetImpl(const AttributeSetImpl &) = delete;
+  AttributeSetImpl &operator=(const AttributeSetImpl &) = delete;
+
   void operator delete(void *p) { ::operator delete(p); }
 
   /// \brief Get the context that created this AttributeSetImpl.
@@ -248,16 +254,16 @@
     Profile(ID, makeArrayRef(getNode(0), getNumSlots()));
   }
   static void Profile(FoldingSetNodeID &ID,
-                      ArrayRef<std::pair<unsigned, AttributeSetNode*> > Nodes) {
-    for (unsigned i = 0, e = Nodes.size(); i != e; ++i) {
-      ID.AddInteger(Nodes[i].first);
-      ID.AddPointer(Nodes[i].second);
+                      ArrayRef<std::pair<unsigned, AttributeSetNode*>> Nodes) {
+    for (const auto &Node : Nodes) {
+      ID.AddInteger(Node.first);
+      ID.AddPointer(Node.second);
     }
   }
 
   void dump() const;
 };
 
-} // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_IR_ATTRIBUTEIMPL_H
diff --git a/lib/IR/AttributeSetNode.h b/lib/IR/AttributeSetNode.h
index fab1ed5..23ce371 100644
--- a/lib/IR/AttributeSetNode.h
+++ b/lib/IR/AttributeSetNode.h
@@ -15,10 +15,17 @@
 #ifndef LLVM_IR_ATTRIBUTESETNODE_H
 #define LLVM_IR_ATTRIBUTESETNODE_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/Support/TrailingObjects.h"
+#include <algorithm>
 #include <climits>
+#include <cstdint>
+#include <string>
+#include <utility>
 
 namespace llvm {
 
@@ -49,10 +56,11 @@
     }
   }
 
-  // AttributesSetNode is uniqued, these should not be publicly available.
-  void operator=(const AttributeSetNode &) = delete;
-  AttributeSetNode(const AttributeSetNode &) = delete;
 public:
+  // AttributeSetNode is uniqued, these should not be available.
+  AttributeSetNode(const AttributeSetNode &) = delete;
+  AttributeSetNode &operator=(const AttributeSetNode &) = delete;
+
   void operator delete(void *p) { ::operator delete(p); }
 
   static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
@@ -88,11 +96,11 @@
     Profile(ID, makeArrayRef(begin(), end()));
   }
   static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
-    for (unsigned I = 0, E = AttrList.size(); I != E; ++I)
-      AttrList[I].Profile(ID);
+    for (const auto &Attr : AttrList)
+      Attr.Profile(ID);
   }
 };
 
-} // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_ATTRIBUTESETNODE_H
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index 7db87ed..eda751d 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -1,4 +1,4 @@
-//===-- ConstantsContext.h - Constants-related Context Interals -----------===//
+//===-- ConstantsContext.h - Constants-related Context Internals -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,14 +15,26 @@
 #ifndef LLVM_LIB_IR_CONSTANTSCONTEXT_H
 #define LLVM_LIB_IR_CONSTANTSCONTEXT_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
 
 #define DEBUG_TYPE "ir"
 
@@ -32,16 +44,20 @@
 /// behind the scenes to implement unary constant exprs.
 class UnaryConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly one operand
-  void *operator new(size_t s) {
-    return User::operator new(s, 1);
-  }
   UnaryConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
     : ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
     Op<0>() = C;
   }
+
+  // allocate space for exactly one operand
+  void *operator new(size_t s) {
+    return User::operator new(s, 1);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 };
 
@@ -49,12 +65,8 @@
 /// behind the scenes to implement binary constant exprs.
 class BinaryConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly two operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 2);
-  }
   BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2,
                      unsigned Flags)
     : ConstantExpr(C1->getType(), Opcode, &Op<0>(), 2) {
@@ -62,6 +74,14 @@
     Op<1>() = C2;
     SubclassOptionalData = Flags;
   }
+
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 };
@@ -70,18 +90,22 @@
 /// behind the scenes to implement select constant exprs.
 class SelectConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly three operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 3);
-  }
   SelectConstantExpr(Constant *C1, Constant *C2, Constant *C3)
     : ConstantExpr(C2->getType(), Instruction::Select, &Op<0>(), 3) {
     Op<0>() = C1;
     Op<1>() = C2;
     Op<2>() = C3;
   }
+
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 };
@@ -91,18 +115,22 @@
 /// extractelement constant exprs.
 class ExtractElementConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly two operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 2);
-  }
   ExtractElementConstantExpr(Constant *C1, Constant *C2)
     : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
                    Instruction::ExtractElement, &Op<0>(), 2) {
     Op<0>() = C1;
     Op<1>() = C2;
   }
+
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 };
@@ -112,12 +140,8 @@
 /// insertelement constant exprs.
 class InsertElementConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly three operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 3);
-  }
   InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
     : ConstantExpr(C1->getType(), Instruction::InsertElement,
                    &Op<0>(), 3) {
@@ -125,6 +149,14 @@
     Op<1>() = C2;
     Op<2>() = C3;
   }
+
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 };
@@ -134,12 +166,8 @@
 /// shufflevector constant exprs.
 class ShuffleVectorConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly three operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 3);
-  }
   ShuffleVectorConstantExpr(Constant *C1, Constant *C2, Constant *C3)
   : ConstantExpr(VectorType::get(
                    cast<VectorType>(C1->getType())->getElementType(),
@@ -150,6 +178,14 @@
     Op<1>() = C2;
     Op<2>() = C3;
   }
+
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 };
@@ -159,12 +195,8 @@
 /// extractvalue constant exprs.
 class ExtractValueConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly one operand
-  void *operator new(size_t s) {
-    return User::operator new(s, 1);
-  }
   ExtractValueConstantExpr(Constant *Agg, ArrayRef<unsigned> IdxList,
                            Type *DestTy)
       : ConstantExpr(DestTy, Instruction::ExtractValue, &Op<0>(), 1),
@@ -172,6 +204,13 @@
     Op<0>() = Agg;
   }
 
+  // allocate space for exactly one operand
+  void *operator new(size_t s) {
+    return User::operator new(s, 1);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Indices - These identify which value to extract.
   const SmallVector<unsigned, 4> Indices;
 
@@ -191,12 +230,8 @@
 /// insertvalue constant exprs.
 class InsertValueConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly one operand
-  void *operator new(size_t s) {
-    return User::operator new(s, 2);
-  }
   InsertValueConstantExpr(Constant *Agg, Constant *Val,
                           ArrayRef<unsigned> IdxList, Type *DestTy)
       : ConstantExpr(DestTy, Instruction::InsertValue, &Op<0>(), 2),
@@ -205,6 +240,13 @@
     Op<1>() = Val;
   }
 
+  // allocate space for exactly one operand
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Indices - These identify the position for the insertion.
   const SmallVector<unsigned, 4> Indices;
 
@@ -224,10 +266,12 @@
 class GetElementPtrConstantExpr : public ConstantExpr {
   Type *SrcElementTy;
   Type *ResElementTy;
-  void anchor() override;
+
   GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C,
                             ArrayRef<Constant *> IdxList, Type *DestTy);
 
+  void anchor() override;
+
 public:
   static GetElementPtrConstantExpr *Create(Type *SrcElementTy, Constant *C,
                                            ArrayRef<Constant *> IdxList,
@@ -237,8 +281,10 @@
     Result->SubclassOptionalData = Flags;
     return Result;
   }
+
   Type *getSourceElementType() const;
   Type *getResultElementType() const;
+
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 
@@ -255,12 +301,8 @@
 // needed in order to store the predicate value for these instructions.
 class CompareConstantExpr : public ConstantExpr {
   void anchor() override;
-  void *operator new(size_t, unsigned) = delete;
+
 public:
-  // allocate space for exactly two operands
-  void *operator new(size_t s) {
-    return User::operator new(s, 2);
-  }
   unsigned short predicate;
   CompareConstantExpr(Type *ty, Instruction::OtherOps opc,
                       unsigned short pred,  Constant* LHS, Constant* RHS)
@@ -268,6 +310,14 @@
     Op<0>() = LHS;
     Op<1>() = RHS;
   }
+
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+
+  void *operator new(size_t, unsigned) = delete;
+
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 
@@ -373,6 +423,7 @@
   bool operator==(const ConstantAggrKeyType &X) const {
     return Operands == X.Operands;
   }
+
   bool operator==(const ConstantClass *C) const {
     if (Operands.size() != C->getNumOperands())
       return false;
@@ -381,6 +432,7 @@
         return false;
     return true;
   }
+
   unsigned getHash() const {
     return hash_combine_range(Operands.begin(), Operands.end());
   }
@@ -416,6 +468,7 @@
            AsmString == X.AsmString && Constraints == X.Constraints &&
            FTy == X.FTy;
   }
+
   bool operator==(const InlineAsm *Asm) const {
     return HasSideEffects == Asm->hasSideEffects() &&
            IsAlignStack == Asm->isAlignStack() &&
@@ -424,6 +477,7 @@
            Constraints == Asm->getConstraintString() &&
            FTy == Asm->getFunctionType();
   }
+
   unsigned getHash() const {
     return hash_combine(AsmString, Constraints, HasSideEffects, IsAlignStack,
                         AsmDialect, FTy);
@@ -553,22 +607,28 @@
     static inline ConstantClass *getEmptyKey() {
       return ConstantClassInfo::getEmptyKey();
     }
+
     static inline ConstantClass *getTombstoneKey() {
       return ConstantClassInfo::getTombstoneKey();
     }
+
     static unsigned getHashValue(const ConstantClass *CP) {
       SmallVector<Constant *, 32> Storage;
       return getHashValue(LookupKey(CP->getType(), ValType(CP, Storage)));
     }
+
     static bool isEqual(const ConstantClass *LHS, const ConstantClass *RHS) {
       return LHS == RHS;
     }
+
     static unsigned getHashValue(const LookupKey &Val) {
       return hash_combine(Val.first, Val.second.getHash());
     }
+
     static unsigned getHashValue(const LookupKeyHashed &Val) {
       return Val.first;
     }
+
     static bool isEqual(const LookupKey &LHS, const ConstantClass *RHS) {
       if (RHS == getEmptyKey() || RHS == getTombstoneKey())
         return false;
@@ -576,6 +636,7 @@
         return false;
       return LHS.second == RHS;
     }
+
     static bool isEqual(const LookupKeyHashed &LHS, const ConstantClass *RHS) {
       return isEqual(LHS.second, RHS);
     }
@@ -595,6 +656,7 @@
     for (auto &I : Map)
       delete I; // Asserts that use_empty().
   }
+
 private:
   ConstantClass *create(TypeClass *Ty, ValType V, LookupKeyHashed &HashKey) {
     ConstantClass *Result = V.create(Ty);
@@ -665,4 +727,4 @@
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_IR_CONSTANTSCONTEXT_H
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index aa67da2..650a255 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -616,9 +616,9 @@
   return createExpression(Addr);
 }
 
-DIExpression *DIBuilder::createBitPieceExpression(unsigned OffsetInBytes,
+DIExpression *DIBuilder::createFragmentExpression(unsigned OffsetInBytes,
                                                   unsigned SizeInBytes) {
-  uint64_t Addr[] = {dwarf::DW_OP_bit_piece, OffsetInBytes, SizeInBytes};
+  uint64_t Addr[] = {dwarf::DW_OP_LLVM_fragment, OffsetInBytes, SizeInBytes};
   return DIExpression::get(VMContext, Addr);
 }
 
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index 278eb89..fe61c27 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -559,7 +559,7 @@
 
 unsigned DIExpression::ExprOperand::getSize() const {
   switch (getOp()) {
-  case dwarf::DW_OP_bit_piece:
+  case dwarf::DW_OP_LLVM_fragment:
     return 3;
   case dwarf::DW_OP_constu:
   case dwarf::DW_OP_plus:
@@ -580,9 +580,9 @@
     switch (I->getOp()) {
     default:
       return false;
-    case dwarf::DW_OP_bit_piece:
+    case dwarf::DW_OP_LLVM_fragment:
     case dwarf::DW_OP_stack_value:
-      // We only support bit piece and stack value expressions which appear at
+      // We only support fragment and stack value expressions which appear at
       // the end.
       return I->get() + I->getSize() == E->get();
     case dwarf::DW_OP_constu:
@@ -595,21 +595,21 @@
   return true;
 }
 
-bool DIExpression::isBitPiece() const {
+bool DIExpression::isFragment() const {
   assert(isValid() && "Expected valid expression");
   if (unsigned N = getNumElements())
     if (N >= 3)
-      return getElement(N - 3) == dwarf::DW_OP_bit_piece;
+      return getElement(N - 3) == dwarf::DW_OP_LLVM_fragment;
   return false;
 }
 
-uint64_t DIExpression::getBitPieceOffset() const {
-  assert(isBitPiece() && "Expected bit piece");
+uint64_t DIExpression::getFragmentOffsetInBits() const {
+  assert(isFragment() && "Expected fragment");
   return getElement(getNumElements() - 2);
 }
 
-uint64_t DIExpression::getBitPieceSize() const {
-  assert(isBitPiece() && "Expected bit piece");
+uint64_t DIExpression::getFragmentSizeInBits() const {
+  assert(isFragment() && "Expected fragment");
   return getElement(getNumElements() - 1);
 }
 
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index f8ac37f..7cad629 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
@@ -222,6 +223,26 @@
   return nullptr;
 }
 
+bool GlobalValue::isAbsoluteSymbolRef() const {
+  auto *GO = dyn_cast<GlobalObject>(this);
+  if (!GO)
+    return false;
+
+  return GO->getMetadata(LLVMContext::MD_absolute_symbol);
+}
+
+Optional<ConstantRange> GlobalValue::getAbsoluteSymbolRange() const {
+  auto *GO = dyn_cast<GlobalObject>(this);
+  if (!GO)
+    return None;
+
+  MDNode *MD = GO->getMetadata(LLVMContext::MD_absolute_symbol);
+  if (!MD)
+    return None;
+
+  return getConstantRangeFromMetadata(*MD);
+}
+
 //===----------------------------------------------------------------------===//
 // GlobalVariable Implementation
 //===----------------------------------------------------------------------===//
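A hypothetical caller of the two new accessors; GV and reportRange are stand-ins, not part of this patch.

    if (GV.isAbsoluteSymbolRef())
      if (Optional<ConstantRange> R = GV.getAbsoluteSymbolRange())
        reportRange(R->getLower(), R->getUpper());  // APInt bounds from !absolute_symbol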
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index 94934b3..dd66f14 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -35,113 +35,36 @@
 LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
   // Create the fixed metadata kinds. This is done in the same order as the
   // MD_* enum values so that they correspond.
+  std::pair<unsigned, StringRef> MDKinds[] = {
+    {MD_dbg, "dbg"},
+    {MD_tbaa, "tbaa"},
+    {MD_prof, "prof"},
+    {MD_fpmath, "fpmath"},
+    {MD_range, "range"},
+    {MD_tbaa_struct, "tbaa.struct"},
+    {MD_invariant_load, "invariant.load"},
+    {MD_alias_scope, "alias.scope"},
+    {MD_noalias, "noalias"},
+    {MD_nontemporal, "nontemporal"},
+    {MD_mem_parallel_loop_access, "llvm.mem.parallel_loop_access"},
+    {MD_nonnull, "nonnull"},
+    {MD_dereferenceable, "dereferenceable"},
+    {MD_dereferenceable_or_null, "dereferenceable_or_null"},
+    {MD_make_implicit, "make.implicit"},
+    {MD_unpredictable, "unpredictable"},
+    {MD_invariant_group, "invariant.group"},
+    {MD_align, "align"},
+    {MD_loop, "llvm.loop"},
+    {MD_type, "type"},
+    {MD_section_prefix, "section_prefix"},
+    {MD_absolute_symbol, "absolute_symbol"},
+  };
 
-  // Create the 'dbg' metadata kind.
-  unsigned DbgID = getMDKindID("dbg");
-  assert(DbgID == MD_dbg && "dbg kind id drifted"); (void)DbgID;
-
-  // Create the 'tbaa' metadata kind.
-  unsigned TBAAID = getMDKindID("tbaa");
-  assert(TBAAID == MD_tbaa && "tbaa kind id drifted"); (void)TBAAID;
-
-  // Create the 'prof' metadata kind.
-  unsigned ProfID = getMDKindID("prof");
-  assert(ProfID == MD_prof && "prof kind id drifted"); (void)ProfID;
-
-  // Create the 'fpmath' metadata kind.
-  unsigned FPAccuracyID = getMDKindID("fpmath");
-  assert(FPAccuracyID == MD_fpmath && "fpmath kind id drifted");
-  (void)FPAccuracyID;
-
-  // Create the 'range' metadata kind.
-  unsigned RangeID = getMDKindID("range");
-  assert(RangeID == MD_range && "range kind id drifted");
-  (void)RangeID;
-
-  // Create the 'tbaa.struct' metadata kind.
-  unsigned TBAAStructID = getMDKindID("tbaa.struct");
-  assert(TBAAStructID == MD_tbaa_struct && "tbaa.struct kind id drifted");
-  (void)TBAAStructID;
-
-  // Create the 'invariant.load' metadata kind.
-  unsigned InvariantLdId = getMDKindID("invariant.load");
-  assert(InvariantLdId == MD_invariant_load && "invariant.load kind id drifted");
-  (void)InvariantLdId;
-
-  // Create the 'alias.scope' metadata kind.
-  unsigned AliasScopeID = getMDKindID("alias.scope");
-  assert(AliasScopeID == MD_alias_scope && "alias.scope kind id drifted");
-  (void)AliasScopeID;
-
-  // Create the 'noalias' metadata kind.
-  unsigned NoAliasID = getMDKindID("noalias");
-  assert(NoAliasID == MD_noalias && "noalias kind id drifted");
-  (void)NoAliasID;
-
-  // Create the 'nontemporal' metadata kind.
-  unsigned NonTemporalID = getMDKindID("nontemporal");
-  assert(NonTemporalID == MD_nontemporal && "nontemporal kind id drifted");
-  (void)NonTemporalID;
-
-  // Create the 'llvm.mem.parallel_loop_access' metadata kind.
-  unsigned MemParallelLoopAccessID = getMDKindID("llvm.mem.parallel_loop_access");
-  assert(MemParallelLoopAccessID == MD_mem_parallel_loop_access &&
-         "mem_parallel_loop_access kind id drifted");
-  (void)MemParallelLoopAccessID;
-
-  // Create the 'nonnull' metadata kind.
-  unsigned NonNullID = getMDKindID("nonnull");
-  assert(NonNullID == MD_nonnull && "nonnull kind id drifted");
-  (void)NonNullID;
-  
-  // Create the 'dereferenceable' metadata kind.
-  unsigned DereferenceableID = getMDKindID("dereferenceable");
-  assert(DereferenceableID == MD_dereferenceable && 
-         "dereferenceable kind id drifted");
-  (void)DereferenceableID;
-  
-  // Create the 'dereferenceable_or_null' metadata kind.
-  unsigned DereferenceableOrNullID = getMDKindID("dereferenceable_or_null");
-  assert(DereferenceableOrNullID == MD_dereferenceable_or_null && 
-         "dereferenceable_or_null kind id drifted");
-  (void)DereferenceableOrNullID;
-
-  // Create the 'make.implicit' metadata kind.
-  unsigned MakeImplicitID = getMDKindID("make.implicit");
-  assert(MakeImplicitID == MD_make_implicit &&
-         "make.implicit kind id drifted");
-  (void)MakeImplicitID;
-
-  // Create the 'unpredictable' metadata kind.
-  unsigned UnpredictableID = getMDKindID("unpredictable");
-  assert(UnpredictableID == MD_unpredictable &&
-         "unpredictable kind id drifted");
-  (void)UnpredictableID;
-
-  // Create the 'invariant.group' metadata kind.
-  unsigned InvariantGroupId = getMDKindID("invariant.group");
-  assert(InvariantGroupId == MD_invariant_group &&
-         "invariant.group kind id drifted");
-  (void)InvariantGroupId;
-
-  // Create the 'align' metadata kind.
-  unsigned AlignID = getMDKindID("align");
-  assert(AlignID == MD_align && "align kind id drifted");
-  (void)AlignID;
-
-  // Create the 'llvm.loop' metadata kind.
-  unsigned LoopID = getMDKindID("llvm.loop");
-  assert(LoopID == MD_loop && "llvm.loop kind id drifted");
-  (void)LoopID;
-
-  unsigned TypeID = getMDKindID("type");
-  assert(TypeID == MD_type && "type kind id drifted");
-  (void)TypeID;
-
-  unsigned SectionPrefixID = getMDKindID("section_prefix");
-  assert(SectionPrefixID == MD_section_prefix &&
-         "section_prefix kind id drifted");
-  (void)SectionPrefixID;
+  for (auto &MDKind : MDKinds) {
+    unsigned ID = getMDKindID(MDKind.second);
+    assert(ID == MDKind.first && "metadata kind id drifted");
+    (void)ID;
+  }
 
   auto *DeoptEntry = pImpl->getOrInsertBundleTag("deopt");
   assert(DeoptEntry->second == LLVMContext::OB_deopt &&
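
For context, a hedged sketch of why the table above must stay in enum order: the fixed MD_* kind IDs let client code attach metadata without a string lookup. The helper below is illustrative only.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

// Tag an instruction with empty !nonnull metadata via the fixed kind ID
// (meaningful on pointer loads; used here purely as an example).
static void tagNonNull(llvm::Instruction &I) {
  I.setMetadata(llvm::LLVMContext::MD_nonnull,
                llvm::MDNode::get(I.getContext(), {}));
}
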
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 451cb09..91a999b 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -367,7 +367,7 @@
 }
 #endif // NDEBUG
 
-void Value::replaceAllUsesWith(Value *New) {
+void Value::doRAUW(Value *New, bool NoMetadata) {
   assert(New && "Value::replaceAllUsesWith(<null>) is invalid!");
   assert(!contains(New, this) &&
          "this->replaceAllUsesWith(expr(this)) is NOT valid!");
@@ -377,7 +377,7 @@
   // Notify all ValueHandles (if present) that this value is going away.
   if (HasValueHandle)
     ValueHandleBase::ValueIsRAUWd(this, New);
-  if (isUsedByMetadata())
+  if (!NoMetadata && isUsedByMetadata())
     ValueAsMetadata::handleRAUW(this, New);
 
   while (!use_empty()) {
@@ -398,6 +398,14 @@
     BB->replaceSuccessorsPhiUsesWith(cast<BasicBlock>(New));
 }
 
+void Value::replaceAllUsesWith(Value *New) {
+  doRAUW(New, false /* NoMetadata */);
+}
+
+void Value::replaceNonMetadataUsesWith(Value *New) {
+  doRAUW(New, true /* NoMetadata */);
+}
+
 // Like replaceAllUsesWith except it does not handle constants or basic blocks.
 // This routine leaves uses within BB.
 void Value::replaceUsesOutsideBlock(Value *New, BasicBlock *BB) {
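
A short, hedged usage note for the new replaceNonMetadataUsesWith entry point above: it rewrites IR uses but skips the ValueAsMetadata update, so metadata references (for example debug-info uses) keep pointing at the original value. Illustrative caller:

#include "llvm/IR/Value.h"

static void rewriteIRUsesOnly(llvm::Value *Old, llvm::Value *New) {
  // Unlike replaceAllUsesWith, metadata uses of Old stay untouched
  // (see the NoMetadata flag threaded through doRAUW above).
  Old->replaceNonMetadataUsesWith(New);
}
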
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 2f819f7..6f7e344 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -483,7 +483,7 @@
   void verifyFrameRecoverIndices();
   void verifySiblingFuncletUnwinds();
 
-  void verifyBitPieceExpression(const DbgInfoIntrinsic &I);
+  void verifyFragmentExpression(const DbgInfoIntrinsic &I);
 
   /// Module-level debug info verification...
   void verifyCompileUnits();
@@ -3826,7 +3826,7 @@
   }
 
   if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I))
-    verifyBitPieceExpression(*DII);
+    verifyFragmentExpression(*DII);
 
   InstsInThisBlock.insert(&I);
 }
@@ -4307,7 +4307,7 @@
   return 0;
 }
 
-void Verifier::verifyBitPieceExpression(const DbgInfoIntrinsic &I) {
+void Verifier::verifyFragmentExpression(const DbgInfoIntrinsic &I) {
   DILocalVariable *V;
   DIExpression *E;
   if (auto *DVI = dyn_cast<DbgValueInst>(&I)) {
@@ -4324,7 +4324,7 @@
     return;
 
   // Nothing to do if this isn't a bit piece expression.
-  if (!E->isBitPiece())
+  if (!E->isFragment())
     return;
 
   // The frontend helps out GDB by emitting the members of local anonymous
@@ -4342,11 +4342,11 @@
   if (!VarSize)
     return;
 
-  unsigned PieceSize = E->getBitPieceSize();
-  unsigned PieceOffset = E->getBitPieceOffset();
-  AssertDI(PieceSize + PieceOffset <= VarSize,
-         "piece is larger than or outside of variable", &I, V, E);
-  AssertDI(PieceSize != VarSize, "piece covers entire variable", &I, V, E);
+  unsigned FragSize = E->getFragmentSizeInBits();
+  unsigned FragOffset = E->getFragmentOffsetInBits();
+  AssertDI(FragSize + FragOffset <= VarSize,
+         "fragment is larger than or outside of variable", &I, V, E);
+  AssertDI(FragSize != VarSize, "fragment covers entire variable", &I, V, E);
 }
 
 void Verifier::verifyCompileUnits() {
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index c858746..108b5a4 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -50,8 +50,8 @@
 // export/import and other global analysis results.
 // The hash is produced in \p Key.
 static void computeCacheKey(
-    SmallString<40> &Key, const ModuleSummaryIndex &Index, StringRef ModuleID,
-    const FunctionImporter::ImportMapTy &ImportList,
+    SmallString<40> &Key, const Config &Conf, const ModuleSummaryIndex &Index,
+    StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList,
     const FunctionImporter::ExportSetTy &ExportList,
     const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
     const GVSummaryMapTy &DefinedGlobals) {
@@ -67,6 +67,39 @@
   Hasher.update(LLVM_REVISION);
 #endif
 
+  // Include the parts of the LTO configuration that affect code generation.
+  auto AddString = [&](StringRef Str) {
+    Hasher.update(Str);
+    Hasher.update(ArrayRef<uint8_t>{0});
+  };
+  auto AddUnsigned = [&](unsigned I) {
+    uint8_t Data[4];
+    Data[0] = I;
+    Data[1] = I >> 8;
+    Data[2] = I >> 16;
+    Data[3] = I >> 24;
+    Hasher.update(ArrayRef<uint8_t>{Data, 4});
+  };
+  AddString(Conf.CPU);
+  // FIXME: Hash more of Options. For now all clients initialize Options from
+  // command-line flags (which is unsupported in production), but may set
+  // RelaxELFRelocations. The clang driver can also pass FunctionSections,
+  // DataSections and DebuggerTuning via command line flags.
+  AddUnsigned(Conf.Options.RelaxELFRelocations);
+  AddUnsigned(Conf.Options.FunctionSections);
+  AddUnsigned(Conf.Options.DataSections);
+  AddUnsigned((unsigned)Conf.Options.DebuggerTuning);
+  for (auto &A : Conf.MAttrs)
+    AddString(A);
+  AddUnsigned(Conf.RelocModel);
+  AddUnsigned(Conf.CodeModel);
+  AddUnsigned(Conf.CGOptLevel);
+  AddUnsigned(Conf.OptLevel);
+  AddString(Conf.OptPipeline);
+  AddString(Conf.AAPipeline);
+  AddString(Conf.OverrideTriple);
+  AddString(Conf.DefaultTriple);
+
   // Include the hash for the current module
   auto ModHash = Index.getModuleHash(ModuleID);
   Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
@@ -562,7 +595,7 @@
 
     SmallString<40> Key;
     // The module may be cached; the key computed here is used to look it up.
-    computeCacheKey(Key, CombinedIndex, ModuleID, ImportList, ExportList,
+    computeCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, ExportList,
                     ResolvedODR, DefinedGlobals);
     if (AddStreamFn CacheAddStream = Cache(Task, Key))
       return RunThinBackend(CacheAddStream);
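
A standalone sketch of the integer encoding used by AddUnsigned above; fixing the byte order keeps ThinLTO cache keys identical across little- and big-endian hosts. Names below are illustrative.

#include <array>
#include <cstdint>

// Encode an unsigned value as four little-endian bytes before hashing.
static std::array<uint8_t, 4> encodeUnsignedLE(unsigned I) {
  return {uint8_t(I), uint8_t(I >> 8), uint8_t(I >> 16), uint8_t(I >> 24)};
}
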
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index afe8ee8..8298aff 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -260,20 +260,11 @@
   return Symbols.lookup(NameRef);
 }
 
-int MCContext::setSymbolValue(MCStreamer &Streamer, std::string &I) {
-    auto Pair = StringRef(I).split('=');
-    if (Pair.second.empty()) {
-      errs() << "error: defsym must be of the form: sym=value: " << I << "\n";
-      return 1;
-    }
-    int64_t Value;
-    if (Pair.second.getAsInteger(0, Value)) {
-      errs() << "error: Value is not an integer: " << Pair.second << "\n";
-      return 1;
-    }
-    auto Symbol = getOrCreateSymbol(Pair.first);
-    Streamer.EmitAssignment(Symbol, MCConstantExpr::create(Value, *this));
-    return 0;
+void MCContext::setSymbolValue(MCStreamer &Streamer,
+                              StringRef Sym,
+                              uint64_t Val) {
+  auto Symbol = getOrCreateSymbol(Sym);
+  Streamer.EmitAssignment(Symbol, MCConstantExpr::create(Val, *this));
 }
 
 //===----------------------------------------------------------------------===//
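
Because setSymbolValue no longer parses or reports errors itself, splitting "-defsym sym=value" now belongs to the caller. A hedged sketch of that caller-side step (the function name is illustrative):

#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"

static bool applyDefsym(llvm::MCContext &Ctx, llvm::MCStreamer &Streamer,
                        llvm::StringRef Defsym) {
  auto Pair = Defsym.split('=');
  int64_t Value;
  if (Pair.second.empty() || Pair.second.getAsInteger(0, Value))
    return false; // malformed "sym=value"; the driver reports the error
  Ctx.setSymbolValue(Streamer, Pair.first, Value);
  return true;
}
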
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 0fa7fbd..87ecf9e 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -181,12 +181,19 @@
 
   // C Style comment.
   ++CurPtr;  // skip the star.
+  const char *CommentTextStart = CurPtr;
   while (CurPtr != CurBuf.end()) {
     switch (*CurPtr++) {
     case '*':
       // End of the comment?
       if (*CurPtr != '/')
         break;
+      // If we have a CommentConsumer, notify it about the comment.
+      if (CommentConsumer) {
+        CommentConsumer->HandleComment(
+            SMLoc::getFromPointer(CommentTextStart),
+            StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
+      }
       ++CurPtr;   // End the */.
       return AsmToken(AsmToken::Comment,
                       StringRef(TokStart, CurPtr - TokStart));
@@ -202,10 +209,18 @@
   // comment. While it would be nicer to leave this as two tokens,
   // backwards compatibility with TargetParsers makes keeping this in this form
   // better.
+  const char *CommentTextStart = CurPtr;
   int CurChar = getNextChar();
   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
     CurChar = getNextChar();
 
+  // If we have a CommentConsumer, notify it about the comment.
+  if (CommentConsumer) {
+    CommentConsumer->HandleComment(
+        SMLoc::getFromPointer(CommentTextStart),
+        StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
+  }
+
   IsAtStartOfLine = true;
   // This is a whole-line comment; leave the newline in place.
   if (IsAtStartOfStatement)
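
A hedged sketch of the receiving side of the HandleComment callbacks added above; the AsmCommentConsumer base class name and the override signature are assumptions inferred from the call sites in this hunk.

#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/Support/raw_ostream.h"

// Assumed interface: a virtual HandleComment(SMLoc, StringRef) hook that the
// lexer invokes for each comment it strips.
class EchoCommentConsumer : public llvm::AsmCommentConsumer {
public:
  void HandleComment(llvm::SMLoc Loc, llvm::StringRef CommentText) override {
    llvm::outs() << "comment: " << CommentText << "\n";
  }
};
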
diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index 336e641..63c0dab 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp
@@ -13,7 +13,8 @@
 using namespace llvm;
 
 MCAsmLexer::MCAsmLexer()
-    : TokStart(nullptr), SkipSpace(true), IsAtStartOfStatement(true) {
+    : TokStart(nullptr), SkipSpace(true), IsAtStartOfStatement(true),
+      CommentConsumer(nullptr) {
   CurTok.emplace_back(AsmToken::Space, StringRef());
 }
 
diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp
index e847a17..4192105 100644
--- a/lib/MC/MCTargetOptions.cpp
+++ b/lib/MC/MCTargetOptions.cpp
@@ -14,7 +14,8 @@
 
 MCTargetOptions::MCTargetOptions()
     : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false),
-      MCFatalWarnings(false), MCNoWarn(false), MCSaveTempLabels(false),
+      MCFatalWarnings(false), MCNoWarn(false), MCNoDeprecatedWarn(false),
+      MCSaveTempLabels(false),
       MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false),
       MCPIECopyRelocations(false), ShowMCEncoding(false),
       ShowMCInst(false), AsmVerbose(false),
diff --git a/lib/ObjectYAML/CMakeLists.txt b/lib/ObjectYAML/CMakeLists.txt
index 7737090..2eee95b 100644
--- a/lib/ObjectYAML/CMakeLists.txt
+++ b/lib/ObjectYAML/CMakeLists.txt
@@ -4,4 +4,5 @@
   ELFYAML.cpp
   MachOYAML.cpp
   ObjectYAML.cpp
+  DWARFYAML.cpp
   )
diff --git a/lib/ObjectYAML/DWARFYAML.cpp b/lib/ObjectYAML/DWARFYAML.cpp
new file mode 100644
index 0000000..0a463bd
--- /dev/null
+++ b/lib/ObjectYAML/DWARFYAML.cpp
@@ -0,0 +1,65 @@
+//===- DWARFYAML.cpp - DWARF YAMLIO implementation ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of DWARF Debug
+// Info.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/DWARFYAML.h"
+
+namespace llvm {
+
+bool DWARFYAML::Data::isEmpty() const {
+  return 0 == DebugStrings.size() + AbbrevDecls.size();
+}
+
+namespace yaml {
+
+void MappingTraits<DWARFYAML::Data>::mapping(
+    IO &IO, DWARFYAML::Data &DWARF) {
+  IO.mapOptional("debug_str", DWARF.DebugStrings);
+  IO.mapOptional("debug_abbrev", DWARF.AbbrevDecls);
+  if(!DWARF.ARanges.empty() || !IO.outputting())
+    IO.mapOptional("debug_aranges", DWARF.ARanges);
+}
+
+void MappingTraits<DWARFYAML::Abbrev>::mapping(
+    IO &IO, DWARFYAML::Abbrev &Abbrev) {
+  IO.mapRequired("Code", Abbrev.Code);
+  IO.mapRequired("Tag", Abbrev.Tag);
+  IO.mapRequired("Children", Abbrev.Children);
+  IO.mapRequired("Attributes", Abbrev.Attributes);
+}
+
+void MappingTraits<DWARFYAML::AttributeAbbrev>::mapping(
+    IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev) {
+  IO.mapRequired("Attribute", AttAbbrev.Attribute);
+  IO.mapRequired("Form", AttAbbrev.Form);
+}
+
+void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping(
+    IO &IO, DWARFYAML::ARangeDescriptor &Descriptor) {
+  IO.mapRequired("Address", Descriptor.Address);
+  IO.mapRequired("Length", Descriptor.Length);
+}
+
+void MappingTraits<DWARFYAML::ARange>::mapping(IO &IO,
+                                                DWARFYAML::ARange &Range) {
+  IO.mapRequired("Length", Range.Length);
+  IO.mapRequired("Version", Range.Version);
+  IO.mapRequired("CuOffset", Range.CuOffset);
+  IO.mapRequired("AddrSize", Range.AddrSize);
+  IO.mapRequired("SegSize", Range.SegSize);
+  IO.mapRequired("Descriptors", Range.Descriptors);
+}
+
+} // namespace llvm::yaml
+
+} // namespace llvm
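
A minimal sketch of how these traits get exercised, assuming the standard YAMLTraits writer; the function name is illustrative.

#include "llvm/ObjectYAML/DWARFYAML.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"

// Serialize a populated DWARFYAML::Data using the mappings defined above;
// the debug_aranges key is only emitted when ARanges is non-empty.
static void writeDwarfYaml(llvm::DWARFYAML::Data &DWARF) {
  llvm::yaml::Output Out(llvm::outs());
  Out << DWARF;
}
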
diff --git a/lib/ObjectYAML/MachOYAML.cpp b/lib/ObjectYAML/MachOYAML.cpp
index 984ca0b..7ebb1bed 100644
--- a/lib/ObjectYAML/MachOYAML.cpp
+++ b/lib/ObjectYAML/MachOYAML.cpp
@@ -23,7 +23,10 @@
 MachOYAML::LoadCommand::~LoadCommand() {}
 
 bool MachOYAML::LinkEditData::isEmpty() const {
-  return 0 == RebaseOpcodes.size() + BindOpcodes.size() + WeakBindOpcodes.size() + LazyBindOpcodes.size() + ExportTrie.Children.size() + NameList.size() + StringTable.size();
+  return 0 ==
+         RebaseOpcodes.size() + BindOpcodes.size() + WeakBindOpcodes.size() +
+             LazyBindOpcodes.size() + ExportTrie.Children.size() +
+             NameList.size() + StringTable.size();
 }
 
 namespace yaml {
@@ -102,6 +105,9 @@
   if(!Object.LinkEdit.isEmpty() || !IO.outputting())
     IO.mapOptional("LinkEditData", Object.LinkEdit);
 
+  if(!Object.DWARF.isEmpty() || !IO.outputting())
+    IO.mapOptional("DWARF", Object.DWARF);
+
   if (IO.getContext() == &Object)
     IO.setContext(nullptr);
 }
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index f94cc18..8950e8c 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -74,6 +74,8 @@
   case DW_OP_##NAME:                                                           \
     return "DW_OP_" #NAME;
 #include "llvm/Support/Dwarf.def"
+  case DW_OP_LLVM_fragment:
+    return "DW_OP_LLVM_fragment";
   }
 }
 
@@ -81,6 +83,7 @@
   return StringSwitch<unsigned>(OperationEncodingString)
 #define HANDLE_DW_OP(ID, NAME) .Case("DW_OP_" #NAME, DW_OP_##NAME)
 #include "llvm/Support/Dwarf.def"
+      .Case("DW_OP_LLVM_fragment", DW_OP_LLVM_fragment)
       .Default(0);
 }
 
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index 845745c..f49eb0a 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -79,7 +79,7 @@
 }
 
 // Integrate with crash reporter libraries.
-#if defined (__APPLE__) && HAVE_CRASHREPORTERCLIENT_H
+#if defined (__APPLE__) && defined(HAVE_CRASHREPORTERCLIENT_H)
 //  If any clients of llvm try to link to libCrashReporterClient.a themselves,
 //  only one crash info struct will be used.
 extern "C" {
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 92c7967..d073802 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -598,7 +598,13 @@
 uint64_t raw_fd_ostream::seek(uint64_t off) {
   assert(SupportsSeeking && "Stream does not support seeking!");
   flush();
+#ifdef LLVM_ON_WIN32
+  pos = ::_lseeki64(FD, off, SEEK_SET);
+#elif defined(HAVE_LSEEK64)
+  pos = ::lseek64(FD, off, SEEK_SET);
+#else
   pos = ::lseek(FD, off, SEEK_SET);
+#endif
   if (pos == (uint64_t)-1)
     error_detected();
   return pos;
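
A standalone, POSIX-only sketch of the same dispatch, assuming HAVE_LSEEK64 comes from the new config-ix.cmake check (which also adds -D_LARGEFILE64_SOURCE); the Windows branch is omitted here.

#include <cstdint>
#include <unistd.h>

// Seek to an absolute 64-bit offset; lseek64 keeps this working past 4 GiB
// even when the default off_t is only 32 bits wide.
static uint64_t seekTo(int FD, uint64_t Off) {
#if defined(HAVE_LSEEK64)
  return ::lseek64(FD, Off, SEEK_SET);
#else
  return ::lseek(FD, Off, SEEK_SET); // assumes a 64-bit off_t
#endif
}
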
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index bb590c7..278b567 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp
@@ -17,6 +17,7 @@
 
 #include "llvm/TableGen/Main.h"
 #include "TGParser.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -45,22 +46,25 @@
 IncludeDirs("I", cl::desc("Directory of include files"),
             cl::value_desc("directory"), cl::Prefix);
 
+static int reportError(const char *ProgName, Twine Msg) {
+  errs() << ProgName << ": " << Msg;
+  errs().flush();
+  return 1;
+}
+
 /// \brief Create a dependency file for `-d` option.
 ///
 /// This functionality is really only for the benefit of the build system.
 /// It is similar to GCC's `-M*` family of options.
 static int createDependencyFile(const TGParser &Parser, const char *argv0) {
-  if (OutputFilename == "-") {
-    errs() << argv0 << ": the option -d must be used together with -o\n";
-    return 1;
-  }
+  if (OutputFilename == "-")
+    return reportError(argv0, "the option -d must be used together with -o\n");
+
   std::error_code EC;
   tool_output_file DepOut(DependFilename, EC, sys::fs::F_Text);
-  if (EC) {
-    errs() << argv0 << ": error opening " << DependFilename << ":"
-           << EC.message() << "\n";
-    return 1;
-  }
+  if (EC)
+    return reportError(argv0, "error opening " + DependFilename + ":" +
+                                  EC.message() + "\n");
   DepOut.os() << OutputFilename << ":";
   for (const auto &Dep : Parser.getDependencies()) {
     DepOut.os() << ' ' << Dep.first;
@@ -76,11 +80,9 @@
   // Parse the input file.
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(InputFilename);
-  if (std::error_code EC = FileOrErr.getError()) {
-    errs() << "Could not open input file '" << InputFilename
-           << "': " << EC.message() << "\n";
-    return 1;
-  }
+  if (std::error_code EC = FileOrErr.getError())
+    return reportError(argv0, "Could not open input file '" + InputFilename +
+                                  "': " + EC.message() + "\n");
 
   // Tell SrcMgr about this buffer, which is what TGParser will pick up.
   SrcMgr.AddNewSourceBuffer(std::move(*FileOrErr), SMLoc());
@@ -96,11 +98,9 @@
 
   std::error_code EC;
   tool_output_file Out(OutputFilename, EC, sys::fs::F_Text);
-  if (EC) {
-    errs() << argv0 << ": error opening " << OutputFilename << ":"
-           << EC.message() << "\n";
-    return 1;
-  }
+  if (EC)
+    return reportError(argv0, "error opening " + OutputFilename + ":" +
+                                  EC.message() + "\n");
   if (!DependFilename.empty()) {
     if (int Ret = createDependencyFile(Parser, argv0))
       return Ret;
@@ -109,10 +109,8 @@
   if (MainFn(Out.os(), Records))
     return 1;
 
-  if (ErrorsPrinted > 0) {
-    errs() << argv0 << ": " << ErrorsPrinted << " errors.\n";
-    return 1;
-  }
+  if (ErrorsPrinted > 0)
+    return reportError(argv0, utostr(ErrorsPrinted) + " errors.\n");
 
   // Declare success.
   Out.keep();
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 3d92942..49ad47f 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -168,8 +168,10 @@
   ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
 
   if (SplitVTs.size() == 1) {
-    // No splitting to do, just forward the input directly.
-    SplitArgs.push_back(OrigArg);
+    // No splitting to do, but we want to replace the original type (e.g. [1 x
+    // double] -> double).
+    SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
+                           OrigArg.Flags);
     return;
   }
 
@@ -198,12 +200,10 @@
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = *MF.getFunction();
 
-  MachineInstrBuilder MIB = MIRBuilder.buildInstr(AArch64::RET_ReallyLR);
-  assert(MIB.getInstr() && "Unable to build a return instruction?!");
-
+  auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR);
   assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
+  bool Success = true;
   if (VReg) {
-    MIRBuilder.setInstr(*MIB.getInstr(), /* Before */ true);
     const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
     CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
     MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -219,9 +219,11 @@
                       });
 
     OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
-    return handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler);
+    Success = handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler);
   }
-  return true;
+
+  MIRBuilder.insertInstr(MIB);
+  return Success;
 }
 
 bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index 621bd14..e927d58 100644
--- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -26,18 +26,19 @@
 
 // PartialMappings.
 enum PartialMappingIdx {
-  None = -1,
-  GPR32 = 0,
-  GPR64,
-  FPR32,
-  FPR64,
-  FPR128,
-  FPR256,
-  FPR512,
-  FirstGPR = GPR32,
-  LastGPR = GPR64,
-  FirstFPR = FPR32,
-  LastFPR = FPR512
+  PMI_None = -1,
+  PMI_GPR32 = 1,
+  PMI_GPR64,
+  PMI_FPR32,
+  PMI_FPR64,
+  PMI_FPR128,
+  PMI_FPR256,
+  PMI_FPR512,
+  PMI_FirstGPR = PMI_GPR32,
+  PMI_LastGPR = PMI_GPR64,
+  PMI_FirstFPR = PMI_FPR32,
+  PMI_LastFPR = PMI_FPR512,
+  PMI_Min = PMI_FirstGPR,
 };
 
 static unsigned getRegBankBaseIdxOffset(unsigned Size) {
@@ -76,34 +77,53 @@
 };
 
 // ValueMappings.
-RegisterBankInfo::ValueMapping ValMappings[] {
-  /* BreakDown, NumBreakDowns */
-  // 3-operands instructions (all binary operations should end up with one of
-  // those mapping).
-  // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
-  {&PartMappings[0], 1}, {&PartMappings[0], 1}, {&PartMappings[0], 1},
-  // 3: GPR 64-bit value.
-  {&PartMappings[1], 1}, {&PartMappings[1], 1}, {&PartMappings[1], 1},
-  // 6: FPR 32-bit value.
-  {&PartMappings[2], 1}, {&PartMappings[2], 1}, {&PartMappings[2], 1},
-  // 9: FPR 64-bit value.
-  {&PartMappings[3], 1}, {&PartMappings[3], 1}, {&PartMappings[3], 1},
-  // 12: FPR 128-bit value.
-  {&PartMappings[4], 1}, {&PartMappings[4], 1}, {&PartMappings[4], 1},
-  // 15: FPR 256-bit value.
-  {&PartMappings[5], 1}, {&PartMappings[5], 1}, {&PartMappings[5], 1},
-  // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
-  {&PartMappings[6], 1}, {&PartMappings[6], 1}, {&PartMappings[6], 1},
-  // Cross register bank copies.
-  // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match FirstCrossRegCpyIdx.
-  {&PartMappings[0], 1}, {&PartMappings[2], 1},
-  // 23: GPR 64-bit value to FPR 64-bit value.
-  {&PartMappings[1], 1}, {&PartMappings[3], 1},
-  // 25: FPR 32-bit value to GPR 32-bit value.
-  {&PartMappings[2], 1}, {&PartMappings[0], 1},
-  // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match LastCrossRegCpyIdx.
-  {&PartMappings[3], 1}, {&PartMappings[1], 1}
-
+RegisterBankInfo::ValueMapping ValMappings[]{
+    /* BreakDown, NumBreakDowns */
+    // 3-operands instructions (all binary operations should end up with one of
+    // those mapping).
+    // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    // 3: GPR 64-bit value.
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    // 6: FPR 32-bit value.
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    // 9: FPR 64-bit value.
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    // 12: FPR 128-bit value.
+    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+    // 15: FPR 256-bit value.
+    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+    // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
+    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+    // Cross register bank copies.
+    // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match
+    //                                               FirstCrossRegCpyIdx.
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    // 23: GPR 64-bit value to FPR 64-bit value.
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    // 25: FPR 32-bit value to GPR 32-bit value.
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match
+    //                                               LastCrossRegCpyIdx.
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1}
 };
 
 /// Get the pointer to the ValueMapping representing the RegisterBank
@@ -115,12 +135,13 @@
 /// \pre \p RBIdx != PartialMappingIdx::None
 const RegisterBankInfo::ValueMapping *
 getValueMapping(PartialMappingIdx RBIdx, unsigned Size) {
-  assert(RBIdx != PartialMappingIdx::None && "No mapping needed for that");
+  assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
   unsigned ValMappingIdx = First3OpsIdx +
-                      (RBIdx + getRegBankBaseIdxOffset(Size)) *
-                          ValueMappingIdx::DistanceBetweenRegBanks;
-    assert(ValMappingIdx >= AArch64::First3OpsIdx &&
-           ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
+                           (RBIdx - AArch64::PartialMappingIdx::PMI_Min +
+                            getRegBankBaseIdxOffset(Size)) *
+                               ValueMappingIdx::DistanceBetweenRegBanks;
+  assert(ValMappingIdx >= AArch64::First3OpsIdx &&
+         ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
 
   return &ValMappings[ValMappingIdx];
 }
@@ -133,14 +154,14 @@
 /// otherwise it is on FPR. Same thing for \p SrcIsGPR.
 const RegisterBankInfo::ValueMapping *
 getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) {
-  PartialMappingIdx DstRBIdx = DstIsGPR ? FirstGPR : FirstFPR;
-  PartialMappingIdx SrcRBIdx = SrcIsGPR ? FirstGPR : FirstFPR;
+  PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
+  PartialMappingIdx SrcRBIdx = SrcIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
   if (DstRBIdx == SrcRBIdx)
     return getValueMapping(DstRBIdx, Size);
   assert(Size <= 64 && "GPR cannot handle that size");
   unsigned ValMappingIdx =
       FirstCrossRegCpyIdx +
-      (DstRBIdx - FirstGPR + getRegBankBaseIdxOffset(Size)) *
+      (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size)) *
           ValueMappingIdx::DistanceBetweenCrossRegCpy;
   assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx &&
          ValMappingIdx <= AArch64::LastCrossRegCpyIdx &&
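
A toy illustration of the rebasing idiom introduced here, detached from the AArch64 tables: once the enum no longer starts at 0, every array access subtracts the minimum enumerator to get back to a zero-based index. Names below are illustrative.

enum PartialIdx { PMin = 1, PA = 1, PB, PC };
static const char *PartNames[] = {"A", "B", "C"};

// Zero-based lookup for a 1-based enum value.
static inline const char *nameOf(PartialIdx I) { return PartNames[I - PMin]; }
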
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 581958c..af3ab1b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7437,7 +7437,7 @@
     int64_t Offset = AM.BaseOffs;
 
     // 9-bit signed offset
-    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
+    if (isInt<9>(Offset))
       return true;
 
     // 12-bit unsigned offset
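
For reference, what isInt<9>(Offset) from MathExtras.h accepts, written out: the closed interval [-256, 255]. This is slightly narrower than the hand-written bound it replaces, which allowed [-512, 511], and it matches the "9-bit signed offset" comment.

#include <cstdint>

// Equivalent predicate, spelled out for N = 9 signed bits.
static inline bool fitsInSigned9(int64_t X) {
  return X >= -(INT64_C(1) << 8) && X <= (INT64_C(1) << 8) - 1;
}
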
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index b156fcd..a5303fc 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -629,6 +629,9 @@
       // FIXME: Is going through int64_t always correct?
       ImmOp.ChangeToImmediate(
           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
+    } else {
+      uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
+      I.getOperand(1).ChangeToImmediate(Val);
     }
 
     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 962c4c7..a5fd2fb 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -83,61 +83,63 @@
 
   // Check that the TableGen'ed-like file is in sync with our expectations.
   // First, the Idx.
-  assert(AArch64::PartialMappingIdx::GPR32 ==
-             AArch64::PartialMappingIdx::FirstGPR &&
+  assert(AArch64::PartialMappingIdx::PMI_GPR32 ==
+             AArch64::PartialMappingIdx::PMI_FirstGPR &&
          "GPR32 index not first in the GPR list");
-  assert(AArch64::PartialMappingIdx::GPR64 ==
-             AArch64::PartialMappingIdx::LastGPR &&
+  assert(AArch64::PartialMappingIdx::PMI_GPR64 ==
+             AArch64::PartialMappingIdx::PMI_LastGPR &&
          "GPR64 index not last in the GPR list");
-  assert(AArch64::PartialMappingIdx::FirstGPR <=
-             AArch64::PartialMappingIdx::LastGPR &&
+  assert(AArch64::PartialMappingIdx::PMI_FirstGPR <=
+             AArch64::PartialMappingIdx::PMI_LastGPR &&
          "GPR list is backward");
-  assert(AArch64::PartialMappingIdx::FPR32 ==
-             AArch64::PartialMappingIdx::FirstFPR &&
+  assert(AArch64::PartialMappingIdx::PMI_FPR32 ==
+             AArch64::PartialMappingIdx::PMI_FirstFPR &&
          "FPR32 index not first in the FPR list");
-  assert(AArch64::PartialMappingIdx::FPR512 ==
-             AArch64::PartialMappingIdx::LastFPR &&
+  assert(AArch64::PartialMappingIdx::PMI_FPR512 ==
+             AArch64::PartialMappingIdx::PMI_LastFPR &&
          "FPR512 index not last in the FPR list");
-  assert(AArch64::PartialMappingIdx::FirstFPR <=
-             AArch64::PartialMappingIdx::LastFPR &&
+  assert(AArch64::PartialMappingIdx::PMI_FirstFPR <=
+             AArch64::PartialMappingIdx::PMI_LastFPR &&
          "FPR list is backward");
-  assert(AArch64::PartialMappingIdx::FPR32 + 1 ==
-             AArch64::PartialMappingIdx::FPR64 &&
-         AArch64::PartialMappingIdx::FPR64 + 1 ==
-             AArch64::PartialMappingIdx::FPR128 &&
-         AArch64::PartialMappingIdx::FPR128 + 1 ==
-             AArch64::PartialMappingIdx::FPR256 &&
-         AArch64::PartialMappingIdx::FPR256 + 1 ==
-             AArch64::PartialMappingIdx::FPR512 &&
+  assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR64 &&
+         AArch64::PartialMappingIdx::PMI_FPR64 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR128 &&
+         AArch64::PartialMappingIdx::PMI_FPR128 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR256 &&
+         AArch64::PartialMappingIdx::PMI_FPR256 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR512 &&
          "FPR indices not properly ordered");
 // Now, the content.
 // Check partial mapping.
 #define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB)                      \
   do {                                                                         \
     const PartialMapping &Map =                                                \
-        AArch64::PartMappings[AArch64::PartialMappingIdx::Idx];                \
-    (void) Map;                                                                \
+        AArch64::PartMappings[AArch64::PartialMappingIdx::Idx -                \
+                              AArch64::PartialMappingIdx::PMI_Min];            \
+    (void)Map;                                                                 \
     assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength &&           \
            Map.RegBank == &RB && #Idx " is incorrectly initialized");          \
   } while (0)
 
-  CHECK_PARTIALMAP(GPR32, 0, 32, RBGPR);
-  CHECK_PARTIALMAP(GPR64, 0, 64, RBGPR);
-  CHECK_PARTIALMAP(FPR32, 0, 32, RBFPR);
-  CHECK_PARTIALMAP(FPR64, 0, 64, RBFPR);
-  CHECK_PARTIALMAP(FPR128, 0, 128, RBFPR);
-  CHECK_PARTIALMAP(FPR256, 0, 256, RBFPR);
-  CHECK_PARTIALMAP(FPR512, 0, 512, RBFPR);
+  CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
+  CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+  CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR);
 
 // Check value mapping.
 #define CHECK_VALUEMAP_IMPL(RBName, Size, Offset)                              \
   do {                                                                         \
-    AArch64::PartialMappingIdx PartialMapBaseIdx =                             \
-        AArch64::PartialMappingIdx::RBName##Size;                              \
-    (void) PartialMapBaseIdx;                                                  \
-    const ValueMapping &Map =                                                  \
-        AArch64::getValueMapping(AArch64::First##RBName, Size)[Offset];        \
-    (void) Map;                                                                \
+    unsigned PartialMapBaseIdx =                                               \
+        AArch64::PartialMappingIdx::PMI_##RBName##Size -                       \
+        AArch64::PartialMappingIdx::PMI_Min;                                   \
+    (void)PartialMapBaseIdx;                                                   \
+    const ValueMapping &Map = AArch64::getValueMapping(                        \
+        AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset];          \
+    (void)Map;                                                                 \
     assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] &&       \
            Map.NumBreakDowns == 1 && #RBName #Size                             \
            " " #Offset " is incorrectly initialized");                         \
@@ -172,15 +174,15 @@
 
 #define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size)                 \
   do {                                                                         \
-    AArch64::PartialMappingIdx PartialMapDstIdx =                              \
-        AArch64::PartialMappingIdx::RBNameDst##Size;                           \
-    AArch64::PartialMappingIdx PartialMapSrcIdx =                              \
-        AArch64::PartialMappingIdx::RBNameSrc##Size;                           \
+    unsigned PartialMapDstIdx =                                                \
+        AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min;                     \
+    unsigned PartialMapSrcIdx =                                                \
+        AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min;                     \
     (void) PartialMapDstIdx;                                                   \
     (void) PartialMapSrcIdx;                                                   \
     const ValueMapping *Map = AArch64::getCopyMapping(                         \
-        AArch64::First##RBNameDst == AArch64::FirstGPR,                        \
-        AArch64::First##RBNameSrc == AArch64::FirstGPR, Size);                 \
+        AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR,                \
+        AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size);         \
     (void) Map;                                                                \
     assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] &&     \
            Map[0].NumBreakDowns == 1 && #RBNameDst #Size                       \
@@ -283,10 +285,12 @@
       break;
     InstructionMappings AltMappings;
     InstructionMapping GPRMapping(
-        /*ID*/ 1, /*Cost*/ 1, AArch64::getValueMapping(AArch64::FirstGPR, Size),
+        /*ID*/ 1, /*Cost*/ 1,
+        AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
         /*NumOperands*/ 3);
     InstructionMapping FPRMapping(
-        /*ID*/ 2, /*Cost*/ 1, AArch64::getValueMapping(AArch64::FirstFPR, Size),
+        /*ID*/ 2, /*Cost*/ 1,
+        AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
         /*NumOperands*/ 3);
 
     AltMappings.emplace_back(std::move(GPRMapping));
@@ -310,17 +314,17 @@
         /*NumOperands*/ 2);
     InstructionMapping FPRMapping(
         /*ID*/ 2, /*Cost*/ 1,
-        AArch64::getCopyMapping(/*DstIsFPR*/ false, /*SrcIsFPR*/ false, Size),
+        AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size),
         /*NumOperands*/ 2);
     InstructionMapping GPRToFPRMapping(
         /*ID*/ 3,
         /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
-        AArch64::getCopyMapping(/*DstIsFPR*/ false, /*SrcIsFPR*/ true, Size),
+        AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size),
         /*NumOperands*/ 2);
     InstructionMapping FPRToGPRMapping(
         /*ID*/ 3,
         /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
-        AArch64::getCopyMapping(/*DstIsFPR*/ true, /*SrcIsFPR*/ false, Size),
+        AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size),
         /*NumOperands*/ 2);
 
     AltMappings.emplace_back(std::move(GPRMapping));
@@ -342,15 +346,17 @@
     InstructionMappings AltMappings;
     InstructionMapping GPRMapping(
         /*ID*/ 1, /*Cost*/ 1,
-        getOperandsMapping({AArch64::getValueMapping(AArch64::FirstGPR, Size),
-                            // Addresses are GPR 64-bit.
-                            AArch64::getValueMapping(AArch64::FirstGPR, 64)}),
+        getOperandsMapping(
+            {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+             // Addresses are GPR 64-bit.
+             AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
         /*NumOperands*/ 2);
     InstructionMapping FPRMapping(
         /*ID*/ 2, /*Cost*/ 1,
-        getOperandsMapping({AArch64::getValueMapping(AArch64::FirstFPR, Size),
-                            // Addresses are GPR 64-bit.
-                            AArch64::getValueMapping(AArch64::FirstGPR, 64)}),
+        getOperandsMapping(
+            {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+             // Addresses are GPR 64-bit.
+             AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
         /*NumOperands*/ 2);
 
     AltMappings.emplace_back(std::move(GPRMapping));
@@ -370,7 +376,7 @@
   case TargetOpcode::G_BITCAST:
   case TargetOpcode::G_LOAD: {
     // Those ID must match getInstrAlternativeMappings.
-    assert((OpdMapper.getInstrMapping().getID() >= 1 ||
+    assert((OpdMapper.getInstrMapping().getID() >= 1 &&
             OpdMapper.getInstrMapping().getID() <= 4) &&
            "Don't know how to handle that ID");
     return applyDefaultMapping(OpdMapper);
@@ -431,7 +437,7 @@
 #endif // End NDEBUG.
 
   AArch64::PartialMappingIdx RBIdx =
-      IsFPR ? AArch64::FirstFPR : AArch64::FirstGPR;
+      IsFPR ? AArch64::PMI_FirstFPR : AArch64::PMI_FirstGPR;
 
   return InstructionMapping{DefaultMappingID, 1,
                             AArch64::getValueMapping(RBIdx, Size), NumOperands};
@@ -488,6 +494,10 @@
                               AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size),
                               /*NumOperands*/ 2};
   }
+  case TargetOpcode::G_SEQUENCE:
+    // FIXME: support this, but the generic code is really not going to do
+    // anything sane.
+    return InstructionMapping();
   default:
     break;
   }
@@ -508,9 +518,9 @@
     // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
     // For floating-point instructions, scalars go in FPRs.
     if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
-      OpRegBankIdx[Idx] = AArch64::FirstFPR;
+      OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR;
     else
-      OpRegBankIdx[Idx] = AArch64::FirstGPR;
+      OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR;
   }
 
   unsigned Cost = 1;
@@ -519,18 +529,18 @@
   switch (Opc) {
   case TargetOpcode::G_SITOFP:
   case TargetOpcode::G_UITOFP: {
-    OpRegBankIdx = {AArch64::FirstFPR, AArch64::FirstGPR};
+    OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR};
     break;
   }
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI: {
-    OpRegBankIdx = {AArch64::FirstGPR, AArch64::FirstFPR};
+    OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR};
     break;
   }
   case TargetOpcode::G_FCMP: {
-    OpRegBankIdx = {AArch64::FirstGPR,
-                    /* Predicate */ AArch64::PartialMappingIdx::None,
-                    AArch64::FirstFPR, AArch64::FirstFPR};
+    OpRegBankIdx = {AArch64::PMI_FirstGPR,
+                    /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR,
+                    AArch64::PMI_FirstFPR};
     break;
   }
   case TargetOpcode::G_BITCAST: {
@@ -548,7 +558,7 @@
     // for the greedy mode the cost of the cross bank copy will
     // offset this number.
     // FIXME: Should be derived from the scheduling model.
-    if (OpRegBankIdx[0] >= AArch64::FirstFPR)
+    if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR)
       Cost = 2;
   }
   }
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index b4b8dc0..7b0a7f4 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -90,6 +90,10 @@
 ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
+FunctionPass* createAMDGPUUnifyMetadataPass();
+void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
+extern char &AMDGPUUnifyMetadataID;
+
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
 extern char &SIFixControlFlowLiveIntervalsID;
 
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 16815af..c011be6 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -15,7 +15,10 @@
 
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Debug.h"
@@ -30,6 +33,10 @@
 class AMDGPUAnnotateUniformValues : public FunctionPass,
                        public InstVisitor<AMDGPUAnnotateUniformValues> {
   DivergenceAnalysis *DA;
+  MemoryDependenceResults *MDR;
+  LoopInfo *LI;
+  DenseMap<Value*, GetElementPtrInst*> noClobberClones;
+  bool isKernelFunc;
 
 public:
   static char ID;
@@ -42,12 +49,14 @@
   }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<MemoryDependenceWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.setPreservesAll();
  }
 
   void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
-
+  bool isClobberedInFunction(LoadInst * Load);
 };
 
 } // End anonymous namespace
@@ -55,6 +64,8 @@
 INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                       "Add AMDGPU uniform metadata", false, false)
 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                     "Add AMDGPU uniform metadata", false, false)
 
@@ -63,6 +74,46 @@
 static void setUniformMetadata(Instruction *I) {
   I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
 }
+static void setNoClobberMetadata(Instruction *I) {
+  I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+}
+
+static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
+  for (auto I : predecessors(Root))
+    if (Set.insert(I))
+      DFS(I, Set);
+}
+
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
+  // 1. Get the Loop for Load->getParent().
+  // 2. If it exists, add all blocks of the outermost loop to the checklist
+  //    and start the DFS from that loop's header; otherwise start from the
+  //    load's own block. Then scan every checklist block for clobbers.
+  SetVector<BasicBlock *> Checklist;
+  BasicBlock *Start = Load->getParent();
+  Checklist.insert(Start);
+  const Value *Ptr = Load->getPointerOperand();
+  const Loop *L = LI->getLoopFor(Start);
+  if (L) {
+    const Loop *P = L;
+    do {
+      L = P;
+      P = P->getParentLoop();
+    } while (P);
+    Checklist.insert(L->block_begin(), L->block_end());
+    Start = L->getHeader();
+  }
+
+  DFS(Start, Checklist);
+  for (auto &BB : Checklist) {
+    BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
+        BasicBlock::iterator(Load) : BB->end();
+    if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true, StartIt, BB,
+                                      Load).isClobber())
+      return true;
+  }
+  return false;
+}
 
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
   if (I.isUnconditional())
@@ -79,10 +130,39 @@
   Value *Ptr = I.getPointerOperand();
   if (!DA->isUniform(Ptr))
     return;
+  auto isGlobalLoad = [](LoadInst &Load)->bool {
+    return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+  };
+  // We only track clobbers up to the function boundary; a FunctionPass
+  // cannot look any further. Therefore "not clobbered" can only be proven
+  // for memory operations that live entirely in kernel functions.
+  bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+  Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+  if (!PtrI && NotClobbered && isGlobalLoad(I)) {
+    if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
+      // Look up an existing GEP clone for this pointer.
+      if (noClobberClones.count(Ptr)) {
+        PtrI = noClobberClones[Ptr];
+      } else {
+        // Create GEP of the Value
+        Function *F = I.getParent()->getParent();
+        Value *Idx = Constant::getIntegerValue(
+          Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
+        // Insert GEP at the entry to make it dominate all uses
+        PtrI = GetElementPtrInst::Create(
+          Ptr->getType()->getPointerElementType(), Ptr,
+          ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+      }
+      I.replaceUsesOfWith(Ptr, PtrI);
+    }
+  }
 
-  if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+  if (PtrI) {
     setUniformMetadata(PtrI);
-
+    if (NotClobbered)
+      setNoClobberMetadata(PtrI);
+  }
 }
 
 bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
@@ -93,9 +173,13 @@
   if (skipFunction(F))
     return false;
 
-  DA = &getAnalysis<DivergenceAnalysis>();
-  visit(F);
+  DA  = &getAnalysis<DivergenceAnalysis>();
+  MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+  LI  = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
 
+  visit(F);
+  noClobberClones.clear();
   return true;
 }
 
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d836a8e..2390fc9 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -763,6 +763,11 @@
   header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
   header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
 
+  // These alignment values are specified in powers of two, so alignment =
+  // 2^n.  The minimum alignment is 2^4 = 16.
+  header.kernarg_segment_alignment = std::max((size_t)4,
+      countTrailingZeros(MFI->getMaxKernArgAlign()));
+
   if (STM.debuggerEmitPrologue()) {
     header.debug_wavefront_private_segment_offset_sgpr =
       KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
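
A small worked example of the power-of-two encoding described in the comment above, using a portable stand-in for countTrailingZeros; it assumes the input alignment is a non-zero power of two.

#include <algorithm>
#include <cstddef>

// 16 -> 4, 64 -> 6, 8 -> 4 (clamped to the minimum of 2^4 = 16 bytes).
static unsigned encodeKernargAlign(size_t AlignBytes) {
  unsigned Log2 = 0;
  while (((AlignBytes >> Log2) & 1) == 0)
    ++Log2;
  return std::max(4u, Log2);
}
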
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b427de1..2e8db08 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -146,9 +146,10 @@
 
 Value *AMDGPUCodeGenPrepare::copyFlags(
     const BinaryOperator &I, Value *V) const {
-  assert(isa<BinaryOperator>(V) && "V must be binary operation");
+  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+  if (!BinOp) // Possibly constant expression.
+    return V;
 
-  BinaryOperator *BinOp = cast<BinaryOperator>(V);
   if (isa<OverflowingBinaryOperator>(BinOp)) {
     BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
     BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bf493c9..c74fc4a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -135,6 +135,8 @@
 
   void SelectADD_SUB_I64(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
+  void SelectFMA_W_CHAIN(SDNode *N);
+  void SelectFMUL_W_CHAIN(SDNode *N);
 
   SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                    uint32_t Offset, uint32_t Width);
@@ -296,6 +298,15 @@
     SelectADD_SUB_I64(N);
     return;
   }
+  case AMDGPUISD::FMUL_W_CHAIN: {
+    SelectFMUL_W_CHAIN(N);
+    return;
+  }
+  case AMDGPUISD::FMA_W_CHAIN: {
+    SelectFMA_W_CHAIN(N);
+    return;
+  }
+
   case ISD::SCALAR_TO_VECTOR:
   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
   case ISD::BUILD_VECTOR: {
@@ -653,6 +664,33 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
+  SDLoc SL(N);
+  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+  SDValue Ops[10];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+  Ops[8] = N->getOperand(0);
+  Ops[9] = N->getOperand(4);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+  SDLoc SL(N);
+  //  src0_modifiers, src0, src1_modifiers, src1, clamp, omod
+  SDValue Ops[8];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  Ops[6] = N->getOperand(0);
+  Ops[7] = N->getOperand(3);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+}
+
 // We need to handle this here because tablegen doesn't support matching
 // instructions with multiple outputs.
 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 93dcd72..8cc995c 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2953,6 +2953,9 @@
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(SETCC)
+  NODE_NAME_CASE(SETREG)
+  NODE_NAME_CASE(FMA_W_CHAIN)
+  NODE_NAME_CASE(FMUL_W_CHAIN)
   NODE_NAME_CASE(CLAMP)
   NODE_NAME_CASE(COS_HW)
   NODE_NAME_CASE(SIN_HW)
@@ -2999,6 +3002,8 @@
   NODE_NAME_CASE(MAD_I24)
   NODE_NAME_CASE(TEXTURE_FETCH)
   NODE_NAME_CASE(EXPORT)
+  NODE_NAME_CASE(EXPORT_DONE)
+  NODE_NAME_CASE(R600_EXPORT)
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
   NODE_NAME_CASE(REGISTER_STORE)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6c6fc2e..f01afef 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -230,6 +230,10 @@
   // This is SETCC with the full mask result which is used for a compare with a
   // result bit per item in the wavefront.
   SETCC,
+  SETREG,
+  // FP ops with input and output chain.
+  FMA_W_CHAIN,
+  FMUL_W_CHAIN,
 
   // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
   // Denormals handled on some parts.
@@ -280,7 +284,9 @@
   MUL_LOHI_I24,
   MUL_LOHI_U24,
   TEXTURE_FETCH,
-  EXPORT,
+  EXPORT, // exp on SI+
+  EXPORT_DONE, // exp on SI+ with done bit set
+  R600_EXPORT,
   CONST_ADDRESS,
   REGISTER_LOAD,
   REGISTER_STORE,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 4bccd81..e7b4001 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -150,6 +150,19 @@
 
 def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
 
+def AMDGPUSetRegOp :  SDTypeProfile<0, 2, [
+  SDTCisInt<0>, SDTCisInt<1>
+]>;
+
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+  SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
+   SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
 def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
   SDTIntToFPOp, []>;
 def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
@@ -265,9 +278,35 @@
                       SDTypeProfile<1, 4, [SDTCisFP<0>]>,
                       [SDNPInGlue]>;
 
+
 def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
   [SDNPHasChain, SDNPSideEffect]>;
 
+// SI+ export
+def AMDGPUExportOp : SDTypeProfile<0, 8, [
+  SDTCisInt<0>, // i8 en
+  SDTCisInt<1>, // i1 vm
+  // skip done
+  SDTCisInt<2>, // i8 tgt
+  SDTCisSameAs<3, 1>, // i1 compr
+  SDTCisFP<4>,        // f32 src0
+  SDTCisSameAs<5, 4>, // f32 src1
+  SDTCisSameAs<6, 4>, // f32 src2
+  SDTCisSameAs<7, 4>  // f32 src3
+]>;
+
+def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
+  [SDNPHasChain, SDNPMayStore]>;
+
+def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp,
+  [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+
+def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp,
+  [SDNPHasChain, SDNPSideEffect]>;
+
 //===----------------------------------------------------------------------===//
 // Flow Control Profile Types
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 0c8b79d..5d0640b 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -50,6 +50,10 @@
     return KernArgSize;
   }
 
+  unsigned getMaxKernArgAlign() const {
+    return MaxKernArgAlign;
+  }
+
   void setABIArgOffset(unsigned NewOffset) {
     ABIArgOffset = NewOffset;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d0dd7a9..6a0275a 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -121,6 +121,7 @@
     CFALUBug(false),
     HasVertexCache(false),
     TexVTXClauseSize(0),
+    ScalarizeGlobal(false),
 
     FeatureDisable(false),
     InstrItins(getInstrItineraryForCPU(GPU)),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 842711b..939d137 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -114,6 +114,7 @@
   bool CFALUBug;
   bool HasVertexCache;
   short TexVTXClauseSize;
+  bool ScalarizeGlobal;
 
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable;
@@ -401,6 +402,9 @@
     return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
   }
 
+  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
   /// for function \p F, or minimum/maximum flat work group sizes explicitly
   /// requested using "amdgpu-flat-work-group-size" attribute attached to
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7287b56..712c549 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Vectorize.h"
+#include "llvm/IR/LegacyPassManager.h"
 
 using namespace llvm;
 
@@ -61,6 +62,14 @@
   cl::init(true),
   cl::Hidden);
 
+// Option to control global load scalarization
+static cl::opt<bool> ScalarizeGlobal(
+  "amdgpu-scalarize-global-loads",
+  cl::desc("Enable global load scalarization"),
+  cl::init(false),
+  cl::Hidden);
+
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -77,6 +86,7 @@
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
+  initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitsPass(*PR);
   initializeSIWholeQuadModePass(*PR);
@@ -162,7 +172,6 @@
                       FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
     TLOF(createTLOF(getTargetTriple())),
     IntrinsicInfo() {
-  setRequiresStructuredCFG(true);
   initAsmInfo();
 }
 
@@ -182,6 +191,10 @@
     FSAttr.getValueAsString();
 }
 
+void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
+  PM.add(llvm::createAMDGPUUnifyMetadataPass());
+}
+
 //===----------------------------------------------------------------------===//
 // R600 Target Machine (R600 -> Cayman)
 //===----------------------------------------------------------------------===//
@@ -191,7 +204,9 @@
                                      TargetOptions Options,
                                      Optional<Reloc::Model> RM,
                                      CodeModel::Model CM, CodeGenOpt::Level OL)
-  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+  setRequiresStructuredCFG(true);
+}
 
 const R600Subtarget *R600TargetMachine::getSubtargetImpl(
   const Function &F) const {
@@ -261,6 +276,8 @@
     I->setGISelAccessor(*GISel);
   }
 
+  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
   return I.get();
 }
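For context, addEarlyAsPossiblePasses() is the hook a frontend calls on its legacy IR pass manager before adding its own early passes; with this change it schedules the new metadata unification pass for AMDGPU. A minimal sketch of how a driver might use it, assuming TM is an already-constructed AMDGPU target machine and M is the module being compiled (the helper function name is invented for illustration, not taken from this patch):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"

// Hypothetical driver snippet, not part of this patch.
void runEarlyTargetPasses(llvm::TargetMachine &TM, llvm::Module &M) {
  llvm::legacy::PassManager PM;
  TM.addEarlyAsPossiblePasses(PM); // adds AMDGPUUnifyMetadata for this target
  PM.run(M);
}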
 
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index d8a71b4..1b56f46 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -50,6 +50,7 @@
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+  void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
new file mode 100644
index 0000000..1118eee
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -0,0 +1,147 @@
+//===-- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This pass unifies multiple OpenCL metadata due to linking.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+  namespace kOCLMD {
+    const char SpirVer[]            = "opencl.spir.version";
+    const char OCLVer[]             = "opencl.ocl.version";
+    const char UsedExt[]            = "opencl.used.extensions";
+    const char UsedOptCoreFeat[]    = "opencl.used.optional.core.features";
+    const char CompilerOptions[]    = "opencl.compiler.options";
+    const char LLVMIdent[]          = "llvm.ident";
+  }
+
+  /// \brief Unify multiple OpenCL metadata due to linking.
+  class AMDGPUUnifyMetadata : public FunctionPass {
+  public:
+    static char ID;
+    explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {};
+
+  private:
+    // This should really be a module pass, but it has to run as early as
+    // possible. Since function passes are executed first and
+    // TargetMachine::addEarlyAsPossiblePasses() expects only function passes,
+    // it has to be a function pass.
+    virtual bool runOnModule(Module &M);
+
+    // \todo: Convert to a module pass.
+    virtual bool runOnFunction(Function &F);
+
+    /// \brief Unify version metadata.
+    /// \return true if changes are made.
+    /// Assume the named metadata has operands each of which is a pair of
+    /// integer constants, e.g.
+    /// !Name = {!n1, !n2}
+    /// !n1 = {i32 1, i32 2}
+    /// !n2 = {i32 2, i32 0}
+    /// Keep the largest version as the sole operand if PickFirst is false.
+    /// Otherwise pick the first value, which represents the kernel module.
+    bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
+      auto NamedMD = M.getNamedMetadata(Name);
+      if (!NamedMD || NamedMD->getNumOperands() <= 1)
+        return false;
+      MDNode *MaxMD = nullptr;
+      auto MaxVer = 0U;
+      for (const auto &VersionMD : NamedMD->operands()) {
+        assert(VersionMD->getNumOperands() == 2);
+        auto CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
+        auto VersionMajor = CMajor->getZExtValue();
+        auto CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
+        auto VersionMinor = CMinor->getZExtValue();
+        auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
+        if (Ver > MaxVer) {
+          MaxVer = Ver;
+          MaxMD = VersionMD;
+        }
+        if (PickFirst)
+          break;
+      }
+      NamedMD->eraseFromParent();
+      NamedMD = M.getOrInsertNamedMetadata(Name);
+      NamedMD->addOperand(MaxMD);
+      return true;
+    }
+
+  /// \brief Unify extension metadata.
+  /// \return true if changes are made.
+  /// Assume the named metadata has operands each of which is a list, e.g.
+  /// !Name = {!n1, !n2}
+  /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
+  /// !n2 = !{!"cl_khr_image"}
+  /// Combine it into a single list with unique operands.
+  bool unifyExtensionMD(Module &M, StringRef Name) {
+    auto NamedMD = M.getNamedMetadata(Name);
+    if (!NamedMD || NamedMD->getNumOperands() == 1)
+      return false;
+
+    SmallVector<Metadata *, 4> All;
+    for (const auto &MD : NamedMD->operands())
+      for (const auto &Op : MD->operands())
+        if (std::find(All.begin(), All.end(), Op.get()) == All.end())
+          All.push_back(Op.get());
+
+    NamedMD->eraseFromParent();
+    NamedMD = M.getOrInsertNamedMetadata(Name);
+    NamedMD->addOperand(MDNode::get(M.getContext(), All));
+    return true;
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUUnifyMetadata::ID = 0;
+
+char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID;
+
+INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
+                "Unify multiple OpenCL metadata due to linking",
+                false, false)
+
+FunctionPass* llvm::createAMDGPUUnifyMetadataPass() {
+  return new AMDGPUUnifyMetadata();
+}
+
+bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
+  const char* Vers[] = {
+      kOCLMD::SpirVer,
+      kOCLMD::OCLVer
+  };
+  const char* Exts[] = {
+      kOCLMD::UsedExt,
+      kOCLMD::UsedOptCoreFeat,
+      kOCLMD::CompilerOptions,
+      kOCLMD::LLVMIdent
+  };
+
+  bool Changed = false;
+
+  for (auto &I:Vers)
+    Changed |= unifyVersionMD(M, I, true);
+
+  for (auto &I:Exts)
+    Changed |= unifyExtensionMD(M, I);
+
+  return Changed;
+}
+
+bool AMDGPUUnifyMetadata::runOnFunction(Function &F) {
+  return runOnModule(*F.getParent());
+}
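To make the selection rule in unifyVersionMD() concrete, here is a standalone scalar model of it (a sketch only, with invented sample values; it is not part of the pass). The version key is major*100 + minor*10; the largest key wins unless PickFirst is set, in which case the first entry, which represents the kernel module, is kept:

#include <utility>
#include <vector>

// Standalone model of the version-selection rule used by unifyVersionMD().
std::pair<unsigned, unsigned>
pickVersion(const std::vector<std::pair<unsigned, unsigned>> &Versions,
            bool PickFirst) {
  std::pair<unsigned, unsigned> Best{0, 0};
  unsigned BestKey = 0;
  for (const auto &V : Versions) {
    unsigned Key = V.first * 100 + V.second * 10; // same keying as the pass
    if (Key > BestKey) {
      BestKey = Key;
      Best = V;
    }
    if (PickFirst) // first operand represents the kernel module
      break;
  }
  return Best;
}

// pickVersion({{1, 2}, {2, 0}}, /*PickFirst=*/false) yields {2, 0};
// pickVersion({{1, 2}, {2, 0}}, /*PickFirst=*/true) yields {1, 2}.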
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 4ed2673..440d0fa 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -135,7 +135,11 @@
     ImmTyDA,
     ImmTyR128,
     ImmTyLWE,
+    ImmTyExpTgt,
+    ImmTyExpCompr,
+    ImmTyExpVM,
     ImmTyHwreg,
+    ImmTyOff,
     ImmTySendMsg,
   };
 
@@ -213,6 +217,10 @@
     return isRegOrImmWithInputMods(MVT::f64);
   }
 
+  bool isVReg32OrOff() const {
+    return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
+  }
+
   bool isImmTy(ImmTy ImmT) const {
     return isImm() && Imm.Type == ImmT;
   }
@@ -228,6 +236,10 @@
   bool isDA() const { return isImmTy(ImmTyDA); }
   bool isR128() const { return isImmTy(ImmTyUNorm); }
   bool isLWE() const { return isImmTy(ImmTyLWE); }
+  bool isOff() const { return isImmTy(ImmTyOff); }
+  bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
+  bool isExpVM() const { return isImmTy(ImmTyExpVM); }
+  bool isExpCompr() const { return isImmTy(ImmTyExpCompr); }
   bool isOffen() const { return isImmTy(ImmTyOffen); }
   bool isIdxen() const { return isImmTy(ImmTyIdxen); }
   bool isAddr64() const { return isImmTy(ImmTyAddr64); }
@@ -456,7 +468,7 @@
     }
   }
 
-  void printImmTy(raw_ostream& OS, ImmTy Type) const {
+  static void printImmTy(raw_ostream& OS, ImmTy Type) {
     switch (Type) {
     case ImmTyNone: OS << "None"; break;
     case ImmTyGDS: OS << "GDS"; break;
@@ -484,6 +496,10 @@
     case ImmTyDA: OS << "DA"; break;
     case ImmTyR128: OS << "R128"; break;
     case ImmTyLWE: OS << "LWE"; break;
+    case ImmTyOff: OS << "Off"; break;
+    case ImmTyExpTgt: OS << "ExpTgt"; break;
+    case ImmTyExpCompr: OS << "ExpCompr"; break;
+    case ImmTyExpVM: OS << "ExpVM"; break;
     case ImmTyHwreg: OS << "Hwreg"; break;
     case ImmTySendMsg: OS << "SendMsg"; break;
     }
@@ -650,6 +666,10 @@
     return AMDGPU::isVI(getSTI());
   }
 
+  bool hasInv2PiInlineImm() const {
+    return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+  }
+
   bool hasSGPR102_SGPR103() const {
     return !isVI();
   }
@@ -707,9 +727,11 @@
   OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
   OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
   OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
+  OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
 
   void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
   void cvtDS(MCInst &Inst, const OperandVector &Operands);
+  void cvtExp(MCInst &Inst, const OperandVector &Operands);
 
   bool parseCnt(int64_t &IntVal);
   OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
@@ -724,9 +746,14 @@
 
   bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
   bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+
+  void errorExpTgt();
+  OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+
 public:
   OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
 
+  OperandMatchResultTy parseExpTgt(OperandVector &Operands);
   OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
   OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
 
@@ -745,6 +772,9 @@
   AMDGPUOperand::Ptr defaultSMRDOffset8() const;
   AMDGPUOperand::Ptr defaultSMRDOffset20() const;
   AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
+  AMDGPUOperand::Ptr defaultExpTgt() const;
+  AMDGPUOperand::Ptr defaultExpCompr() const;
+  AMDGPUOperand::Ptr defaultExpVM() const;
 
   OperandMatchResultTy parseOModOperand(OperandVector &Operands);
 
@@ -778,12 +808,44 @@
   bool (*ConvertResult)(int64_t&);
 };
 
+// May be called with an integer type of equivalent bitwidth.
+static const fltSemantics *getFltSemantics(MVT VT) {
+  switch (VT.getSizeInBits()) {
+  case 32:
+    return &APFloat::IEEEsingle;
+  case 64:
+    return &APFloat::IEEEdouble;
+  case 16:
+    return &APFloat::IEEEhalf;
+  default:
+    llvm_unreachable("unsupported fp type");
+  }
+}
+
 }
 
 //===----------------------------------------------------------------------===//
 // Operand
 //===----------------------------------------------------------------------===//
 
+static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) {
+  bool Lost;
+
+  // Convert literal to the FP semantics of the requested type
+  APFloat::opStatus Status = FPLiteral.convert(*getFltSemantics(VT),
+                                               APFloat::rmNearestTiesToEven,
+                                               &Lost);
+  // We allow precision loss but not overflow or underflow
+  if (Status != APFloat::opOK &&
+      Lost &&
+      ((Status & APFloat::opOverflow)  != 0 ||
+       (Status & APFloat::opUnderflow) != 0)) {
+    return false;
+  }
+
+  return true;
+}
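For example (sample literals invented for illustration): with a target type of MVT::f32 this helper accepts 4.0, which converts exactly, and 0.1, whose conversion is merely inexact, but rejects 1e50 and 1e-50, which respectively overflow and underflow IEEEsingle.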
+
 bool AMDGPUOperand::isInlinableImm(MVT type) const {
   if (!isImmTy(ImmTyNone)) {
     // Only plain immediates are inlinable (e.g. "clamp" attribute is not)
@@ -797,36 +859,30 @@
 
   if (Imm.IsFPImm) { // We got fp literal token
     if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
-      return AMDGPU::isInlinableLiteral64(Imm.Val, AsmParser->isVI());
-    } else { // Expected 32-bit operand
-      bool lost;
-      APFloat FPLiteral(APFloat::IEEEdouble, Literal);
-      // Convert literal to single precision
-      APFloat::opStatus status = FPLiteral.convert(APFloat::IEEEsingle,
-                                                    APFloat::rmNearestTiesToEven,
-                                                    &lost);
-      // We allow precision lost but not overflow or underflow
-      if (status != APFloat::opOK &&
-          lost &&
-          ((status & APFloat::opOverflow)  != 0 ||
-            (status & APFloat::opUnderflow) != 0)) {
-        return false;
-      }
-      // Check if single precision literal is inlinable
-      return AMDGPU::isInlinableLiteral32(
-              static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
-              AsmParser->isVI());
+      return AMDGPU::isInlinableLiteral64(Imm.Val,
+                                          AsmParser->hasInv2PiInlineImm());
     }
-  } else { // We got int literal token
-    if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
-      return AMDGPU::isInlinableLiteral64(Imm.Val, AsmParser->isVI());
-    } else { // Expected 32-bit operand
-      return AMDGPU::isInlinableLiteral32(
-            static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
-            AsmParser->isVI());
-    }
+
+    APFloat FPLiteral(APFloat::IEEEdouble, APInt(64, Imm.Val));
+    if (!canLosslesslyConvertToFPType(FPLiteral, type))
+      return false;
+
+    // Check if single precision literal is inlinable
+    return AMDGPU::isInlinableLiteral32(
+      static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
+      AsmParser->hasInv2PiInlineImm());
   }
-  return false;
+
+
+  // We got int literal token.
+  if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
+    return AMDGPU::isInlinableLiteral64(Imm.Val,
+                                        AsmParser->hasInv2PiInlineImm());
+  }
+
+  return AMDGPU::isInlinableLiteral32(
+    static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
+    AsmParser->hasInv2PiInlineImm());
 }
 
 bool AMDGPUOperand::isLiteralImm(MVT type) const {
@@ -835,45 +891,28 @@
     return false;
   }
 
-  APInt Literal(64, Imm.Val);
+  if (!Imm.IsFPImm) {
+    // We got int literal token.
 
-  if (Imm.IsFPImm) { // We got fp literal token
-    if (type == MVT::f64) { // Expected 64-bit fp operand
-      // We would set low 64-bits of literal to zeroes but we accept this literals
-      return true;
-    } else if (type == MVT::i64) { // Expected 64-bit int operand
-        // We don't allow fp literals in 64-bit integer instructions. It is
-        // unclear how we should encode them.
-      return false;
-    } else { // Expected 32-bit operand
-      bool lost;
-      APFloat FPLiteral(APFloat::IEEEdouble, Literal);
-      // Convert literal to single precision
-      APFloat::opStatus status = FPLiteral.convert(APFloat::IEEEsingle,
-                                                    APFloat::rmNearestTiesToEven,
-                                                    &lost);
-      // We allow precision lost but not overflow or underflow
-      if (status != APFloat::opOK &&
-          lost &&
-          ((status & APFloat::opOverflow)  != 0 ||
-            (status & APFloat::opUnderflow) != 0)) {
-        return false;
-      }
-      return true;
-    }
-  } else { // We got int literal token
-    APInt HiBits = Literal.getHiBits(32);
-    if (HiBits == 0xffffffff &&
-        (*Literal.getLoBits(32).getRawData() & 0x80000000) != 0) {
-      // If high 32 bits aren't zeroes then they all should be ones and 32nd
-      // bit should be set. So that this 64-bit literal is sign-extension of
-      // 32-bit value.
-      return true;
-    } else if (HiBits == 0) {
-      return true;
-    }
+    // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
+    // types.
+    return isUInt<32>(Imm.Val) || isInt<32>(Imm.Val);
   }
-  return false;
+
+  // We got fp literal token
+  if (type == MVT::f64) { // Expected 64-bit fp operand
+    // We would set the low 64 bits of the literal to zeroes, but we accept such literals
+    return true;
+  }
+
+  if (type == MVT::i64) { // Expected 64-bit int operand
+    // We don't allow fp literals in 64-bit integer instructions. It is
+    // unclear how we should encode them.
+    return false;
+  }
+
+  APFloat FPLiteral(APFloat::IEEEdouble, APInt(64, Imm.Val));
+  return canLosslesslyConvertToFPType(FPLiteral, type);
 }
 
 bool AMDGPUOperand::isRegClass(unsigned RCID) const {
@@ -912,7 +951,8 @@
   if (Imm.IsFPImm) { // We got fp literal token
     if (OpSize == 8) { // Expected 64-bit operand
       // Check if literal is inlinable
-      if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->isVI())) {
+      if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
+                                       AsmParser->hasInv2PiInlineImm())) {
         Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
       } else if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
         // For fp operands we check if low 32 bits are zeros
@@ -941,13 +981,15 @@
   } else { // We got int literal token
     if (OpSize == 8) { // Expected 64-bit operand
       auto LiteralVal = Literal.getZExtValue();
-      if (AMDGPU::isInlinableLiteral64(LiteralVal, AsmParser->isVI())) {
+      if (AMDGPU::isInlinableLiteral64(LiteralVal,
+                                       AsmParser->hasInv2PiInlineImm())) {
         Inst.addOperand(MCOperand::createImm(LiteralVal));
         return;
       }
     } else { // Expected 32-bit operand
       auto LiteralVal = static_cast<int32_t>(Literal.getLoBits(32).getZExtValue());
-      if (AMDGPU::isInlinableLiteral32(LiteralVal, AsmParser->isVI())) {
+      if (AMDGPU::isInlinableLiteral32(LiteralVal,
+                                       AsmParser->hasInv2PiInlineImm())) {
         Inst.addOperand(MCOperand::createImm(LiteralVal));
         return;
       }
@@ -1350,9 +1392,28 @@
     AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
     Op.setModifiers(Mods);
   }
+
   return MatchOperand_Success;
 }
 
+OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
+  std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
+  if (Reg) {
+    Operands.push_back(std::move(Reg));
+    return MatchOperand_Success;
+  }
+
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.getString() == "off") {
+    Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(),
+                                                AMDGPUOperand::ImmTyOff, false));
+    Parser.Lex();
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
 unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
 
   uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
@@ -1992,6 +2053,46 @@
   Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
 }
 
+void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+
+  unsigned EnMask = 0;
+  int SrcIdx = 0;
+
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+    // Add the register arguments
+    if (Op.isReg()) {
+      EnMask |= (1 << SrcIdx);
+      Op.addRegOperands(Inst, 1);
+      ++SrcIdx;
+      continue;
+    }
+
+    if (Op.isOff()) {
+      ++SrcIdx;
+      Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+      continue;
+    }
+
+    if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyExpTgt) {
+      Op.addImmOperands(Inst, 1);
+      continue;
+    }
+
+    if (Op.isToken() && Op.getToken() == "done")
+      continue;
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr);
+
+  Inst.addOperand(MCOperand::createImm(EnMask));
+}
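As a worked example (the assembly text is illustrative): for "exp mrt0 v0, off, v2, v3 done vm", srcs 0, 2 and 3 are registers and src1 is "off", so the loop above produces EnMask = 0b1101. The "done" token is simply skipped here because it already selected the EXP_DONE variant via its asm string (see EXP_Helper in SIInstrInfo.td below), while "vm" and "compr" are re-added as optional immediates.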
 
 //===----------------------------------------------------------------------===//
 // s_waitcnt
@@ -2266,6 +2367,85 @@
   return false;
 }
 
+void AMDGPUAsmParser::errorExpTgt() {
+  Error(Parser.getTok().getLoc(), "invalid exp target");
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str,
+                                                      uint8_t &Val) {
+  if (Str == "null") {
+    Val = 9;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("mrt")) {
+    Str = Str.drop_front(3);
+    if (Str == "z") { // == mrtz
+      Val = 8;
+      return MatchOperand_Success;
+    }
+
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    if (Val > 7)
+      errorExpTgt();
+
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("pos")) {
+    Str = Str.drop_front(3);
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    if (Val > 3)
+      errorExpTgt();
+
+    Val += 12;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("param")) {
+    Str = Str.drop_front(5);
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    if (Val >= 32)
+      errorExpTgt();
+
+    Val += 32;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("invalid_target_")) {
+    Str = Str.drop_front(15);
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    errorExpTgt();
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
+  uint8_t Val;
+  StringRef Str = Parser.getTok().getString();
+
+  auto Res = parseExpTgtImpl(Str, Val);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex();
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S,
+                                              AMDGPUOperand::ImmTyExpTgt));
+  return MatchOperand_Success;
+}
+
 OperandMatchResultTy
 AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
   using namespace llvm::AMDGPU::SendMsg;
@@ -2531,6 +2711,18 @@
   return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE);
 }
 
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultExpTgt() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyExpTgt);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultExpCompr() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyExpCompr);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultExpVM() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyExpVM);
+}
+
 //===----------------------------------------------------------------------===//
 // smrd
 //===----------------------------------------------------------------------===//
@@ -2627,6 +2819,7 @@
   {"src0_sel",   AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
   {"src1_sel",   AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
   {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+  {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
 };
 
 OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
@@ -3109,6 +3302,9 @@
     return Operand.isSSrcF32() ? Match_Success : Match_InvalidOperand;
   case MCK_SoppBrTarget:
     return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand;
-  default: return Match_InvalidOperand;
+  case MCK_VReg32OrOff:
+    return Operand.isVReg32OrOff() ? Match_Success : Match_InvalidOperand;
+  default:
+    return Match_InvalidOperand;
   }
 }
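Taken together with printExpTgt() in the InstPrinter change below, the target encoding round-trips as: mrt0..mrt7 -> 0..7, mrtz -> 8, null -> 9, pos0..pos3 -> 12..15, and param0..param31 -> 32..63. Values 10 and 11 are reserved; out-of-range values print as invalid_target_N, and parsing that form reports an "invalid exp target" error.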
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index ae54fa3..02d4417 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -41,6 +41,7 @@
   AMDGPUISelDAGToDAG.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineFunction.cpp
+  AMDGPUUnifyMetadata.cpp
   AMDGPUOpenCLImageTypeLoweringPass.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index e20e6f5..81337f4 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -195,6 +195,20 @@
   printNamedBit(MI, OpNo, O, "lwe");
 }
 
+void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " compr";
+}
+
+void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " vm";
+}
+
 void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
                                         const MCRegisterInfo &MRI) {
   switch (RegNo) {
@@ -599,10 +613,72 @@
   }
 }
 
-void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNo,
+template <unsigned N>
+void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en);
+  unsigned En = MI->getOperand(EnIdx).getImm();
+
+  // FIXME: What do we do with compr? The meaning of en changes depending on
+  // whether compr is set.
+
+  if (En & (1 << N))
+    printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+  else
+    O << "off";
+}
+
+void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<0>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<1>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<2>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<3>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  // This is really a 6-bit field.
+  uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
+
+  if (Tgt <= 7)
+    O << " mrt" << Tgt;
+  else if (Tgt == 8)
+    O << " mrtz";
+  else if (Tgt == 9)
+    O << " null";
+  else if (Tgt >= 12 && Tgt <= 15)
+    O << " pos" << Tgt - 12;
+  else if (Tgt >= 32 && Tgt <= 63)
+    O << " param" << Tgt - 32;
+  else {
+    // Reserved values 10, 11
+    O << " invalid_target_" << Tgt;
+  }
+}
+
+void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
                                         const MCSubtargetInfo &STI,
                                         raw_ostream &O) {
-  unsigned Imm = MI->getOperand(OpNo).getImm();
+  unsigned Imm = MI->getOperand(OpNum).getImm();
 
   if (Imm == 2) {
     O << "P0";
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 960e658..9d6a203 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -78,8 +78,13 @@
                raw_ostream &O);
   void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                  raw_ostream &O);
-  void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
-                raw_ostream &O);
+  void printLWE(const MCInst *MI, unsigned OpNo,
+                const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpCompr(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpVM(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+
   void printRegOperand(unsigned RegNo, raw_ostream &O);
   void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                    raw_ostream &O);
@@ -116,6 +121,22 @@
                           const MCSubtargetInfo &STI, raw_ostream &O);
   void printMemOperand(const MCInst *MI, unsigned OpNo,
                        const MCSubtargetInfo &STI, raw_ostream &O);
+
+
+  template <unsigned N>
+  void printExpSrcN(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc0(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc1(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc2(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc3(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpTgt(const MCInst *MI, unsigned OpNo,
+                   const MCSubtargetInfo &STI, raw_ostream &O);
+
   static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
                          StringRef Asm, StringRef Default = "");
   static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 35e6c9d..9a0d2c1 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -443,7 +443,7 @@
         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
       };
-      return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
+      return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
     }
 
     // default for switch(IntrinsicID)
@@ -1882,7 +1882,7 @@
     return SDValue();
   }
 
-  case AMDGPUISD::EXPORT: {
+  case AMDGPUISD::R600_EXPORT: {
     SDValue Arg = N->getOperand(1);
     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
       break;
@@ -1898,7 +1898,7 @@
       N->getOperand(7) // SWZ_W
     };
     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
-    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
+    return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
   }
   case AMDGPUISD::TEXTURE_FETCH: {
     SDValue Arg = N->getOperand(1);
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index f843729..3a72e07 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -436,11 +436,6 @@
 // Export Instructions
 //===----------------------------------------------------------------------===//
 
-def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
-
-def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
-  [SDNPHasChain, SDNPSideEffect]>;
-
 class ExportWord0 {
   field bits<32> Word0;
 
@@ -486,7 +481,7 @@
 }
 
 multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
-  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+  def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
     (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
         (ExportInst R600_Reg128:$src, imm:$type, imm:$base,
         imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 5e6e754..479c6fc 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -15,7 +15,7 @@
 
 namespace SIInstrFlags {
 // This needs to be kept in sync with the field bits in InstSI.
-enum {
+enum : uint32_t {
   SALU = 1 << 3,
   VALU = 1 << 4,
 
@@ -38,15 +38,16 @@
   DS = 1 << 19,
   MIMG = 1 << 20,
   FLAT = 1 << 21,
-  WQM = 1 << 22,
-  VGPRSpill = 1 << 23,
-  SGPRSpill = 1 << 24,
-  VOPAsmPrefer32Bit = 1 << 25,
-  Gather4 = 1 << 26,
-  DisableWQM = 1 << 27,
-  SOPK_ZEXT = 1 << 28,
-  SCALAR_STORE = 1 << 29,
-  FIXED_SIZE = 1 << 30
+  EXP = 1 << 22,
+  WQM = 1 << 23,
+  VGPRSpill = 1 << 24,
+  SGPRSpill = 1 << 25,
+  VOPAsmPrefer32Bit = 1 << 26,
+  Gather4 = 1 << 27,
+  DisableWQM = 1 << 28,
+  SOPK_ZEXT = 1 << 29,
+  SCALAR_STORE = 1 << 30,
+  FIXED_SIZE = 1u << 31
 };
 }
 
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 8c4e0ad..6a422e7 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -294,6 +294,38 @@
   return false;
 }
 
+static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
+                                    const MachineInstr *MoveImm,
+                                    const SIInstrInfo *TII,
+                                    unsigned &SMovOp,
+                                    int64_t &Imm) {
+
+  if (!MoveImm->isMoveImmediate())
+    return false;
+
+  const MachineOperand *ImmOp =
+      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
+  if (!ImmOp->isImm())
+    return false;
+
+  // FIXME: Handle copies with sub-regs.
+  if (Copy->getOperand(0).getSubReg())
+    return false;
+
+  switch (MoveImm->getOpcode()) {
+  default:
+    return false;
+  case AMDGPU::V_MOV_B32_e32:
+    SMovOp = AMDGPU::S_MOV_B32;
+    break;
+  case AMDGPU::V_MOV_B64_PSEUDO:
+    SMovOp = AMDGPU::S_MOV_B64;
+    break;
+  }
+  Imm = ImmOp->getImm();
+  return true;
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -323,7 +355,17 @@
         const TargetRegisterClass *SrcRC, *DstRC;
         std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
         if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
-          DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI);
+          MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+          unsigned SMovOp;
+          int64_t Imm;
+          // If we are just copying an immediate, we can replace the copy with
+          // s_mov_b32.
+          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
+            MI.getOperand(1).ChangeToImmediate(Imm);
+            MI.addImplicitDefUseOperands(MF);
+            MI.setDesc(TII->get(SMovOp));
+            break;
+          }
           TII->moveToVALU(MI);
         }
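For illustration (register numbers invented): a VGPR-to-SGPR COPY whose source is defined by a V_MOV_B32_e32 of an immediate, and whose destination has no sub-register, is now rewritten in place into an S_MOV_B32 of that immediate instead of being handed to moveToVALU(); V_MOV_B64_PSEUDO is handled the same way via S_MOV_B64.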
 
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 7bf6ec2..3d59f8d 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,13 @@
       MI->setDesc(TII->get(Opc));
     }
 
+    // Special case for s_setreg_b32
+    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
+      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
+      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+      return true;
+    }
+
     // If we are already folding into another operand of MI, then
     // we can't commute the instruction, otherwise we risk making the
     // other fold illegal.
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ef61fc4..eeab482 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
@@ -276,7 +277,7 @@
     setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
 
     // F16 - Constant Actions.
-    setOperationAction(ISD::ConstantFP, MVT::f16, Custom);
+    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
 
     // F16 - Load/Store Actions.
     setOperationAction(ISD::LOAD, MVT::f16, Promote);
@@ -609,6 +610,13 @@
   return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
 }
 
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.noclobber");
+}
+
 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
@@ -1840,9 +1848,6 @@
   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
   case ISD::TRAP: return lowerTRAP(Op, DAG);
-
-  case ISD::ConstantFP:
-    return lowerConstantFP(Op, DAG);
   case ISD::FP_ROUND:
     return lowerFP_ROUND(Op, DAG);
   }
@@ -2047,15 +2052,6 @@
       DAG.getNode(ISD::FTRUNC, DL, VT, Op);
 }
 
-SDValue SITargetLowering::lowerConstantFP(SDValue Op, SelectionDAG &DAG) const {
-  if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op)) {
-    return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(),
-                           SDLoc(Op), MVT::i32);
-  }
-
-  return SDValue();
-}
-
 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getValueType() == MVT::f16 &&
          "Do not know how to custom lower FP_ROUND for non-f16 type");
@@ -2497,6 +2493,12 @@
     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
                              Op.getOperand(1), Op.getOperand(2), Glue);
   }
+  case Intrinsic::amdgcn_interp_mov: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+    SDValue Glue = M0.getValue(1);
+    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3), Glue);
+  }
   case Intrinsic::amdgcn_interp_p1: {
     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
     SDValue Glue = M0.getValue(1);
@@ -2683,6 +2685,29 @@
     SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
     return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
   }
+  case AMDGPUIntrinsic::SI_export: {
+    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
+    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
+    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
+    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
+    const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
+
+    const SDValue Ops[] = {
+      Chain,
+      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
+      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
+      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
+      DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
+      Op.getOperand(7), // src0
+      Op.getOperand(8), // src1
+      Op.getOperand(9), // src2
+      Op.getOperand(10) // src3
+    };
+
+    unsigned Opc = Done->isNullValue() ?
+      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+  }
   default:
     return SDValue();
   }
@@ -2743,11 +2768,19 @@
     if (isMemOpUniform(Load))
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
-    // have the same legalization requires ments as global and private
+    // have the same legalization requirements as global and private
     // loads.
     //
     LLVM_FALLTHROUGH;
-  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS: {
+    if (isMemOpUniform(Load) && isMemOpHasNoClobberedMemOperand(Load))
+      return SDValue();
+    // Non-uniform loads will be selected to MUBUF instructions, so they
+    // have the same legalization requirements as global and private
+    // loads.
+    //
+  }
+    LLVM_FALLTHROUGH;
   case AMDGPUAS::FLAT_ADDRESS:
     if (NumElements > 4)
       return SplitVectorLoad(Op, DAG);
@@ -2868,6 +2901,47 @@
   return SDValue();
 }
 
+static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+  if (GlueChain->getNumValues() <= 1) {
+    return DAG.getNode(Opcode, SL, VT, A, B);
+  }
+
+  assert(GlueChain->getNumValues() == 3);
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  switch (Opcode) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FMUL:
+    Opcode = AMDGPUISD::FMUL_W_CHAIN;
+    break;
+  }
+
+  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
+                     GlueChain.getValue(2));
+}
+
+static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                           EVT VT, SDValue A, SDValue B, SDValue C,
+                           SDValue GlueChain) {
+  if (GlueChain->getNumValues() <= 1) {
+    return DAG.getNode(Opcode, SL, VT, A, B, C);
+  }
+
+  assert(GlueChain->getNumValues() == 3);
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  switch (Opcode) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FMA:
+    Opcode = AMDGPUISD::FMA_W_CHAIN;
+    break;
+  }
+
+  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
+                     GlueChain.getValue(2));
+}
+
 // Faster 2.5 ULP division that does not support denormals.
 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
@@ -2914,25 +2988,73 @@
 
   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
 
-  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
-  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
+  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+                                          RHS, RHS, LHS);
+  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+                                        LHS, RHS, LHS);
 
   // Denominator is scaled to not be denormal, so using rcp is ok.
-  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+                                  DenominatorScaled);
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
+                                     DenominatorScaled);
 
-  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
+  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
+                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
 
-  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
-  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
+  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
 
-  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
+  if (!Subtarget->hasFP32Denormals()) {
+    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+                                                      SL, MVT::i32);
+    SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+                                       DAG.getEntryNode(),
+                                       EnableDenormValue, BitField);
+    SDValue Ops[3] = {
+      NegDivScale0,
+      EnableDenorm.getValue(0),
+      EnableDenorm.getValue(1)
+    };
 
-  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
-  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
-  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
+    NegDivScale0 = DAG.getMergeValues(Ops, SL);
+  }
+
+  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
+                             ApproxRcp, One, NegDivScale0);
+
+  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+                             ApproxRcp, Fma0);
+
+  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
+                           Fma1, Fma1);
+
+  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+                             NumeratorScaled, Mul);
+
+  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+
+  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+                             NumeratorScaled, Fma3);
+
+  if (!Subtarget->hasFP32Denormals()) {
+    const SDValue DisableDenormValue =
+        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+                                        Fma4.getValue(1),
+                                        DisableDenormValue,
+                                        BitField,
+                                        Fma4.getValue(2));
+
+    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                                      DisableDenorm, DAG.getRoot());
+    DAG.setRoot(OutputChain);
+  }
 
   SDValue Scale = NumeratorScaled.getValue(1);
-  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
+                             Fma4, Fma1, Fma3, Scale);
 
   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
 }
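The chained node sequence above is easier to follow as plain scalar arithmetic. The FMA_W_CHAIN/FMUL_W_CHAIN nodes exist so that, when the subtarget flushes f32 denormals, the s_setreg that temporarily enables denormals stays ordered around exactly this sequence. The sketch below is only a model of what Fma0..Fma4 compute (assumptions: rcp is approximated by 1.0f/Den, and div_scale, div_fmas, div_fixup and the denormal-mode toggling are all omitted); it is not how the backend emits code:

#include <cmath>

// Scalar model of the iterative refinement built in LowerFDIV32 above.
float fdivRefineModel(float Num, float Den) {
  float Rcp  = 1.0f / Den;                 // stands in for AMDGPUISD::RCP
  float Fma0 = std::fma(-Den, Rcp, 1.0f);  // NegDivScale0 * ApproxRcp + One
  float Fma1 = std::fma(Fma0, Rcp, Rcp);
  float Mul  = Num * Fma1;
  float Fma2 = std::fma(-Den, Mul, Num);
  float Fma3 = std::fma(Fma2, Fma1, Mul);
  float Fma4 = std::fma(-Den, Fma3, Num);
  // DIV_FMAS/DIV_FIXUP consume Fma4, Fma1, Fma3 plus the scale bit;
  // a final fma approximates that step here.
  return std::fma(Fma4, Fma1, Fma3);
}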
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 03846fd..cb6d536 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -53,9 +53,6 @@
                             const SDLoc &DL,
                             EVT VT) const;
 
-  /// \brief Custom lowering for ISD::ConstantFP.
-  SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
-
   /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
 
@@ -129,6 +126,7 @@
                           MachineFunction &MF) const override;
 
   bool isMemOpUniform(const SDNode *N) const;
+  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
   bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index 9df0838..91e4bf7 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -159,16 +159,15 @@
   MachineBasicBlock::iterator Insert = SkipBB->begin();
 
   // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
-    .addImm(0)
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
     .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addImm(0)
-    .addImm(1)
-    .addImm(1)
     .addReg(AMDGPU::VGPR0, RegState::Undef)
     .addReg(AMDGPU::VGPR0, RegState::Undef)
     .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef);
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addImm(1)  // vm
+    .addImm(0)  // compr
+    .addImm(0); // en
 
   // ... and terminate wavefront.
   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 7bec2b6..202a1e9 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -195,8 +195,7 @@
   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
 
   // Only consider stores or EXP for EXP_CNT
-  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
-      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
+  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
 
   // LGKM may use larger values
   if (TSFlags & SIInstrFlags::LGKM_CNT) {
@@ -238,9 +237,10 @@
   if (Op.isDef())
     return true;
 
-  // For exports all registers are relevant
+  // For exports all registers are relevant.
+  // TODO: Skip undef/disabled registers.
   MachineInstr &MI = *Op.getParent();
-  if (MI.getOpcode() == AMDGPU::EXP)
+  if (TII->isEXP(MI))
     return true;
 
   // For stores the stored value is also relevant
@@ -340,7 +340,7 @@
 
   // Remember which export instructions we have seen
   if (Increment.Named.EXP) {
-    ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
+    ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
   }
 
   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index b95f209..5f260ba 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -41,6 +41,7 @@
   field bit DS = 0;
   field bit MIMG = 0;
   field bit FLAT = 0;
+  field bit EXP = 0;
 
   // Whether WQM _must_ be enabled for this instruction.
   field bit WQM = 0;
@@ -96,15 +97,16 @@
   let TSFlags{19} = DS;
   let TSFlags{20} = MIMG;
   let TSFlags{21} = FLAT;
-  let TSFlags{22} = WQM;
-  let TSFlags{23} = VGPRSpill;
-  let TSFlags{24} = SGPRSpill;
-  let TSFlags{25} = VOPAsmPrefer32Bit;
-  let TSFlags{26} = Gather4;
-  let TSFlags{27} = DisableWQM;
-  let TSFlags{28} = SOPKZext;
-  let TSFlags{29} = ScalarStore;
-  let TSFlags{30} = FixedSize;
+  let TSFlags{22} = EXP;
+  let TSFlags{23} = WQM;
+  let TSFlags{24} = VGPRSpill;
+  let TSFlags{25} = SGPRSpill;
+  let TSFlags{26} = VOPAsmPrefer32Bit;
+  let TSFlags{27} = Gather4;
+  let TSFlags{28} = DisableWQM;
+  let TSFlags{29} = SOPKZext;
+  let TSFlags{30} = ScalarStore;
+  let TSFlags{31} = FixedSize;
 
   let SchedRW = [Write32Bit];
 
@@ -232,6 +234,17 @@
   let hasSideEffects = 0;
 }
 
+class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
+  InstSI<outs, ins, asm, pattern> {
+  let EXP = 1;
+  let EXP_CNT = 1;
+  let mayLoad = 0; // Set to 1 if done bit is set.
+  let mayStore = 1;
+  let UseNamedOperandTable = 1;
+  let Uses = [EXEC];
+  let SchedRW = [WriteExport];
+}
+
 } // End Uses = [EXEC]
 
 class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index f4b94b3..9071ded 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1669,48 +1669,22 @@
   // boundaries prevents incorrect movements of such instructions.
   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
          changesVGPRIndexingMode(MI);
 }
 
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
-  int64_t SVal = Imm.getSExtValue();
-  if (SVal >= -16 && SVal <= 64)
-    return true;
-
-  if (Imm.getBitWidth() == 64) {
-    uint64_t Val = Imm.getZExtValue();
-    return (DoubleToBits(0.0) == Val) ||
-           (DoubleToBits(1.0) == Val) ||
-           (DoubleToBits(-1.0) == Val) ||
-           (DoubleToBits(0.5) == Val) ||
-           (DoubleToBits(-0.5) == Val) ||
-           (DoubleToBits(2.0) == Val) ||
-           (DoubleToBits(-2.0) == Val) ||
-           (DoubleToBits(4.0) == Val) ||
-           (DoubleToBits(-4.0) == Val) ||
-           (ST.hasInv2PiInlineImm() && Val == 0x3fc45f306dc9c882);
+  switch (Imm.getBitWidth()) {
+  case 32:
+    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
+                                        ST.hasInv2PiInlineImm());
+  case 64:
+    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
+                                        ST.hasInv2PiInlineImm());
+  default:
+    llvm_unreachable("invalid bitwidth");
   }
-
-  // The actual type of the operand does not seem to matter as long
-  // as the bits match one of the inline immediate values.  For example:
-  //
-  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
-  // so it is a legal inline immediate.
-  //
-  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
-  // floating-point, so it is a legal inline immediate.
-  uint32_t Val = Imm.getZExtValue();
-
-  return (FloatToBits(0.0f) == Val) ||
-         (FloatToBits(1.0f) == Val) ||
-         (FloatToBits(-1.0f) == Val) ||
-         (FloatToBits(0.5f) == Val) ||
-         (FloatToBits(-0.5f) == Val) ||
-         (FloatToBits(2.0f) == Val) ||
-         (FloatToBits(-2.0f) == Val) ||
-         (FloatToBits(4.0f) == Val) ||
-         (FloatToBits(-4.0f) == Val) ||
-         (ST.hasInv2PiInlineImm() && Val == 0x3e22f983);
 }
 
 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
@@ -1721,9 +1695,16 @@
     // 32-bit floating point immediate bit pattern is legal for an integer
     // immediate. It would be for any 32-bit integer operand, but would not be
     // for a 64-bit one.
-
-    unsigned BitSize = 8 * OpSize;
-    return isInlineConstant(APInt(BitSize, MO.getImm(), true));
+    switch (OpSize) {
+    case 4:
+      return AMDGPU::isInlinableLiteral32(static_cast<int32_t>(MO.getImm()),
+                                          ST.hasInv2PiInlineImm());
+    case 8:
+      return AMDGPU::isInlinableLiteral64(MO.getImm(),
+                                          ST.hasInv2PiInlineImm());
+    default:
+      llvm_unreachable("invalid bitwidth");
+    }
   }
 
   return false;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 71f1968..0f16fa0 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -372,6 +372,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::FLAT;
   }
 
+  static bool isEXP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::EXP;
+  }
+
+  bool isEXP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::EXP;
+  }
+
   static bool isWQM(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::WQM;
   }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 0f30d7b..aeef7ac 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -315,6 +315,13 @@
   let RenderMethod = "addImmOperands";
 }
 
+def ExpTgtMatchClass : AsmOperandClass {
+  let Name = "ExpTgt";
+  let PredicateMethod = "isExpTgt";
+  let ParserMethod = "parseExpTgt";
+  let RenderMethod = "printExpTgt";
+}
+
 def SendMsgImm : Operand<i32> {
   let PrintMethod = "printSendMsg";
   let ParserMatchClass = SendMsgMatchClass;
@@ -326,6 +333,11 @@
   let ParserMethod = "parseSWaitCntOps";
 }
 
+def VReg32OrOffClass : AsmOperandClass {
+  let Name = "VReg32OrOff";
+  let ParserMethod = "parseVReg32OrOff";
+}
+
 def WAIT_FLAG : Operand <i32> {
   let ParserMatchClass = SWaitMatchClass;
   let PrintMethod = "printWaitFlag";
@@ -334,6 +346,31 @@
 include "SIInstrFormats.td"
 include "VIInstrFormats.td"
 
+//===----------------------------------------------------------------------===//
+// ExpSrc*: special cases for exp source operands which are printed as
+// "off" depending on the en operand.
+//===----------------------------------------------------------------------===//
+
+def ExpSrc0 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc0";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc1 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc1";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc2 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc2";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc3 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc3";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
 class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
   let Name = "Imm"#CName;
   let PredicateMethod = "is"#CName;
@@ -385,6 +422,8 @@
 def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
 def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
 def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
+def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
+def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
 
 def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
 
@@ -400,6 +439,10 @@
 
 def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
 
+def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
+
+}
+
 } // End OperandType = "OPERAND_IMMEDIATE"
 
 
@@ -518,32 +561,41 @@
 // EXP classes
 //===----------------------------------------------------------------------===//
 
-class EXPCommon : InstSI<
+class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
   (outs),
-  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
-       VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3),
-  "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
-  [] > {
-
-  let EXP_CNT = 1;
-  let Uses = [EXEC];
-  let SchedRW = [WriteExport];
+  (ins exp_tgt:$tgt,
+       ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
+       exp_vm:$vm, exp_compr:$compr, i8imm:$en),
+  "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
+  [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr),
+         f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> {
+  let AsmMatchConverter = "cvtExp";
 }
 
-multiclass EXP_m {
+// Split EXP instruction into EXP and EXP_DONE so we can set
+// mayLoad for done=1.
+multiclass EXP_m<bit done, SDPatternOperator node> {
+  let mayLoad = done in {
+    let isPseudo = 1, isCodeGenOnly = 1 in {
+      def "" : EXP_Helper<done, node>,
+               SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
+    }
 
-  let isPseudo = 1, isCodeGenOnly = 1 in {
-    def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ;
-  }
+    let done = done in {
+      def _si : EXP_Helper<done>,
+                SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
+                EXPe {
+        let DecoderNamespace = "SICI";
+        let DisableDecoder = DisableSIDecoder;
+      }
 
-  def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe {
-    let DecoderNamespace="SICI";
-    let DisableDecoder = DisableSIDecoder;
-  }
-
-  def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi {
-    let DecoderNamespace="VI";
-    let DisableDecoder = DisableVIDecoder;
+      def _vi : EXP_Helper<done>,
+                SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
+                EXPe_vi {
+        let DecoderNamespace = "VI";
+        let DisableDecoder = DisableVIDecoder;
+      }
+    }
   }
 }
 
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 0aa8e19..93e7bcd 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -37,7 +37,8 @@
 // EXP Instructions
 //===----------------------------------------------------------------------===//
 
-defm EXP : EXP_m;
+defm EXP : EXP_m<0, AMDGPUexport>;
+defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
 
 //===----------------------------------------------------------------------===//
 // VINTRP Instructions
@@ -49,11 +50,11 @@
 
 multiclass V_INTERP_P1_F32_m : VINTRP_m <
   0x00000000,
-  (outs VGPR_32:$dst),
+  (outs VGPR_32:$vdst),
   (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr),
-  "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]",
-  [(set f32:$dst, (AMDGPUinterp_p1 f32:$i, (i32 imm:$attr_chan),
-                                           (i32 imm:$attr)))]
+  "v_interp_p1_f32 $vdst, $i, $attr_chan, $attr, [m0]",
+  [(set f32:$vdst, (AMDGPUinterp_p1 f32:$i, (i32 imm:$attr_chan),
+                                            (i32 imm:$attr)))]
 >;
 
 let OtherPredicates = [has32BankLDS] in {
@@ -62,31 +63,31 @@
 
 } // End OtherPredicates = [has32BankLDS]
 
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 in {
+let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
 
 defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
 
-} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1
+} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
 
-let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in {
+let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
 
 defm V_INTERP_P2_F32 : VINTRP_m <
   0x00000001,
-  (outs VGPR_32:$dst),
+  (outs VGPR_32:$vdst),
   (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr),
-  "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]",
-  [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, f32:$j, (i32 imm:$attr_chan),
-                                                     (i32 imm:$attr)))]>;
+  "v_interp_p2_f32 $vdst, [$src0], $j, $attr_chan, $attr, [m0]",
+  [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$j, (i32 imm:$attr_chan),
+                                                       (i32 imm:$attr)))]>;
 
-} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst"
+} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
 
 defm V_INTERP_MOV_F32 : VINTRP_m <
   0x00000002,
-  (outs VGPR_32:$dst),
+  (outs VGPR_32:$vdst),
   (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr),
-  "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]",
-  [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan),
-                                    (i32 imm:$attr)))]>;
+  "v_interp_mov_f32 $vdst, $src0, $attr_chan, $attr, [m0]",
+  [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan),
+                                     (i32 imm:$attr)))]>;
 
 } // End Uses = [M0, EXEC]
 
@@ -388,13 +389,6 @@
   (SI_KILL (i32 0xbf800000))
 >;
 
-def : Pat <
-  (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
-                 f32:$src0, f32:$src1, f32:$src2, f32:$src3),
-  (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
-       $src0, $src1, $src2, $src3)
->;
-
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
@@ -712,12 +706,25 @@
   (S_MOV_B32 imm:$imm)
 >;
 
+// FIXME: Workaround for an ordering issue with the peephole optimizer where
+// a register class copy interferes with immediate folding. This should
+// use s_mov_b32, which can be shrunk to s_movk_i32.
+def : Pat <
+  (VGPRImm<(f16 fpimm)>:$imm),
+  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
 def : Pat <
   (f32 fpimm:$imm),
   (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : Pat <
+  (f16 fpimm:$imm),
+  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
  (i32 frameindex:$fi),
  (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
 >;
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index b2857f0..5da3754 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -15,7 +15,20 @@
 
 let TargetPrefix = "SI", isTarget = 1 in {
   def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
-  def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+
+  def int_SI_export : Intrinsic <[],
+    [llvm_i32_ty,   // en
+    llvm_i32_ty,    // vm   (FIXME: should be i1)
+    llvm_i32_ty,    // done (FIXME: should be i1)
+    llvm_i32_ty,    // tgt
+    llvm_i32_ty,    // compr (FIXME: should be i1)
+    llvm_float_ty,  // src0
+    llvm_float_ty,  // src1
+    llvm_float_ty,  // src2
+    llvm_float_ty], // src3
+    []
+  >;
+
   def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
 
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 2898f68..440ce1b 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -203,6 +203,14 @@
   return true;
 }
 
+int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
+  assert(SIInstrInfo::isMUBUF(*MI));
+
+  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                          AMDGPU::OpName::offset);
+  return MI->getOperand(OffIdx).getImm();
+}
+
 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                  int Idx) const {
   if (!SIInstrInfo::isMUBUF(*MI))
@@ -212,13 +220,16 @@
                                            AMDGPU::OpName::vaddr) &&
          "Should never see frame index on non-address operand");
 
-  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                          AMDGPU::OpName::offset);
-  return MI->getOperand(OffIdx).getImm();
+  return getMUBUFInstrOffset(MI);
 }
 
 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
-  return MI->mayLoadOrStore();
+  if (!MI->mayLoadOrStore())
+    return false;
+
+  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
+
+  return !isUInt<12>(FullOffset);
 }
 
 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
@@ -295,7 +306,12 @@
 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                         unsigned BaseReg,
                                         int64_t Offset) const {
-  return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset);
+  if (!SIInstrInfo::isMUBUF(*MI))
+    return false;
+
+  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
+
+  return isUInt<12>(NewOffset);
 }
 
 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -1165,6 +1181,8 @@
 unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST) const {
   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     return 6; // VCC, FLAT_SCRATCH, XNACK.
+  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+    return 4; // VCC, FLAT_SCRATCH.
   return 2; // VCC.
 }
 
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index bd83ef1..477d232 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -56,6 +56,8 @@
   bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
   bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
 
+  int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+
   int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
                                    int Idx) const override;
 
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 1ae3645..0265648 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -222,11 +222,15 @@
 // Scalar Memory Patterns
 //===----------------------------------------------------------------------===//
 
+
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
   auto Ld = cast<LoadSDNode>(N);
   return Ld->getAlignment() >= 4  &&
-    Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+    ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+    (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
 }]>;
 
 def SMRDImm         : ComplexPattern<i64, 2, "SelectSMRDImm">;
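
In words, the widened smrd_load predicate above selects a scalar (SMEM) load either for a uniform constant-address load, or, when the subtarget opts into scalarizing global loads, for a uniform global-address load whose memory operand is known not to be clobbered. A small sketch of that boolean condition follows; the field names are placeholders standing in for the queries the TableGen predicate performs, not real APIs.

// Illustrative restatement of the smrd_load predicate's condition.
struct LoadInfo {
  bool AlignedTo4;        // Ld->getAlignment() >= 4
  bool ConstantAddrSpace; // CONSTANT_ADDRESS
  bool GlobalAddrSpace;   // GLOBAL_ADDRESS
  bool Uniform;           // isMemOpUniform(N)
  bool NoClobber;         // isMemOpHasNoClobberedMemOperand(N)
};

static bool selectScalarLoad(const LoadInfo &L, bool ScalarizeGlobal) {
  if (!L.AlignedTo4)
    return false;
  bool UniformConstant = L.ConstantAddrSpace && L.Uniform;
  bool UniformGlobal =
      ScalarizeGlobal && L.GlobalAddrSpace && L.Uniform && L.NoClobber;
  return UniformConstant || UniformGlobal;
}

int main() {
  LoadInfo ConstLd{true, true, false, true, false};
  LoadInfo GlobalLd{true, false, true, true, true};
  return (selectScalarLoad(ConstLd, false) &&
          selectScalarLoad(GlobalLd, true) &&
          !selectScalarLoad(GlobalLd, false)) ? 0 : 1;
}
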
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 2486fbf..0aeb129 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -590,10 +590,13 @@
 >;
 }
 
+let hasSideEffects = 1 in {
+
 def S_SETREG_B32 : SOPK_Pseudo <
   "s_setreg_b32",
   (outs), (ins SReg_32:$sdst, hwreg:$simm16),
-  "$simm16, $sdst"
+  "$simm16, $sdst",
+  [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
 >;
 
 // FIXME: Not on SI?
@@ -607,6 +610,7 @@
   let has_sdst = 0;
 }
 
+} // End hasSideEffects = 1
 
 //===----------------------------------------------------------------------===//
 // SOPC Instructions
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index fb27675..29cac2f 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -392,40 +392,38 @@
   return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
 }
 
-bool isInlinableLiteral64(int64_t Literal, bool IsVI) {
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
   if (Literal >= -16 && Literal <= 64)
     return true;
 
-  double D = BitsToDouble(Literal);
-
-  if (D == 0.5 || D == -0.5 ||
-      D == 1.0 || D == -1.0 ||
-      D == 2.0 || D == -2.0 ||
-      D == 4.0 || D == -4.0)
-    return true;
-
-  if (IsVI && Literal == 0x3fc45f306dc9c882)
-    return true;
-
-  return false;
+  uint64_t Val = static_cast<uint64_t>(Literal);
+  return (Val == DoubleToBits(0.0)) ||
+         (Val == DoubleToBits(1.0)) ||
+         (Val == DoubleToBits(-1.0)) ||
+         (Val == DoubleToBits(0.5)) ||
+         (Val == DoubleToBits(-0.5)) ||
+         (Val == DoubleToBits(2.0)) ||
+         (Val == DoubleToBits(-2.0)) ||
+         (Val == DoubleToBits(4.0)) ||
+         (Val == DoubleToBits(-4.0)) ||
+         (Val == 0x3fc45f306dc9c882 && HasInv2Pi);
 }
 
-bool isInlinableLiteral32(int32_t Literal, bool IsVI) {
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
   if (Literal >= -16 && Literal <= 64)
     return true;
 
-  float F = BitsToFloat(Literal);
-
-  if (F == 0.5 || F == -0.5 ||
-      F == 1.0 || F == -1.0 ||
-      F == 2.0 || F == -2.0 ||
-      F == 4.0 || F == -4.0)
-    return true;
-
-  if (IsVI && Literal == 0x3e22f983)
-    return true;
-
-  return false;
+  uint32_t Val = static_cast<uint32_t>(Literal);
+  return (Val == FloatToBits(0.0f)) ||
+         (Val == FloatToBits(1.0f)) ||
+         (Val == FloatToBits(-1.0f)) ||
+         (Val == FloatToBits(0.5f)) ||
+         (Val == FloatToBits(-0.5f)) ||
+         (Val == FloatToBits(2.0f)) ||
+         (Val == FloatToBits(-2.0f)) ||
+         (Val == FloatToBits(4.0f)) ||
+         (Val == FloatToBits(-4.0f)) ||
+         (Val == 0x3e22f983 && HasInv2Pi);
 }
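
As an aside, the inline-immediate test above reduces to a pure bit-pattern comparison. Below is a minimal standalone sketch of the 32-bit case, assuming only the values visible in this patch (the integer range -16..64, the small power-of-two floats, and the 1/(2*pi) encoding gated on HasInv2Pi); it is illustrative only and not the in-tree implementation.

// Illustrative sketch mirroring the 32-bit inline-immediate check above.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t floatBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // same idea as FloatToBits()
  return Bits;
}

static bool isInlinableLiteral32Sketch(int32_t Literal, bool HasInv2Pi) {
  if (Literal >= -16 && Literal <= 64) // small integer immediates
    return true;
  uint32_t Val = static_cast<uint32_t>(Literal);
  for (float F : {0.0f, 1.0f, -1.0f, 0.5f, -0.5f, 2.0f, -2.0f, 4.0f, -4.0f})
    if (Val == floatBits(F))
      return true;
  return HasInv2Pi && Val == 0x3e22f983; // 1/(2*pi), VI and later only
}

int main() {
  assert(isInlinableLiteral32Sketch(64, false));
  assert(isInlinableLiteral32Sketch(0x40800000, false)); // bit pattern of 4.0f
  assert(!isInlinableLiteral32Sketch(0x3e22f983, false));
  assert(isInlinableLiteral32Sketch(0x3e22f983, true));
  return 0;
}
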
 
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 484f681..3101b96 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -168,8 +168,12 @@
                            unsigned OpNo);
 
 /// \brief Is this literal inlinable
-bool isInlinableLiteral64(int64_t Literal, bool IsVI);
-bool isInlinableLiteral32(int32_t Literal, bool IsVI);
+LLVM_READNONE
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
+
 
 } // end namespace AMDGPU
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index dec133c..0e87f90 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -349,7 +349,7 @@
 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16>;
+defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
 defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
 defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
@@ -416,27 +416,27 @@
 
 let Predicates = [isVI] in {
 
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e32>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e32>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e32>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e32>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e32>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e32>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e32>;
+defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
 
 def : Pat <
   (and i16:$src0, i16:$src1),
-  (V_AND_B32_e32 $src0, $src1)
+  (V_AND_B32_e64 $src0, $src1)
 >;
 
 def : Pat <
   (or i16:$src0, i16:$src1),
-  (V_OR_B32_e32 $src0, $src1)
+  (V_OR_B32_e64 $src0, $src1)
 >;
 
 def : Pat <
   (xor i16:$src0, i16:$src1),
-  (V_XOR_B32_e32 $src0, $src1)
+  (V_XOR_B32_e64 $src0, $src1)
 >;
 
 defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e32>;
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index f638b94..48ab491 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -560,7 +560,7 @@
     MachineBasicBlock::const_iterator Before) {
   // Initialize if we never queried in this block.
   if (!LiveRegsValid) {
-    LiveRegs.init(TRI);
+    LiveRegs.init(*TRI);
     LiveRegs.addLiveOuts(MBB);
     LiveRegPos = MBB.end();
     LiveRegsValid = true;
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 9950dae..61062e4 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -523,6 +523,7 @@
     Match_RequiresV6,
     Match_RequiresThumb2,
     Match_RequiresV8,
+    Match_RequiresFlagSetting,
 #define GET_OPERAND_DIAGNOSTIC_TYPES
 #include "ARMGenAsmMatcher.inc"
 
@@ -8905,7 +8906,7 @@
       ;
     // If we're parsing Thumb1, reject it completely.
     if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
-      return Match_MnemonicFail;
+      return Match_RequiresFlagSetting;
     // If we're parsing Thumb2, which form is legal depends on whether we're
     // in an IT block.
     if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
@@ -9171,6 +9172,8 @@
     return Error(IDLoc, "instruction variant requires Thumb2");
   case Match_RequiresV8:
     return Error(IDLoc, "instruction variant requires ARMv8 or later");
+  case Match_RequiresFlagSetting:
+    return Error(IDLoc, "no flag-preserving variant of this instruction available");
   case Match_ImmRange0_15: {
     SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
     if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
diff --git a/lib/Target/AVR/AVR.h b/lib/Target/AVR/AVR.h
index 041c77c..0b4ba45 100644
--- a/lib/Target/AVR/AVR.h
+++ b/lib/Target/AVR/AVR.h
@@ -30,6 +30,8 @@
 FunctionPass *createAVRDynAllocaSRPass();
 FunctionPass *createAVRBranchSelectionPass();
 
+void initializeAVRExpandPseudoPass(PassRegistry&);
+
 /// Contains the AVR backend.
 namespace AVR {
 
diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index a50e2fd..ea4a179 100644
--- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -25,6 +25,8 @@
 
 using namespace llvm;
 
+#define AVR_EXPAND_PSEUDO_NAME "AVR pseudo instruction expansion pass"
+
 namespace {
 
 /// Expands "placeholder" instructions marked as pseudo into
@@ -33,13 +35,13 @@
 public:
   static char ID;
 
-  AVRExpandPseudo() : MachineFunctionPass(ID) {}
+  AVRExpandPseudo() : MachineFunctionPass(ID) {
+    initializeAVRExpandPseudoPass(*PassRegistry::getPassRegistry());
+  }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  StringRef getPassName() const override {
-    return "AVR pseudo instruction expansion pass";
-  }
+  StringRef getPassName() const override { return AVR_EXPAND_PSEUDO_NAME; }
 
 private:
   typedef MachineBasicBlock Block;
@@ -653,18 +655,47 @@
   TRI->splitReg(DstReg, DstLoReg, DstHiReg);
 
   assert(Imm < 63 && "Offset is out of range");
-  assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
+
+  unsigned TmpLoReg = DstLoReg;
+  unsigned TmpHiReg = DstHiReg;
+
+  // HACK: We shouldn't have instances of this instruction
+  // where src==dest because the instruction itself is
+  // marked earlyclobber. We do however get this instruction when
+  // loading from stack slots where the earlyclobber isn't useful.
+  //
+  // In this case, just use a temporary register.
+  if (DstReg == SrcReg) {
+    TmpLoReg = SCRATCH_REGISTER;
+    TmpHiReg = SCRATCH_REGISTER;
+  }
 
   auto MIBLO = buildMI(MBB, MBBI, OpLo)
-    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(TmpLoReg, RegState::Define | getDeadRegState(DstIsDead))
     .addReg(SrcReg)
     .addImm(Imm);
 
+  // Push the low result (now in the scratch register) onto the stack.
+  if (TmpLoReg != DstLoReg)
+    buildMI(MBB, MBBI, AVR::PUSHRr)
+      .addReg(AVR::R0);
+
   auto MIBHI = buildMI(MBB, MBBI, OpHi)
-    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(TmpHiReg, RegState::Define | getDeadRegState(DstIsDead))
     .addReg(SrcReg, getKillRegState(SrcIsKill))
     .addImm(Imm + 1);
 
+  // If we had to use a temporary register, move the results into place.
+  if (TmpHiReg != DstHiReg) {
+    // Move the hi result from the tmp register to the destination.
+    buildMI(MBB, MBBI, AVR::MOVRdRr)
+      .addReg(DstHiReg).addReg(SCRATCH_REGISTER);
+
+    // Pop the lo result calculated previously and put it into
+    // the lo destination.
+    buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg);
+  }
+
   MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
   MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
 
@@ -672,6 +703,16 @@
   return true;
 }
 
+template <>
+bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
+  llvm_unreachable("wide LPM is unimplemented");
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LPMWRdZPi>(Block &MBB, BlockIt MBBI) {
+  llvm_unreachable("wide LPMPi is unimplemented");
+}
+
 template<typename Func>
 bool AVRExpandPseudo::expandAtomic(Block &MBB, BlockIt MBBI, Func f) {
   // Remove the pseudo instruction.
@@ -1384,6 +1425,8 @@
     EXPAND(AVR::LDWRdPtrPd);
   case AVR::LDDWRdYQ: //:FIXME: remove this once PR13375 gets fixed
     EXPAND(AVR::LDDWRdPtrQ);
+    EXPAND(AVR::LPMWRdZ);
+    EXPAND(AVR::LPMWRdZPi);
     EXPAND(AVR::AtomicLoad8);
     EXPAND(AVR::AtomicLoad16);
     EXPAND(AVR::AtomicStore8);
@@ -1424,6 +1467,8 @@
 
 } // end of anonymous namespace
 
+INITIALIZE_PASS(AVRExpandPseudo, "avr-expand-pseudo",
+                AVR_EXPAND_PSEUDO_NAME, false, false)
 namespace llvm {
 
 FunctionPass *createAVRExpandPseudoPass() { return new AVRExpandPseudo(); }
diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp
index d190a60..098ee61 100644
--- a/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -366,6 +366,8 @@
     return selectIndexedLoad(N);
   }
 
+  assert(Subtarget->hasLPM() && "cannot load from program memory on this mcu");
+
   // This is a flash memory load, move the pointer into R31R30 and emit
   // the lpm instruction.
   MVT VT = LD->getMemoryVT().getSimpleVT();
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index e12e38e..53668f0 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -84,8 +84,8 @@
 
   setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
   setOperationAction(ISD::SETCC, MVT::i8, Custom);
   setOperationAction(ISD::SETCC, MVT::i16, Custom);
   setOperationAction(ISD::SETCC, MVT::i32, Custom);
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index f30cbf0..bc66379 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -1207,6 +1207,7 @@
 let canFoldAsLoad = 1,
 isReMaterializable = 1 in
 {
+  let Constraints = "@earlyclobber $reg" in
   def LDDRdPtrQ : FSTDLDD<0,
                           (outs GPR8:$reg),
                           (ins memri:$memri),
@@ -1227,7 +1228,8 @@
                    Requires<[HasSRAM]>;
 
   let mayLoad = 1,
-  hasSideEffects = 0 in
+  hasSideEffects = 0,
+  Constraints = "@earlyclobber $dst" in
   def LDDWRdYQ : Pseudo<(outs DREGS:$dst),
                         (ins memri:$memri),
                         "lddw\t$dst, $memri",
@@ -1500,7 +1502,7 @@
                      [(set i8:$dst, (load ioaddr8:$src))]>;
 
   def INWRdA : Pseudo<(outs DREGS:$dst),
-                      (ins i16imm:$src),
+                      (ins imm_port6:$src),
                       "inw\t$dst, $src",
                       [(set i16:$dst, (load ioaddr16:$src))]>;
 }
@@ -1512,7 +1514,7 @@
                     [(store i8:$src, ioaddr8:$dst)]>;
 
 def OUTWARr : Pseudo<(outs),
-                     (ins i16imm:$dst, DREGS:$src),
+                     (ins imm_port6:$dst, DREGS:$src),
                      "outw\t$dst, $src",
                      [(store i16:$src, ioaddr16:$dst)]>;
 
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
index a5381d8..4189a24 100644
--- a/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -77,6 +77,9 @@
 extern "C" void LLVMInitializeAVRTarget() {
   // Register the target.
   RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
+
+  auto &PR = *PassRegistry::getPassRegistry();
+  initializeAVRExpandPseudoPass(PR);
 }
 
 const AVRSubtarget *AVRTargetMachine::getSubtargetImpl() const {
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 02b0300..044db10 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -1,5 +1,7 @@
 list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen)
 
+list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target)
+
 add_llvm_library(LLVMTarget
   Target.cpp
   TargetIntrinsicInfo.cpp
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index c696eb6..963b04b 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -651,10 +651,10 @@
     SuperReg = *SR;
   }
 
-  uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
   const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
-  LaneBitmask SuperMask = RR.Mask &
-                          TRI.composeSubRegIndexLaneMask(Sub, RC.LaneMask);
+  LaneBitmask Common = RR.Mask & RC.LaneMask;
+  uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
+  LaneBitmask SuperMask = TRI.composeSubRegIndexLaneMask(Sub, Common);
   return RegisterRef(SuperReg, SuperMask);
 }
 
@@ -1206,12 +1206,36 @@
   while (UMA.isValid() && UMB.isValid()) {
     std::pair<uint32_t,LaneBitmask> PA = *UMA;
     std::pair<uint32_t,LaneBitmask> PB = *UMB;
-    // If the returned lane mask is 0, it should be treated as ~0
-    // (or the lane mask from the given register ref should be ignored).
-    // This can happen when a register has only one unit.
     if (PA.first == PB.first) {
-      if (!PA.second || !PB.second || (PA.second & PB.second))
+      // Lane mask of 0 (given by the iterator) should be treated as "full".
+      // This can happen when the register has only one unit, or when the
+      // unit corresponds to explicit aliasing. In such cases, the lane mask
+      // from RegisterRef should be ignored.
+      if (!PA.second || !PB.second)
         return true;
+
+      // At this point the common unit corresponds to a subregister. The lane
+      // masks correspond to the lane mask of that unit within the original
+      // register. For example, given a register quadruple q0 = r3:0 and
+      // a register pair d1 = r3:2, the lane mask of r2 in q0 may be 0b0100,
+      // while the lane mask of r2 in d1 may be 0b0001.
+      LaneBitmask LA = PA.second & RA.Mask;
+      LaneBitmask LB = PB.second & RB.Mask;
+      if (LA != 0 && LB != 0) {
+        unsigned Root = *MCRegUnitRootIterator(PA.first, &TRI);
+        // If register units were guaranteed to only have 1 bit in any lane
+        // mask, the code below would not be necessary. This is because LA
+        // and LB would have at most 1 bit set each, and that bit would be
+        // guaranteed to correspond to the given register unit.
+        uint32_t SubA = TRI.getSubRegIndex(RA.Reg, Root);
+        uint32_t SubB = TRI.getSubRegIndex(RB.Reg, Root);
+        const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(Root);
+        LaneBitmask MaskA = TRI.reverseComposeSubRegIndexLaneMask(SubA, LA);
+        LaneBitmask MaskB = TRI.reverseComposeSubRegIndexLaneMask(SubB, LB);
+        if (MaskA & MaskB & RC.LaneMask)
+          return true;
+      }
+
       ++UMA;
       ++UMB;
       continue;
diff --git a/lib/Target/Lanai/InstPrinter/LLVMBuild.txt b/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
index 6366d7e..eed9a58 100644
--- a/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
+++ b/lib/Target/Lanai/InstPrinter/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = LanaiInstPrinter
 parent = Lanai
-required_libraries = LanaiInfo MC Support
+required_libraries = MC Support
 add_to_library_groups = Lanai
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index b0c1bcc..e5d375a 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -844,19 +844,9 @@
   def XXPERMDI : XX3Form_2<60, 10,
                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
                        "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
-  let isCodeGenOnly = 1 in {
-  def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vfrc:$XA, u2imm:$DM),
+  let isCodeGenOnly = 1 in
+  def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
                              "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
-  let D = 0 in
-  def XXSPLTD0s : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vfrc:$XA),
-                             "xxspltd $XT, $XA, 0", IIC_VecPerm, []>;
-  let D = 1 in
-  def XXSPLTD1s : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vfrc:$XA),
-                             "xxspltd $XT, $XA, 1", IIC_VecPerm, []>;
-  let D = 2 in
-  def XXSWAPDs  : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vfrc:$XA),
-                             "xxswapd $XT, $XA", IIC_VecPerm, []>;
-  }
   def XXSEL : XX4Form<60, 3,
                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC),
                       "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>;
@@ -928,6 +918,12 @@
                 (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
 def : InstAlias<"xxswapd $XT, $XB",
                 (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
+def : InstAlias<"xxspltd $XT, $XB, 0",
+                (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+                (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+                (XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>;
 
 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
 
@@ -2510,11 +2506,11 @@
   def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi8)),
            (v4i32 (XXSPLTWs (LXSIBZX xoaddr:$src), 1))>;
   def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi8i64)),
-            (v2i64 (XXSPLTD0s (LXSIBZX xoaddr:$src)))>;
+            (v2i64 (XXPERMDIs (LXSIBZX xoaddr:$src), 0))>;
   def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi8)),
             (v4i32 (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1))>;
   def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi8i64)),
-            (v2i64 (XXSPLTD0s (VEXTSB2Ds (LXSIBZX xoaddr:$src))))>;
+            (v2i64 (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0))>;
 
   // Build vectors from i16 loads
   def : Pat<(v8i16 (scalar_to_vector ScalarLoads.Li16)),
@@ -2522,11 +2518,11 @@
   def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi16)),
             (v4i32 (XXSPLTWs (LXSIHZX xoaddr:$src), 1))>;
   def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi16i64)),
-           (v2i64 (XXSPLTD0s (LXSIHZX xoaddr:$src)))>;
+           (v2i64 (XXPERMDIs (LXSIHZX xoaddr:$src), 0))>;
   def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi16)),
             (v4i32 (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1))>;
   def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)),
-            (v2i64 (XXSPLTD0s (VEXTSH2Ds (LXSIHZX xoaddr:$src))))>;
+            (v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>;
 
   let Predicates = [IsBigEndian, HasP9Vector] in {
   // Scalar stores of i8
@@ -2760,9 +2756,11 @@
 
     // Build vectors of floating point converted to i64.
     def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
-              (v2i64 (XXPERMDIs (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>;
+              (v2i64 (XXPERMDIs
+                       (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>;
     def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)),
-              (v2i64 (XXPERMDIs (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
+              (v2i64 (XXPERMDIs
+                       (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
     def : Pat<(v2i64 (scalar_to_vector DblToLongLoad.A)),
               (v2i64 (XVCVDPSXDS (LXVDSX xoaddr:$A)))>;
     def : Pat<(v2i64 (scalar_to_vector DblToULongLoad.A)),
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index ee62bb3..2413af3 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -124,10 +124,40 @@
           if (TrueReg1 == TrueReg2
               && TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
             MachineInstr *DefMI = MRI->getVRegDef(TrueReg1);
+            unsigned DefOpc = DefMI ? DefMI->getOpcode() : 0;
+
+            // If this is a splat fed by a splatting load, the splat is
+            // redundant. Replace with a copy. This doesn't happen directly due
+            // to code in PPCDAGToDAGISel.cpp, but it can happen when converting
+            // a load of a double to a vector of 64-bit integers.
+            auto isConversionOfLoadAndSplat = [=]() -> bool {
+              if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS)
+                return false;
+              unsigned DefReg = lookThruCopyLike(DefMI->getOperand(1).getReg());
+              if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
+                MachineInstr *LoadMI = MRI->getVRegDef(DefReg);
+                if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX)
+                  return true;
+              }
+              return false;
+            };
+            if (DefMI && (Immed == 0 || Immed == 3)) {
+              if (DefOpc == PPC::LXVDSX || isConversionOfLoadAndSplat()) {
+                DEBUG(dbgs()
+                      << "Optimizing load-and-splat/splat "
+                      "to load-and-splat/copy: ");
+                DEBUG(MI.dump());
+                BuildMI(MBB, &MI, MI.getDebugLoc(),
+                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                  .addOperand(MI.getOperand(1));
+                ToErase = &MI;
+                Simplified = true;
+              }
+            }
 
             // If this is a splat or a swap fed by another splat, we
             // can replace it with a copy.
-            if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
+            if (DefOpc == PPC::XXPERMDI) {
               unsigned FeedImmed = DefMI->getOperand(3).getImm();
               unsigned FeedReg1
                 = lookThruCopyLike(DefMI->getOperand(1).getReg());
@@ -170,8 +200,9 @@
                 ToErase = &MI;
                 Simplified = true;
               }
-            } else if ((Immed == 0 || Immed == 3) &&
-                       DefMI && DefMI->getOpcode() == PPC::XXPERMDIs) {
+            } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs &&
+                       (DefMI->getOperand(2).getImm() == 0 ||
+                        DefMI->getOperand(2).getImm() == 3)) {
               // Splat fed by another splat - switch the output of the first
               // and remove the second.
               DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
@@ -190,17 +221,32 @@
         unsigned MyOpcode = MI.getOpcode();
         unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
         unsigned TrueReg = lookThruCopyLike(MI.getOperand(OpNo).getReg());
+        if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+          break;
         MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
         if (!DefMI)
           break;
         unsigned DefOpcode = DefMI->getOpcode();
-        bool SameOpcode = (MyOpcode == DefOpcode) ||
+        auto isConvertOfSplat = [=]() -> bool {
+          if (DefOpcode != PPC::XVCVSPSXWS && DefOpcode != PPC::XVCVSPUXWS)
+            return false;
+          unsigned ConvReg = DefMI->getOperand(1).getReg();
+          if (!TargetRegisterInfo::isVirtualRegister(ConvReg))
+            return false;
+          MachineInstr *Splt = MRI->getVRegDef(ConvReg);
+          return Splt && (Splt->getOpcode() == PPC::LXVWSX ||
+            Splt->getOpcode() == PPC::XXSPLTW);
+        };
+        bool AlreadySplat = (MyOpcode == DefOpcode) ||
           (MyOpcode == PPC::VSPLTB && DefOpcode == PPC::VSPLTBs) ||
           (MyOpcode == PPC::VSPLTH && DefOpcode == PPC::VSPLTHs) ||
-          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::XXSPLTWs);
-        // Splat fed by another splat - switch the output of the first
-        // and remove the second.
-        if (SameOpcode) {
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::XXSPLTWs) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::LXVWSX) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::MTVSRWS)||
+          (MyOpcode == PPC::XXSPLTW && isConvertOfSplat());
+        // If the instruction[s] that feed this splat have already splatted
+        // the value, this splat is redundant.
+        if (AlreadySplat) {
           DEBUG(dbgs() << "Changing redundant splat to a copy: ");
           DEBUG(MI.dump());
           BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
@@ -234,9 +280,64 @@
         }
         break;
       }
+      case PPC::XVCVDPSP: {
+        // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
+        unsigned TrueReg = lookThruCopyLike(MI.getOperand(1).getReg());
+        if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+          break;
+        MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
+
+        // This can occur when building a vector of single precision or integer
+        // values.
+        if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
+          unsigned DefsReg1 = lookThruCopyLike(DefMI->getOperand(1).getReg());
+          unsigned DefsReg2 = lookThruCopyLike(DefMI->getOperand(2).getReg());
+          if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) ||
+              !TargetRegisterInfo::isVirtualRegister(DefsReg2))
+            break;
+          MachineInstr *P1 = MRI->getVRegDef(DefsReg1);
+          MachineInstr *P2 = MRI->getVRegDef(DefsReg2);
+
+          if (!P1 || !P2)
+            break;
+
+          // Remove the passed FRSP instruction if it only feeds this MI and
+          // set any uses of that FRSP (in this MI) to the source of the FRSP.
+          auto removeFRSPIfPossible = [&](MachineInstr *RoundInstr) {
+            if (RoundInstr->getOpcode() == PPC::FRSP &&
+                MRI->hasOneNonDBGUse(RoundInstr->getOperand(0).getReg())) {
+              Simplified = true;
+              unsigned ConvReg1 = RoundInstr->getOperand(1).getReg();
+              unsigned FRSPDefines = RoundInstr->getOperand(0).getReg();
+              MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines));
+              for (int i = 0, e = Use.getNumOperands(); i < e; ++i)
+                if (Use.getOperand(i).isReg() &&
+                    Use.getOperand(i).getReg() == FRSPDefines)
+                  Use.getOperand(i).setReg(ConvReg1);
+              DEBUG(dbgs() << "Removing redundant FRSP:\n");
+              DEBUG(RoundInstr->dump());
+              DEBUG(dbgs() << "As it feeds instruction:\n");
+              DEBUG(MI.dump());
+              DEBUG(dbgs() << "Through instruction:\n");
+              DEBUG(DefMI->dump());
+              RoundInstr->eraseFromParent();
+            }
+          };
+
+          // If the input to XVCVDPSP is a vector that was built (even
+          // partially) out of FRSP's, the FRSP(s) can safely be removed
+          // since this instruction performs the same operation.
+          if (P1 != P2) {
+            removeFRSPIfPossible(P1);
+            removeFRSPIfPossible(P2);
+            break;
+          }
+          removeFRSPIfPossible(P1);
+        }
+        break;
+      }
       }
     }
-
     // If the last instruction was marked for elimination,
     // remove it now.
     if (ToErase) {
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index 43f01b0..83882fc 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -275,7 +275,7 @@
   const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>();
   TII = ST.getInstrInfo();
   TRI = ST.getRegisterInfo();
-  LiveRegs.init(TRI);
+  LiveRegs.init(*TRI);
 
   bool Changed = false;
   for (auto &MBB : F)
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 3c04bf4..1be5aec 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -548,10 +548,11 @@
   unsigned VecSize = VT.getSizeInBits();
   unsigned EltSize = VT.getScalarSizeInBits();
   unsigned NumLanes = VecSize / 128;
-  unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
-  assert((VecSize == 128 || VecSize == 256) &&
-         "Unexpected vector size");
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumEltsPerLane = NumElts / NumLanes;
+  assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size");
   assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+  assert((NumElts == RawMask.size()) && "Unexpected mask size");
 
   for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
     // VPERMIL2 Operation.
@@ -572,14 +573,15 @@
       continue;
     }
 
-    unsigned Index = i & ~(NumEltsPerLane - 1);
+    int Index = i & ~(NumEltsPerLane - 1);
     if (EltSize == 64)
       Index += (Selector >> 1) & 0x1;
     else
       Index += Selector & 0x3;
 
-    unsigned SrcOffset = (Selector >> 2) & 1;
-    ShuffleMask.push_back((int)(SrcOffset + Index));
+    int Src = (Selector >> 2) & 0x1;
+    Index += Src * NumElts;
+    ShuffleMask.push_back(Index);
   }
 }
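
For readers following the VPERMIL2 change: each selector chooses an element within the 128-bit lane and a source vector, and the patch now folds the source choice into the mask index by adding NumElts. The following is a small sketch of that index arithmetic under the assumption of 32-bit elements; the function name is made up for illustration and is not the decoder itself.

// Illustrative sketch of the VPERMIL2 index computation above (32-bit elems).
#include <cassert>

static int decodeVPermil2Index32(unsigned i, unsigned Selector,
                                 unsigned NumEltsPerLane, unsigned NumElts) {
  int Index = i & ~(NumEltsPerLane - 1); // start of this element's 128-bit lane
  Index += Selector & 0x3;               // element within the lane
  int Src = (Selector >> 2) & 0x1;       // which source vector the lane reads
  return Index + Src * NumElts;          // second source indexes past the first
}

int main() {
  // v8f32 (256-bit): 8 elements, 4 per lane.
  // Element 5 (lane 1), selector 0b110 -> element 2 of lane 1, source 1.
  assert(decodeVPermil2Index32(5, 0b110, 4, 8) == 4 + 2 + 8);
  // Element 1 (lane 0), selector 0b001 -> element 1 of lane 0, source 0.
  assert(decodeVPermil2Index32(1, 0b001, 4, 8) == 1);
  return 0;
}
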
 
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index e8fbe82..dc18a59 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -99,6 +99,8 @@
                                        "Bit testing of memory is slow">;
 def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                        "SHLD instruction is slow">;
+def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
+                                        "PMULLD instruction is slow">;
 // FIXME: This should not apply to CPUs that do not have SSE.
 def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
                                 "IsUAMem16Slow", "true",
@@ -403,6 +405,7 @@
   FeatureSlowLEA,
   FeatureSlowIncDec,
   FeatureSlowBTMem,
+  FeatureSlowPMULLD,
   FeatureLAHFSAHF
 ]>;
 def : SilvermontProc<"silvermont">;
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 25d1c51..78bd2ad 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -340,10 +340,10 @@
     return;
   }
 
-  // For globals in PIC mode, we can have some LEAs here.
-  // Ignore them, they don't bother us.
+  // Skip over DEBUG_VALUE.
+  // For globals in PIC mode, we can have some LEAs here. Skip them as well.
   // TODO: Extend this to something that covers more cases.
-  while (I->getOpcode() == X86::LEA32r)
+  while (I->getOpcode() == X86::LEA32r || I->isDebugValue())
     ++I;
 
   unsigned StackPtr = RegInfo.getStackRegister();
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 2c6e5ec..c890fdd 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -2155,8 +2155,8 @@
 
   // Choose the SSE instruction sequence based on data type (float or double).
   static const uint16_t OpcTable[2][4] = {
-    { X86::CMPSSrr,  X86::FsANDPSrr,  X86::FsANDNPSrr,  X86::FsORPSrr  },
-    { X86::CMPSDrr,  X86::FsANDPDrr,  X86::FsANDNPDrr,  X86::FsORPDrr  }
+    { X86::CMPSSrr,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
+    { X86::CMPSDrr,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
   };
 
   const uint16_t *Opc = nullptr;
@@ -2236,14 +2236,18 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
   } else {
+    const TargetRegisterClass *VR128 = &X86::VR128RegClass;
     unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
                                        CmpRHSReg, CmpRHSIsKill, CC);
-    unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+    unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
                                       LHSReg, LHSIsKill);
-    unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+    unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
                                        RHSReg, RHSIsKill);
-    ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
-                                         AndReg, /*IsKill=*/true);
+    unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
+                                     AndReg, /*IsKill=*/true);
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
   }
   updateValueMap(I, ResultReg);
   return true;
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index 345181c..8bde4bf 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -154,7 +154,7 @@
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   OptForSize = MF.getFunction()->optForSize();
   MLI = &getAnalysis<MachineLoopInfo>();
-  LiveRegs.init(&TII->getRegisterInfo());
+  LiveRegs.init(TII->getRegisterInfo());
 
   DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
 
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 566d459..1deefe1 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1550,19 +1550,22 @@
   }
   uint64_t SEHStackAllocAmt = NumBytes;
 
+  MachineBasicBlock::iterator FirstCSPop = MBBI;
   // Skip the callee-saved pop instructions.
   while (MBBI != MBB.begin()) {
     MachineBasicBlock::iterator PI = std::prev(MBBI);
     unsigned Opc = PI->getOpcode();
 
-    if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-        (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-        Opc != X86::DBG_VALUE && !PI->isTerminator())
-      break;
+    if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
+      if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+          (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)))
+        break;
+      FirstCSPop = PI;
+    }
 
     --MBBI;
   }
-  MachineBasicBlock::iterator FirstCSPop = MBBI;
+  MBBI = FirstCSPop;
 
   if (TargetMBB) {
     // Fill EAX/RAX with the address of the target block.
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 19d1dd6..14c95f9 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
@@ -1571,7 +1572,15 @@
     return false;
 
   Imm = N;
-  return TM.getCodeModel() == CodeModel::Small;
+  if (N->getOpcode() != ISD::TargetGlobalAddress)
+    return TM.getCodeModel() == CodeModel::Small;
+
+  Optional<ConstantRange> CR =
+      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
+  if (!CR)
+    return TM.getCodeModel() == CodeModel::Small;
+
+  return CR->getUnsignedMax().ult(1ull << 32);
 }
 
 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
@@ -1710,10 +1719,39 @@
     return true;
   }
 
+  // Keep track of the original value type and whether this value was
+  // truncated. If we see a truncation from pointer type to VT that truncates
+  // bits that are known to be zero, we can use a narrow reference.
+  EVT VT = N.getValueType();
+  bool WasTruncated = false;
+  if (N.getOpcode() == ISD::TRUNCATE) {
+    WasTruncated = true;
+    N = N.getOperand(0);
+  }
+
   if (N.getOpcode() != X86ISD::Wrapper)
     return false;
 
-  Op = N.getOperand(0);
+  // We can only use non-GlobalValues as immediates if they were not truncated,
+  // as we do not have any range information. If we have a GlobalValue and the
+  // address was not truncated, we can select it as an operand directly.
+  unsigned Opc = N.getOperand(0)->getOpcode();
+  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
+    Op = N.getOperand(0);
+    // We can only select the operand directly if we didn't have to look past a
+    // truncate.
+    return !WasTruncated;
+  }
+
+  // Check that the global's range fits into VT.
+  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
+  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
+    return false;
+
+  // Okay, we can use a narrow reference.
+  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
+                                      GA->getOffset(), GA->getTargetFlags());
   return true;
 }
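
Both hunks above reduce to the same question: does the absolute symbol's value range fit in the narrower width being selected? Here is a hedged sketch of that range test, assuming only the unsigned-maximum comparison shown in the patch; the helper name is illustrative, not an LLVM API.

// Illustrative sketch of the width check applied to absolute-symbol ranges.
#include <cassert>
#include <cstdint>

// True if every value up to MaxValue is representable in Bits unsigned bits,
// i.e. the same test as CR->getUnsignedMax().ult(1ull << Bits) above.
static bool rangeFitsInBits(uint64_t MaxValue, unsigned Bits) {
  assert(Bits < 64 && "a 64-bit width always fits; shifting would overflow");
  return MaxValue < (uint64_t(1) << Bits);
}

int main() {
  assert(rangeFitsInBits(0xFFFFFFFFull, 32));   // fits a 32-bit immediate
  assert(!rangeFitsInBits(0x100000000ull, 32)); // needs the full 64 bits
  assert(!rangeFitsInBits(0x100, 8));           // too wide for an 8-bit use
  return 0;
}
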
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6b23b61..8db8ed8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6316,8 +6316,47 @@
   return SDValue();
 }
 
-/// Attempt to use the vbroadcast instruction to generate a splat value for a
-/// splat BUILD_VECTOR which uses a single scalar load, or a constant.
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+                                   unsigned SplatBitSize, LLVMContext &C) {
+  unsigned ScalarSize = VT.getScalarSizeInBits();
+  unsigned NumElm = SplatBitSize / ScalarSize;
+
+  SmallVector<Constant *, 32> ConstantVec;
+  for (unsigned i = 0; i < NumElm; i++) {
+    APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+    Constant *Const;
+    if (VT.isFloatingPoint()) {
+      assert((ScalarSize == 32 || ScalarSize == 64) &&
+             "Unsupported floating point scalar size");
+      if (ScalarSize == 32)
+        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
+      else
+        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+    } else
+      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+    ConstantVec.push_back(Const);
+  }
+  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
+
+static bool isUseOfShuffle(SDNode *N) {
+  for (auto *U : N->uses()) {
+    if (isTargetShuffle(U->getOpcode()))
+      return true;
+    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
+      return isUseOfShuffle(U);
+  }
+  return false;
+}
+
+/// Attempt to use the vbroadcast instruction to generate a splat value for the
+/// following cases:
+/// 1. A splat BUILD_VECTOR which uses:
+///    a. A single scalar load, or a constant.
+///    b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+///
 /// The VBROADCAST node is returned when a pattern is found,
 /// or SDValue() otherwise.
 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
@@ -6339,8 +6378,82 @@
 
   // We need a splat of a single value to use broadcast, and it doesn't
   // make any sense if the value is only in one element of the vector.
-  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+    APInt SplatValue, Undef;
+    unsigned SplatBitSize;
+    bool HasUndef;
+    // Check if this is a repeated constant pattern suitable for broadcasting.
+    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+        SplatBitSize > VT.getScalarSizeInBits() &&
+        SplatBitSize < VT.getSizeInBits()) {
+      // Avoid replacing with broadcast when it's a use of a shuffle
+      // instruction to preserve the present custom lowering of shuffles.
+      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+        return SDValue();
+      // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      LLVMContext *Ctx = DAG.getContext();
+      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+      if (Subtarget.hasAVX()) {
+        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
+            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+          // The splatted value fits in a single INTEGER constant.
+          // Load the constant from the constant pool and broadcast it.
+          MVT CVT = MVT::getIntegerVT(SplatBitSize);
+          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
+          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+          SDValue CP = DAG.getConstantPool(C, PVT);
+          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+          Ld = DAG.getLoad(
+              CVT, dl, DAG.getEntryNode(), CP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+                                       MVT::getVectorVT(CVT, Repeat), Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
+          // The splatted value fits in a single FLOAT constant.
+          // Load the constant from the constant pool and broadcast it.
+          // AVX has support for 32- and 64-bit broadcasts for floats only;
+          // there is no 64-bit integer broadcast on a 32-bit subtarget.
+          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
+          Constant *C = SplatBitSize == 32
+                            ? ConstantFP::get(Type::getFloatTy(*Ctx),
+                                              SplatValue.bitsToFloat())
+                            : ConstantFP::get(Type::getDoubleTy(*Ctx),
+                                              SplatValue.bitsToDouble());
+          SDValue CP = DAG.getConstantPool(C, PVT);
+          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+          Ld = DAG.getLoad(
+              CVT, dl, DAG.getEntryNode(), CP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+                                       MVT::getVectorVT(CVT, Repeat), Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        } else if (SplatBitSize > 64) {
+          // Load the vector of constants and broadcast it.
+          MVT CVT = VT.getScalarType();
+          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+                                             *Ctx);
+          SDValue VCP = DAG.getConstantPool(VecC, PVT);
+          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+          Ld = DAG.getLoad(
+              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        }
+      }
+    }
     return SDValue();
+  }
 
   bool ConstSplatVal =
       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
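The block above picks one of three broadcast strategies depending on how wide the repeated pattern is. A minimal sketch of that decision, assuming AVX is available (standalone C++, names invented; not the actual helper):

    enum class BroadcastKind { Int, Float, SubVector, None };

    // SplatBits is the width of the repeated constant pattern.
    BroadcastKind chooseBroadcast(unsigned SplatBits, bool HasAVX2, bool Is32Bit) {
      if (SplatBits <= 64 && HasAVX2 && !(SplatBits == 64 && Is32Bit))
        return BroadcastKind::Int;       // integer constant-pool load + VBROADCAST
      if (SplatBits == 32 || SplatBits == 64)
        return BroadcastKind::Float;     // float/double load + VBROADCAST
      if (SplatBits > 64)
        return BroadcastKind::SubVector; // small vector load + SUBV_BROADCAST
      return BroadcastKind::None;
    }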
@@ -13563,7 +13676,11 @@
 }
 
 // Returns the appropriate wrapper opcode for a global reference.
-unsigned X86TargetLowering::getGlobalWrapperKind() const {
+unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
+  // References to absolute symbols are never PC-relative.
+  if (GV && GV->isAbsoluteSymbolRef())
+    return X86ISD::Wrapper;
+
   CodeModel::Model M = getTargetMachine().getCodeModel();
   if (Subtarget.isPICStyleRIPRel() &&
       (M == CodeModel::Small || M == CodeModel::Kernel))
@@ -13692,7 +13809,7 @@
     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
   }
 
-  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
+  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
 
   // With PIC, the address is actually $g + Offset.
   if (isGlobalRelativeToPICBase(OpFlags)) {
@@ -23318,6 +23435,14 @@
   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
+  case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
+  case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
+  case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
+  case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
+  case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
+  case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
+  case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
+  case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
@@ -29189,10 +29314,17 @@
 /// generate pmullw+pmulhuw for it (MULU16 mode).
 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
-  // pmulld is supported since SSE41. It is better to use pmulld
-  // instead of pmullw+pmulhw.
+  // Check for legality
   // pmullw/pmulhw are not supported by SSE.
-  if (Subtarget.hasSSE41() || !Subtarget.hasSSE2())
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Check for profitability
+  // pmulld is supported since SSE41. It is better to use pmulld
+  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
+  // the expansion.
+  bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
     return SDValue();
 
   ShrinkMode Mode;
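The early exits above now separate "is the expansion legal at all" from "is it profitable". A rough standalone restatement of that predicate (plain C++, flag names invented for the sketch):

    // True when the pmullw+pmulhw expansion should be attempted: SSE2 is
    // required for pmullw/pmulhw, and the expansion is skipped whenever
    // pmulld is available and either fast or we are optimizing for size.
    bool shouldReduceVMULWidth(bool HasSSE2, bool HasSSE41, bool PMULLDSlow,
                               bool OptForMinSize) {
      if (!HasSSE2)
        return false;                                  // not legal
      if (HasSSE41 && (OptForMinSize || !PMULLDSlow))
        return false;                                  // pmulld is preferable
      return true;
    }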
@@ -29773,95 +29905,6 @@
   }
 }
 
-static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 const X86Subtarget &Subtarget) {
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
-
-  // A vector zext_in_reg may be represented as a shuffle,
-  // feeding into a bitcast (this represents anyext) feeding into
-  // an and with a mask.
-  // We'd like to try to combine that into a shuffle with zero
-  // plus a bitcast, removing the and.
-  if (N0.getOpcode() != ISD::BITCAST ||
-      N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
-    return SDValue();
-
-  // The other side of the AND should be a splat of 2^C, where C
-  // is the number of bits in the source type.
-  N1 = peekThroughBitcasts(N1);
-  if (N1.getOpcode() != ISD::BUILD_VECTOR)
-    return SDValue();
-  BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
-
-  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
-  EVT SrcType = Shuffle->getValueType(0);
-
-  // We expect a single-source shuffle
-  if (!Shuffle->getOperand(1)->isUndef())
-    return SDValue();
-
-  unsigned SrcSize = SrcType.getScalarSizeInBits();
-  unsigned NumElems = SrcType.getVectorNumElements();
-
-  APInt SplatValue, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!Vector->isConstantSplat(SplatValue, SplatUndef,
-                                SplatBitSize, HasAnyUndefs))
-    return SDValue();
-
-  unsigned ResSize = N1.getScalarValueSizeInBits();
-  // Make sure the splat matches the mask we expect
-  if (SplatBitSize > ResSize ||
-      (SplatValue + 1).exactLogBase2() != (int)SrcSize)
-    return SDValue();
-
-  // Make sure the input and output size make sense
-  if (SrcSize >= ResSize || ResSize % SrcSize)
-    return SDValue();
-
-  // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
-  // The number of u's between each two values depends on the ratio between
-  // the source and dest type.
-  unsigned ZextRatio = ResSize / SrcSize;
-  bool IsZext = true;
-  for (unsigned i = 0; i != NumElems; ++i) {
-    if (i % ZextRatio) {
-      if (Shuffle->getMaskElt(i) > 0) {
-        // Expected undef
-        IsZext = false;
-        break;
-      }
-    } else {
-      if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
-        // Expected element number
-        IsZext = false;
-        break;
-      }
-    }
-  }
-
-  if (!IsZext)
-    return SDValue();
-
-  // Ok, perform the transformation - replace the shuffle with
-  // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
-  // (instead of undef) where the k elements come from the zero vector.
-  SmallVector<int, 8> Mask;
-  for (unsigned i = 0; i != NumElems; ++i)
-    if (i % ZextRatio)
-      Mask.push_back(NumElems);
-    else
-      Mask.push_back(i / ZextRatio);
-
-  SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
-    Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
-  return DAG.getBitcast(N0.getValueType(), NewShuffle);
-}
-
 /// If both input operands of a logic op are being cast from floating point
 /// types, try to convert this into a floating point logic node to avoid
 /// unnecessary moves from SSE to integer registers.
@@ -29939,9 +29982,6 @@
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
-    return Zext;
-
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
@@ -31677,14 +31717,17 @@
   unsigned NewOpcode = 0;
   if (Arg.hasOneUse()) {
     switch (Arg.getOpcode()) {
-    case X86ISD::FMADD:      NewOpcode = X86ISD::FNMSUB;     break;
-    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
-    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
-    case X86ISD::FNMSUB:     NewOpcode = X86ISD::FMADD;      break;
-    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
-    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
-    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
-    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
+    case X86ISD::FMADD:        NewOpcode = X86ISD::FNMSUB;       break;
+    case X86ISD::FMSUB:        NewOpcode = X86ISD::FNMADD;       break;
+    case X86ISD::FNMADD:       NewOpcode = X86ISD::FMSUB;        break;
+    case X86ISD::FNMSUB:       NewOpcode = X86ISD::FMADD;        break;
+    case X86ISD::FMADD_RND:    NewOpcode = X86ISD::FNMSUB_RND;   break;
+    case X86ISD::FMSUB_RND:    NewOpcode = X86ISD::FNMADD_RND;   break;
+    case X86ISD::FNMADD_RND:   NewOpcode = X86ISD::FMSUB_RND;    break;
+    case X86ISD::FNMSUB_RND:   NewOpcode = X86ISD::FMADD_RND;    break;
+    // We can't handle the scalar intrinsic nodes here because negating them
+    // would only invert one element and not the whole vector. But we could
+    // try to handle a negation of the lower element only.
     }
   }
   if (NewOpcode)
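The mapping relies on the usual sign-propagation identities for fused multiply-add: negating the whole FMA flips the sign of both the product and the addend, so -(a*b + c) is exactly FNMSUB(a, b, c). A tiny self-contained check of that identity (plain C++ on scalar doubles, purely illustrative):

    #include <cassert>

    int main() {
      double a = 2.0, b = 3.0, c = 5.0;
      // fneg(fmadd(a, b, c)) == fnmsub(a, b, c)
      assert(-(a * b + c) == -(a * b) - c);   // both are -11.0
      return 0;
    }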
@@ -31764,6 +31807,34 @@
   return V;
 }
 
+static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
+  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+        (VT == MVT::f64 && Subtarget.hasSSE2())))
+    return SDValue();
+
+  auto isAllOnesConstantFP = [](SDValue V) {
+    auto *C = dyn_cast<ConstantFPSDNode>(V);
+    return C && C->getConstantFPValue()->isAllOnesValue();
+  };
+
+  // fand (fxor X, -1), Y --> fandn X, Y
+  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
+    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
+
+  // fand X, (fxor Y, -1) --> fandn Y, X
+  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
+    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
+
+  return SDValue();
+}
+
 /// Do target-specific dag combines on X86ISD::FAND nodes.
 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
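The new combine uses the bitwise identity (x ^ ~0) & y == ~x & y, which is exactly what ANDN computes; the "-1" in the comments is the all-ones bit pattern of the FP constant, not the value -1.0. A minimal standalone check of the identity on raw 32-bit patterns (plain C++, arbitrary inputs):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0x3f800000, y = 0xbf000000;  // arbitrary float bit patterns
      uint32_t all_ones = 0xffffffff;
      // fand(fxor(X, -1), Y)  ==  fandn(X, Y)
      assert(((x ^ all_ones) & y) == (~x & y));
      return 0;
    }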
@@ -31775,6 +31846,9 @@
   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
     return V;
 
+  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
+    return V;
+
   return lowerX86FPLogicOp(N, DAG, Subtarget);
 }
 
@@ -32187,15 +32261,6 @@
   SDValue B = N->getOperand(1);
   SDValue C = N->getOperand(2);
 
-  auto isScalarMaskedNode = [&](SDValue &V) {
-    if (V.hasOneUse())
-      return false;
-    for (auto User : V.getNode()->uses())
-      if (User->getOpcode() == X86ISD::SELECTS && N->isOperandOf(User))
-        return true;
-    return false;
-  };
-
   auto invertIfNegative = [](SDValue &V) {
     if (SDValue NegVal = isFNEG(V.getNode())) {
       V = NegVal;
@@ -32204,10 +32269,11 @@
     return false;
   };
 
-  // Do not convert scalar masked operations.
-  bool NegA = !isScalarMaskedNode(A) && invertIfNegative(A);
-  bool NegB = !isScalarMaskedNode(B) && invertIfNegative(B);
-  bool NegC = !isScalarMaskedNode(C) && invertIfNegative(C);
+  // Do not convert the passthru input of scalar intrinsics.
+  // FIXME: We could allow negations of the lower element only.
+  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+  bool NegB = invertIfNegative(B);
+  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
 
   // Negative multiplication when NegA xor NegB
   bool NegMul = (NegA != NegB);
@@ -32218,16 +32284,35 @@
   else
     NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
 
+
   if (N->getOpcode() == X86ISD::FMADD_RND) {
     switch (NewOpcode) {
-      case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
-      case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
-      case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
-      case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
     }
-    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
+    }
+  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
+    }
+  } else {
+    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
+           "Unexpected opcode!");
+    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
   }
-  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+
+  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
 }
 
 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
@@ -32994,6 +33079,8 @@
   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
   case X86ISD::FMADD:
   case X86ISD::FMADD_RND:
+  case X86ISD::FMADDS1_RND:
+  case X86ISD::FMADDS3_RND:
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 4aeaf1b..74bbe51 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -488,6 +488,13 @@
       FMADDSUB_RND,
       FMSUBADD_RND,
 
+      // Scalar intrinsic FMA with rounding mode.
+      // Two versions, passthru bits on op1 or op3.
+      FMADDS1_RND, FMADDS3_RND,
+      FNMADDS1_RND, FNMADDS3_RND,
+      FMSUBS1_RND, FMSUBS3_RND,
+      FNMSUBS1_RND, FNMSUBS3_RND,
+
       // Compress and expand.
       COMPRESS,
       EXPAND,
@@ -1115,7 +1122,7 @@
     SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
 
-    unsigned getGlobalWrapperKind() const;
+    unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 92bb27f..fa6eba1 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -3127,12 +3127,13 @@
      (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
 }
 
-
-// Move Int Doubleword to Packed Double Int
-//
-def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(set VR128X:$dst,
+
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
                         (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                         EVEX;
 def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
@@ -3162,43 +3163,47 @@
 def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
                          "vmovq\t{$src, $dst|$dst, $src}",
                          [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
-                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
-                         EVEX_CD8<64, CD8VT1>;
-}
-
-// Move Int Doubleword to Single Scalar
-//
-let isCodeGenOnly = 1 in {
-def VMOVDI2SSZrr  : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(set FR32X:$dst, (bitconvert GR32:$src))],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+                         EVEX_CD8<64, CD8VT1>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr  : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert GR32:$src))],
                       IIC_SSE_MOVDQ>, EVEX;
 
 def VMOVDI2SSZrm  : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
-                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-}
-
-// Move doubleword from xmm register to r/m32
-//
-def VMOVPDI2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
-                       "vmovd\t{$src, $dst|$dst, $src}",
-                       [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move doubleword from xmm register to r/m32
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+                       "vmovd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
                                         (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                        EVEX;
 def VMOVPDI2DIZmr  : AVX512BI<0x7E, MRMDestMem, (outs),
                        (ins i32mem:$dst, VR128X:$src),
                        "vmovd\t{$src, $dst|$dst, $src}",
-                       [(store (i32 (extractelt (v4i32 VR128X:$src),
-                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
-                       EVEX, EVEX_CD8<32, CD8VT1>;
-
-// Move quadword from xmm1 register to r/m64
-//
-def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
-                      "vmovq\t{$src, $dst|$dst, $src}",
-                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+                       [(store (i32 (extractelt (v4i32 VR128X:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt
+
+// Move quadword from xmm1 register to r/m64
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
                                                    (iPTR 0)))],
                       IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
                       Requires<[HasAVX512, In64BitMode]>;
@@ -3219,36 +3224,39 @@
 
 let hasSideEffects = 0 in
 def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
-                             (ins VR128X:$src),
-                             "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
-                             EVEX, VEX_W;
-
-// Move Scalar Single to Double Int
-//
-let isCodeGenOnly = 1 in {
-def VMOVSS2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
-                      (ins FR32X:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
+                             (ins VR128X:$src),
+                             "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+                             EVEX, VEX_W;
+} // ExeDomain = SSEPackedInt
+
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+                      (ins FR32X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32X:$src))],
                       IIC_SSE_MOVD_ToGP>, EVEX;
 def VMOVSS2DIZmr  : AVX512BI<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, FR32X:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
-                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-}
-
-// Move Quadword Int to Packed Quadword Int
-//
-def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
-                      (ins i64mem:$src),
-                      "vmovq\t{$src, $dst|$dst, $src}",
-                      [(set VR128X:$dst,
-                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
-                      EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
-
-//===----------------------------------------------------------------------===//
-// AVX-512  MOVSS, MOVSD
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move Quadword Int to Packed Quadword Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+                      (ins i64mem:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+                      EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+} // ExeDomain = SSEPackedInt
+
+//===----------------------------------------------------------------------===//
+// AVX-512  MOVSS, MOVSD
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_move_scalar<string asm, SDNode OpNode,
@@ -5572,14 +5580,16 @@
 }// Constraints = "$src1 = $dst"
 
 multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-         string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ ,
-                                                                  string SUFF> {
+                            string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                            SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
 
   defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
-                (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1,
+                // Operands for the intrinsic are in 123 order to preserve
+                // passthru semantics.
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
                          (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
-                (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
                          (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
                          _.FRC:$src3))),
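The "123" ordering matters because the scalar AVX-512 FMA intrinsics compute only element 0 and pass the upper elements through from the first source. A rough scalar model of that passthru rule (plain C++; a std::array stands in for the 128-bit vector, purely illustrative):

    #include <array>

    using V4F = std::array<float, 4>;

    // "s1" form assumed above: element 0 is the FMA result, elements 1..3 are
    // copied unchanged from operand 1.
    V4F scalarFmaddS1(const V4F &a, const V4F &b, const V4F &c) {
      V4F r = a;                  // passthru bits come from operand 1
      r[0] = a[0] * b[0] + c[0];  // only the low element is computed
      return r;
    }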
@@ -5587,11 +5597,11 @@
                          (_.ScalarLdFrag addr:$src3))))>;
 
   defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
-                (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnd _.RC:$src2,
+                (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds3 _.RC:$src2,
                        (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
                               _.RC:$src1, (i32 FROUND_CURRENT))),
-                (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
+                (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
                                   (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
                                           _.FRC:$src1))),
@@ -5599,11 +5609,11 @@
                             (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
 
   defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
-                (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnd _.RC:$src1,
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds1 _.RC:$src1,
                        (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
                               _.RC:$src2, (i32 FROUND_CURRENT))),
-                (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
                          (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
                          _.FRC:$src2))),
@@ -5612,21 +5622,26 @@
 }
 
 multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-                             string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{
+                        string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                        SDNode OpNodeRnds3> {
   let Predicates = [HasAVX512] in {
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                                   OpNodeRnd, f32x_info, "SS">,
-                                   EVEX_CD8<32, CD8VT1>, VEX_LIG;
+                                 OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+                                 EVEX_CD8<32, CD8VT1>, VEX_LIG;
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                                   OpNodeRnd, f64x_info, "SD">,
-                                   EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+                                 OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+                                 EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
   }
 }
 
-defm VFMADD  : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
-defm VFMSUB  : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+defm VFMADD  : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
+                            X86FmaddRnds3>;
+defm VFMSUB  : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
+                            X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
+                            X86FnmaddRnds1, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
+                            X86FnmsubRnds1, X86FnmsubRnds3>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512  Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index abf37d9..ba970bc 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -145,6 +145,11 @@
   return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
 }
 
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) {
+  return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0);
+}
+
 /// addRegOffset - This function is used to add a memory reference of the form
 /// [Reg + Offset], i.e., one with no scale or index, but with a
 /// displacement. An example is: DWORD PTR [EAX + 4].
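The new overload lets callers forward the original displacement operand as-is, so a symbolic displacement (for example a global address) is preserved instead of being squeezed through getImm(). The difference at the call sites, as it appears later in this patch (shown here only for contrast):

    // Before: only immediate displacements could be forwarded.
    //   NewMI = addOffset(MIB, MI.getOperand(2).getImm());
    // After: the MachineOperand is copied verbatim, so immediates, globals
    // and other symbolic operands all survive.
    //   NewMI = addOffset(MIB, MI.getOperand(2));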
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 7127349..1973684 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -466,6 +466,18 @@
 def X86FmaddsubRnd  : SDNode<"X86ISD::FMADDSUB_RND",  SDTFmaRound>;
 def X86FmsubaddRnd  : SDNode<"X86ISD::FMSUBADD_RND",  SDTFmaRound>;
 
+// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86FmaddRnds1   : SDNode<"X86ISD::FMADDS1_RND",     SDTFmaRound>;
+def X86FnmaddRnds1  : SDNode<"X86ISD::FNMADDS1_RND",    SDTFmaRound>;
+def X86FmsubRnds1   : SDNode<"X86ISD::FMSUBS1_RND",     SDTFmaRound>;
+def X86FnmsubRnds1  : SDNode<"X86ISD::FNMSUBS1_RND",    SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 3.
+def X86FmaddRnds3   : SDNode<"X86ISD::FMADDS3_RND",     SDTFmaRound>;
+def X86FnmaddRnds3  : SDNode<"X86ISD::FNMADDS3_RND",    SDTFmaRound>;
+def X86FmsubRnds3   : SDNode<"X86ISD::FMSUBS3_RND",     SDTFmaRound>;
+def X86FnmsubRnds3  : SDNode<"X86ISD::FNMSUBS3_RND",    SDTFmaRound>;
+
 def x86vpmadd52l     : SDNode<"X86ISD::VPMADD52L",     SDTFma>;
 def x86vpmadd52h     : SDNode<"X86ISD::VPMADD52H",     SDTFma>;
 
@@ -624,15 +636,6 @@
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
 def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
-// These are needed to match a scalar load that is used in a vector-only
-// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
-// The memory operand is required to be a 128-bit load, so it must be converted
-// from a vector to a scalar.
-def loadf32_128 : PatFrag<(ops node:$ptr),
-  (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>;
-def loadf64_128 : PatFrag<(ops node:$ptr),
-  (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>;
-
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index d8c1e4d..1679f99 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -592,6 +592,8 @@
     { X86::RCPSSr_Int,      X86::RCPSSm_Int,          TB_NO_REVERSE },
     { X86::ROUNDPDr,        X86::ROUNDPDm,            TB_ALIGN_16 },
     { X86::ROUNDPSr,        X86::ROUNDPSm,            TB_ALIGN_16 },
+    { X86::ROUNDSDr,        X86::ROUNDSDm,            0 },
+    { X86::ROUNDSSr,        X86::ROUNDSSm,            0 },
     { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
     { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
     { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int,        TB_NO_REVERSE },
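For context, these tables pair a register-register opcode with the register-memory opcode it can be folded into, so spills and reloads can be merged directly into the instruction. A minimal sketch of what such a lookup amounts to (plain C++, struct and function names invented; the real tables are sorted and searched differently):

    struct MemoryFoldEntry {
      unsigned RegOpc;   // e.g. X86::ROUNDSDr  (reg, reg form)
      unsigned MemOpc;   // e.g. X86::ROUNDSDm  (reg, mem form)
      unsigned Flags;    // alignment / "no reverse fold" restrictions
    };

    // Return the foldable memory-form opcode for Opc, or 0 if none exists.
    unsigned lookupFold(const MemoryFoldEntry *Table, unsigned N, unsigned Opc) {
      for (unsigned i = 0; i != N; ++i)
        if (Table[i].RegOpc == Opc)
          return Table[i].MemOpc;
      return 0;
    }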
@@ -883,6 +885,8 @@
     { X86::VMOVUPSZrr,       X86::VMOVUPSZrm,         0 },
     { X86::VPABSDZrr,        X86::VPABSDZrm,          0 },
     { X86::VPABSQZrr,        X86::VPABSQZrm,          0 },
+    { X86::VPERMILPDZri,     X86::VPERMILPDZmi,       0 },
+    { X86::VPERMILPSZri,     X86::VPERMILPSZmi,       0 },
     { X86::VPERMPDZri,       X86::VPERMPDZmi,         0 },
     { X86::VPERMQZri,        X86::VPERMQZmi,          0 },
     { X86::VPMOVSXBDZrr,     X86::VPMOVSXBDZrm,       0 },
@@ -916,6 +920,8 @@
     { X86::VMOVDQU64Z256rr,      X86::VMOVDQU64Z256rm,      0 },
     { X86::VMOVUPDZ256rr,        X86::VMOVUPDZ256rm,        0 },
     { X86::VMOVUPSZ256rr,        X86::VMOVUPSZ256rm,        0 },
+    { X86::VPERMILPDZ256ri,      X86::VPERMILPDZ256mi,      0 },
+    { X86::VPERMILPSZ256ri,      X86::VPERMILPSZ256mi,      0 },
     { X86::VPERMPDZ256ri,        X86::VPERMPDZ256mi,        0 },
     { X86::VPERMQZ256ri,         X86::VPERMQZ256mi,         0 },
     { X86::VPMOVSXBDZ256rr,      X86::VPMOVSXBDZ256rm,      TB_NO_REVERSE },
@@ -947,6 +953,8 @@
     { X86::VMOVDQU64Z128rr,      X86::VMOVDQU64Z128rm,      0 },
     { X86::VMOVUPDZ128rr,        X86::VMOVUPDZ128rm,        0 },
     { X86::VMOVUPSZ128rr,        X86::VMOVUPSZ128rm,        0 },
+    { X86::VPERMILPDZ128ri,      X86::VPERMILPDZ128mi,      0 },
+    { X86::VPERMILPSZ128ri,      X86::VPERMILPSZ128mi,      0 },
     { X86::VPMOVSXBDZ128rr,      X86::VPMOVSXBDZ128rm,      TB_NO_REVERSE },
     { X86::VPMOVSXBQZ128rr,      X86::VPMOVSXBQZ128rm,      TB_NO_REVERSE },
     { X86::VPMOVSXBWZ128rr,      X86::VPMOVSXBWZ128rm,      TB_NO_REVERSE },
@@ -1212,8 +1220,6 @@
     { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm,  TB_ALIGN_16 },
     { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm,   TB_ALIGN_16 },
     { X86::PXORrr,          X86::PXORrm,        TB_ALIGN_16 },
-    { X86::ROUNDSDr,        X86::ROUNDSDm,      0 },
-    { X86::ROUNDSSr,        X86::ROUNDSSm,      0 },
     { X86::ROUNDSDr_Int,    X86::ROUNDSDm_Int,  TB_NO_REVERSE },
     { X86::ROUNDSSr_Int,    X86::ROUNDSSm_Int,  TB_NO_REVERSE },
     { X86::SBB32rr,         X86::SBB32rm,       0 },
@@ -1874,6 +1880,8 @@
     { X86::VPCMPWZrri,        X86::VPCMPWZrmi,          0 },
     { X86::VPERMBZrr,         X86::VPERMBZrm,           0 },
     { X86::VPERMDZrr,         X86::VPERMDZrm,           0 },
+    { X86::VPERMILPDZrr,      X86::VPERMILPDZrm,        0 },
+    { X86::VPERMILPSZrr,      X86::VPERMILPSZrm,        0 },
     { X86::VPERMPDZrr,        X86::VPERMPDZrm,          0 },
     { X86::VPERMPSZrr,        X86::VPERMPSZrm,          0 },
     { X86::VPERMQZrr,         X86::VPERMQZrm,           0 },
@@ -2042,6 +2050,10 @@
     { X86::VPERMBZ128rr,      X86::VPERMBZ128rm,        0 },
     { X86::VPERMBZ256rr,      X86::VPERMBZ256rm,        0 },
     { X86::VPERMDZ256rr,      X86::VPERMDZ256rm,        0 },
+    { X86::VPERMILPDZ128rr,   X86::VPERMILPDZ128rm,     0 },
+    { X86::VPERMILPDZ256rr,   X86::VPERMILPDZ256rm,     0 },
+    { X86::VPERMILPSZ128rr,   X86::VPERMILPSZ128rm,     0 },
+    { X86::VPERMILPSZ256rr,   X86::VPERMILPSZ256rm,     0 },
     { X86::VPERMPDZ256rr,     X86::VPERMPDZ256rm,       0 },
     { X86::VPERMPSZ256rr,     X86::VPERMPSZ256rm,       0 },
     { X86::VPERMQZ256rr,      X86::VPERMQZ256rm,        0 },
@@ -2111,6 +2123,8 @@
     { X86::VXORPSZ256rr,      X86::VXORPSZ256rm,        0 },
 
     // AVX-512 masked foldable instructions
+    { X86::VPERMILPDZrikz,    X86::VPERMILPDZmikz,      0 },
+    { X86::VPERMILPSZrikz,    X86::VPERMILPSZmikz,      0 },
     { X86::VPERMPDZrikz,      X86::VPERMPDZmikz,        0 },
     { X86::VPERMQZrikz,       X86::VPERMQZmikz,         0 },
     { X86::VPMOVSXBDZrrkz,    X86::VPMOVSXBDZrmkz,      0 },
@@ -2130,6 +2144,8 @@
     { X86::VPSHUFLWZrikz,     X86::VPSHUFLWZmikz,       0 },
 
     // AVX-512VL 256-bit masked foldable instructions
+    { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz,   0 },
+    { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz,   0 },
     { X86::VPERMPDZ256rikz,   X86::VPERMPDZ256mikz,     0 },
     { X86::VPERMQZ256rikz,    X86::VPERMQZ256mikz,      0 },
     { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz,   TB_NO_REVERSE },
@@ -2149,6 +2165,8 @@
     { X86::VPSHUFLWZ256rikz,  X86::VPSHUFLWZ256mikz,    0 },
 
     // AVX-512VL 128-bit masked foldable instructions
+    { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz,   0 },
+    { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz,   0 },
     { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz,   TB_NO_REVERSE },
     { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz,   TB_NO_REVERSE },
     { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz,   TB_NO_REVERSE },
@@ -2344,6 +2362,8 @@
     { X86::VPANDQZrrkz,           X86::VPANDQZrmkz,           0 },
     { X86::VPERMBZrrkz,           X86::VPERMBZrmkz,           0 },
     { X86::VPERMDZrrkz,           X86::VPERMDZrmkz,           0 },
+    { X86::VPERMILPDZrrkz,        X86::VPERMILPDZrmkz,        0 },
+    { X86::VPERMILPSZrrkz,        X86::VPERMILPSZrmkz,        0 },
     { X86::VPERMPDZrrkz,          X86::VPERMPDZrmkz,          0 },
     { X86::VPERMPSZrrkz,          X86::VPERMPSZrmkz,          0 },
     { X86::VPERMQZrrkz,           X86::VPERMQZrmkz,           0 },
@@ -2419,6 +2439,8 @@
     { X86::VPANDQZ256rrkz,        X86::VPANDQZ256rmkz,        0 },
     { X86::VPERMBZ256rrkz,        X86::VPERMBZ256rmkz,        0 },
     { X86::VPERMDZ256rrkz,        X86::VPERMDZ256rmkz,        0 },
+    { X86::VPERMILPDZ256rrkz,     X86::VPERMILPDZ256rmkz,     0 },
+    { X86::VPERMILPSZ256rrkz,     X86::VPERMILPSZ256rmkz,     0 },
     { X86::VPERMPDZ256rrkz,       X86::VPERMPDZ256rmkz,       0 },
     { X86::VPERMPSZ256rrkz,       X86::VPERMPSZ256rmkz,       0 },
     { X86::VPERMQZ256rrkz,        X86::VPERMQZ256rmkz,        0 },
@@ -2489,6 +2511,8 @@
     { X86::VPANDNQZ128rrkz,       X86::VPANDNQZ128rmkz,       0 },
     { X86::VPANDQZ128rrkz,        X86::VPANDQZ128rmkz,        0 },
     { X86::VPERMBZ128rrkz,        X86::VPERMBZ128rmkz,        0 },
+    { X86::VPERMILPDZ128rrkz,     X86::VPERMILPDZ128rmkz,     0 },
+    { X86::VPERMILPSZ128rrkz,     X86::VPERMILPSZ128rmkz,     0 },
     { X86::VPERMWZ128rrkz,        X86::VPERMWZ128rmkz,        0 },
     { X86::VPMADDUBSWZ128rrkz,    X86::VPMADDUBSWZ128rmkz,    0 },
     { X86::VPMADDWDZ128rrkz,      X86::VPMADDWDZ128rmkz,      0 },
@@ -2523,6 +2547,8 @@
     { X86::VXORPSZ128rrkz,        X86::VXORPSZ128rmkz,        0 },
 
     // AVX-512 masked foldable instructions
+    { X86::VPERMILPDZrik,         X86::VPERMILPDZmik,         0 },
+    { X86::VPERMILPSZrik,         X86::VPERMILPSZmik,         0 },
     { X86::VPERMPDZrik,           X86::VPERMPDZmik,           0 },
     { X86::VPERMQZrik,            X86::VPERMQZmik,            0 },
     { X86::VPMOVSXBDZrrk,         X86::VPMOVSXBDZrmk,         0 },
@@ -2542,6 +2568,8 @@
     { X86::VPSHUFLWZrik,          X86::VPSHUFLWZmik,          0 },
 
     // AVX-512VL 256-bit masked foldable instructions
+    { X86::VPERMILPDZ256rik,      X86::VPERMILPDZ256mik,      0 },
+    { X86::VPERMILPSZ256rik,      X86::VPERMILPSZ256mik,      0 },
     { X86::VPERMPDZ256rik,        X86::VPERMPDZ256mik,        0 },
     { X86::VPERMQZ256rik,         X86::VPERMQZ256mik,         0 },
     { X86::VPMOVSXBDZ256rrk,      X86::VPMOVSXBDZ256rmk,      TB_NO_REVERSE },
@@ -2561,6 +2589,8 @@
     { X86::VPSHUFLWZ256rik,       X86::VPSHUFLWZ256mik,       0 },
 
     // AVX-512VL 128-bit masked foldable instructions
+    { X86::VPERMILPDZ128rik,      X86::VPERMILPDZ128mik,      0 },
+    { X86::VPERMILPSZ128rik,      X86::VPERMILPSZ128mik,      0 },
     { X86::VPMOVSXBDZ128rrk,      X86::VPMOVSXBDZ128rmk,      TB_NO_REVERSE },
     { X86::VPMOVSXBQZ128rrk,      X86::VPMOVSXBQZ128rmk,      TB_NO_REVERSE },
     { X86::VPMOVSXBWZ128rrk,      X86::VPMOVSXBWZ128rmk,      TB_NO_REVERSE },
@@ -2645,9 +2675,23 @@
     { X86::VPANDQZrrk,         X86::VPANDQZrmk,           0 },
     { X86::VPERMBZrrk,         X86::VPERMBZrmk,           0 },
     { X86::VPERMDZrrk,         X86::VPERMDZrmk,           0 },
+    { X86::VPERMI2Brrk,        X86::VPERMI2Brmk,          0 },
+    { X86::VPERMI2Drrk,        X86::VPERMI2Drmk,          0 },
+    { X86::VPERMI2PSrrk,       X86::VPERMI2PSrmk,         0 },
+    { X86::VPERMI2PDrrk,       X86::VPERMI2PDrmk,         0 },
+    { X86::VPERMI2Qrrk,        X86::VPERMI2Qrmk,          0 },
+    { X86::VPERMI2Wrrk,        X86::VPERMI2Wrmk,          0 },
+    { X86::VPERMILPDZrrk,      X86::VPERMILPDZrmk,        0 },
+    { X86::VPERMILPSZrrk,      X86::VPERMILPSZrmk,        0 },
     { X86::VPERMPDZrrk,        X86::VPERMPDZrmk,          0 },
     { X86::VPERMPSZrrk,        X86::VPERMPSZrmk,          0 },
     { X86::VPERMQZrrk,         X86::VPERMQZrmk,           0 },
+    { X86::VPERMT2Brrk,        X86::VPERMT2Brmk,          0 },
+    { X86::VPERMT2Drrk,        X86::VPERMT2Drmk,          0 },
+    { X86::VPERMT2PSrrk,       X86::VPERMT2PSrmk,         0 },
+    { X86::VPERMT2PDrrk,       X86::VPERMT2PDrmk,         0 },
+    { X86::VPERMT2Qrrk,        X86::VPERMT2Qrmk,          0 },
+    { X86::VPERMT2Wrrk,        X86::VPERMT2Wrmk,          0 },
     { X86::VPERMWZrrk,         X86::VPERMWZrmk,           0 },
     { X86::VPMADDUBSWZrrk,     X86::VPMADDUBSWZrmk,       0 },
     { X86::VPMADDWDZrrk,       X86::VPMADDWDZrmk,         0 },
@@ -2662,9 +2706,7 @@
     { X86::VPSUBUSBZrrk,       X86::VPSUBUSBZrmk,         0 },
     { X86::VPSUBUSWZrrk,       X86::VPSUBUSWZrmk,         0 },
     { X86::VPTERNLOGDZrrik,    X86::VPTERNLOGDZrmik,      0 },
-    { X86::VPTERNLOGDZrrikz,   X86::VPTERNLOGDZrmikz,     0 },
     { X86::VPTERNLOGQZrrik,    X86::VPTERNLOGQZrmik,      0 },
-    { X86::VPTERNLOGQZrrikz,   X86::VPTERNLOGQZrmikz,     0 },
     { X86::VPUNPCKHBWZrrk,     X86::VPUNPCKHBWZrmk,       0 },
     { X86::VPUNPCKHDQZrrk,     X86::VPUNPCKHDQZrmk,       0 },
     { X86::VPUNPCKHQDQZrrk,    X86::VPUNPCKHQDQZrmk,      0 },
@@ -2723,9 +2765,23 @@
     { X86::VPANDQZ256rrk,      X86::VPANDQZ256rmk,        0 },
     { X86::VPERMBZ256rrk,      X86::VPERMBZ256rmk,        0 },
     { X86::VPERMDZ256rrk,      X86::VPERMDZ256rmk,        0 },
+    { X86::VPERMI2B256rrk,     X86::VPERMI2B256rmk,       0 },
+    { X86::VPERMI2D256rrk,     X86::VPERMI2D256rmk,       0 },
+    { X86::VPERMI2PD256rrk,    X86::VPERMI2PD256rmk,      0 },
+    { X86::VPERMI2PS256rrk,    X86::VPERMI2PS256rmk,      0 },
+    { X86::VPERMI2Q256rrk,     X86::VPERMI2Q256rmk,       0 },
+    { X86::VPERMI2W256rrk,     X86::VPERMI2W256rmk,       0 },
+    { X86::VPERMILPDZ256rrk,   X86::VPERMILPDZ256rmk,     0 },
+    { X86::VPERMILPSZ256rrk,   X86::VPERMILPSZ256rmk,     0 },
     { X86::VPERMPDZ256rrk,     X86::VPERMPDZ256rmk,       0 },
     { X86::VPERMPSZ256rrk,     X86::VPERMPSZ256rmk,       0 },
     { X86::VPERMQZ256rrk,      X86::VPERMQZ256rmk,        0 },
+    { X86::VPERMT2B256rrk,     X86::VPERMT2B256rmk,       0 },
+    { X86::VPERMT2D256rrk,     X86::VPERMT2D256rmk,       0 },
+    { X86::VPERMT2PD256rrk,    X86::VPERMT2PD256rmk,      0 },
+    { X86::VPERMT2PS256rrk,    X86::VPERMT2PS256rmk,      0 },
+    { X86::VPERMT2Q256rrk,     X86::VPERMT2Q256rmk,       0 },
+    { X86::VPERMT2W256rrk,     X86::VPERMT2W256rmk,       0 },
     { X86::VPERMWZ256rrk,      X86::VPERMWZ256rmk,        0 },
     { X86::VPMADDUBSWZ256rrk,  X86::VPMADDUBSWZ256rmk,    0 },
     { X86::VPMADDWDZ256rrk,    X86::VPMADDWDZ256rmk,      0 },
@@ -2741,9 +2797,7 @@
     { X86::VPSUBUSWZ256rrk,    X86::VPSUBUSWZ256rmk,      0 },
     { X86::VPSUBWZ256rrk,      X86::VPSUBWZ256rmk,        0 },
     { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik,   0 },
-    { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz,  0 },
     { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik,   0 },
-    { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz,  0 },
     { X86::VPUNPCKHBWZ256rrk,  X86::VPUNPCKHBWZ256rmk,    0 },
     { X86::VPUNPCKHDQZ256rrk,  X86::VPUNPCKHDQZ256rmk,    0 },
     { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk,   0 },
@@ -2797,6 +2851,20 @@
     { X86::VPANDNQZ128rrk,     X86::VPANDNQZ128rmk,       0 },
     { X86::VPANDQZ128rrk,      X86::VPANDQZ128rmk,        0 },
     { X86::VPERMBZ128rrk,      X86::VPERMBZ128rmk,        0 },
+    { X86::VPERMI2B128rrk,     X86::VPERMI2B128rmk,       0 },
+    { X86::VPERMI2D128rrk,     X86::VPERMI2D128rmk,       0 },
+    { X86::VPERMI2PD128rrk,    X86::VPERMI2PD128rmk,      0 },
+    { X86::VPERMI2PS128rrk,    X86::VPERMI2PS128rmk,      0 },
+    { X86::VPERMI2Q128rrk,     X86::VPERMI2Q128rmk,       0 },
+    { X86::VPERMI2W128rrk,     X86::VPERMI2W128rmk,       0 },
+    { X86::VPERMILPDZ128rrk,   X86::VPERMILPDZ128rmk,     0 },
+    { X86::VPERMILPSZ128rrk,   X86::VPERMILPSZ128rmk,     0 },
+    { X86::VPERMT2B128rrk,     X86::VPERMT2B128rmk,       0 },
+    { X86::VPERMT2D128rrk,     X86::VPERMT2D128rmk,       0 },
+    { X86::VPERMT2PD128rrk,    X86::VPERMT2PD128rmk,      0 },
+    { X86::VPERMT2PS128rrk,    X86::VPERMT2PS128rmk,      0 },
+    { X86::VPERMT2Q128rrk,     X86::VPERMT2Q128rmk,       0 },
+    { X86::VPERMT2W128rrk,     X86::VPERMT2W128rmk,       0 },
     { X86::VPERMWZ128rrk,      X86::VPERMWZ128rmk,        0 },
     { X86::VPMADDUBSWZ128rrk,  X86::VPMADDUBSWZ128rmk,    0 },
     { X86::VPMADDWDZ128rrk,    X86::VPMADDWDZ128rmk,      0 },
@@ -2812,9 +2880,7 @@
     { X86::VPSUBUSWZ128rrk,    X86::VPSUBUSWZ128rmk,      0 },
     { X86::VPSUBWZ128rrk,      X86::VPSUBWZ128rmk,        0 },
     { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik,   0 },
-    { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz,  0 },
     { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik,   0 },
-    { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz,  0 },
     { X86::VPUNPCKHBWZ128rrk,  X86::VPUNPCKHBWZ128rmk,    0 },
     { X86::VPUNPCKHDQZ128rrk,  X86::VPUNPCKHDQZ128rmk,    0 },
     { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk,   0 },
@@ -2833,6 +2899,54 @@
     { X86::VUNPCKLPSZ128rrk,   X86::VUNPCKLPSZ128rmk,     0 },
     { X86::VXORPDZ128rrk,      X86::VXORPDZ128rmk,        0 },
     { X86::VXORPSZ128rrk,      X86::VXORPSZ128rmk,        0 },
+
+    // 512-bit three source instructions with zero masking.
+    { X86::VPERMI2Brrkz,       X86::VPERMI2Brmkz,         0 },
+    { X86::VPERMI2Drrkz,       X86::VPERMI2Drmkz,         0 },
+    { X86::VPERMI2PSrrkz,      X86::VPERMI2PSrmkz,        0 },
+    { X86::VPERMI2PDrrkz,      X86::VPERMI2PDrmkz,        0 },
+    { X86::VPERMI2Qrrkz,       X86::VPERMI2Qrmkz,         0 },
+    { X86::VPERMI2Wrrkz,       X86::VPERMI2Wrmkz,         0 },
+    { X86::VPERMT2Brrkz,       X86::VPERMT2Brmkz,         0 },
+    { X86::VPERMT2Drrkz,       X86::VPERMT2Drmkz,         0 },
+    { X86::VPERMT2PSrrkz,      X86::VPERMT2PSrmkz,        0 },
+    { X86::VPERMT2PDrrkz,      X86::VPERMT2PDrmkz,        0 },
+    { X86::VPERMT2Qrrkz,       X86::VPERMT2Qrmkz,         0 },
+    { X86::VPERMT2Wrrkz,       X86::VPERMT2Wrmkz,         0 },
+    { X86::VPTERNLOGDZrrikz,   X86::VPTERNLOGDZrmikz,     0 },
+    { X86::VPTERNLOGQZrrikz,   X86::VPTERNLOGQZrmikz,     0 },
+
+    // 256-bit three source instructions with zero masking.
+    { X86::VPERMI2B256rrkz,    X86::VPERMI2B256rmkz,      0 },
+    { X86::VPERMI2D256rrkz,    X86::VPERMI2D256rmkz,      0 },
+    { X86::VPERMI2PD256rrkz,   X86::VPERMI2PD256rmkz,     0 },
+    { X86::VPERMI2PS256rrkz,   X86::VPERMI2PS256rmkz,     0 },
+    { X86::VPERMI2Q256rrkz,    X86::VPERMI2Q256rmkz,      0 },
+    { X86::VPERMI2W256rrkz,    X86::VPERMI2W256rmkz,      0 },
+    { X86::VPERMT2B256rrkz,    X86::VPERMT2B256rmkz,      0 },
+    { X86::VPERMT2D256rrkz,    X86::VPERMT2D256rmkz,      0 },
+    { X86::VPERMT2PD256rrkz,   X86::VPERMT2PD256rmkz,     0 },
+    { X86::VPERMT2PS256rrkz,   X86::VPERMT2PS256rmkz,     0 },
+    { X86::VPERMT2Q256rrkz,    X86::VPERMT2Q256rmkz,      0 },
+    { X86::VPERMT2W256rrkz,    X86::VPERMT2W256rmkz,      0 },
+    { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz,  0 },
+    { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz,  0 },
+
+    // 128-bit three source instructions with zero masking.
+    { X86::VPERMI2B128rrkz,    X86::VPERMI2B128rmkz,      0 },
+    { X86::VPERMI2D128rrkz,    X86::VPERMI2D128rmkz,      0 },
+    { X86::VPERMI2PD128rrkz,   X86::VPERMI2PD128rmkz,     0 },
+    { X86::VPERMI2PS128rrkz,   X86::VPERMI2PS128rmkz,     0 },
+    { X86::VPERMI2Q128rrkz,    X86::VPERMI2Q128rmkz,      0 },
+    { X86::VPERMI2W128rrkz,    X86::VPERMI2W128rmkz,      0 },
+    { X86::VPERMT2B128rrkz,    X86::VPERMT2B128rmkz,      0 },
+    { X86::VPERMT2D128rrkz,    X86::VPERMT2D128rmkz,      0 },
+    { X86::VPERMT2PD128rrkz,   X86::VPERMT2PD128rmkz,     0 },
+    { X86::VPERMT2PS128rrkz,   X86::VPERMT2PS128rmkz,     0 },
+    { X86::VPERMT2Q128rrkz,    X86::VPERMT2Q128rmkz,      0 },
+    { X86::VPERMT2W128rrkz,    X86::VPERMT2W128rmkz,      0 },
+    { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz,  0 },
+    { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz,  0 },
   };
 
   for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
@@ -3878,7 +3992,7 @@
     NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
                           .addOperand(Dest)
                           .addOperand(Src),
-                      MI.getOperand(2).getImm());
+                      MI.getOperand(2));
     break;
   case X86::ADD32ri:
   case X86::ADD32ri8:
@@ -3901,7 +4015,7 @@
     if (ImplicitOp.getReg() != 0)
       MIB.addOperand(ImplicitOp);
 
-    NewMI = addOffset(MIB, MI.getOperand(2).getImm());
+    NewMI = addOffset(MIB, MI.getOperand(2));
     break;
   }
   case X86::ADD16ri:
@@ -3915,7 +4029,7 @@
     NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
                           .addOperand(Dest)
                           .addOperand(Src),
-                      MI.getOperand(2).getImm());
+                      MI.getOperand(2));
     break;
   }
 
@@ -8365,16 +8479,12 @@
   { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
   { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
   { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
-  { X86::FsANDNPSrr, X86::FsANDNPDrr,X86::PANDNrr   },
   { X86::ANDPSrm,    X86::ANDPDrm,   X86::PANDrm    },
   { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
-  { X86::FsANDPSrr,  X86::FsANDPDrr, X86::PANDrr    },
   { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
   { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
-  { X86::FsORPSrr,   X86::FsORPDrr,  X86::PORrr     },
   { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
   { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
-  { X86::FsXORPSrr,  X86::FsXORPDrr, X86::PXORrr    },
   // AVX 128-bit support
   { X86::VMOVAPSmr,  X86::VMOVAPDmr,  X86::VMOVDQAmr  },
   { X86::VMOVAPSrm,  X86::VMOVAPDrm,  X86::VMOVDQArm  },
@@ -8385,16 +8495,12 @@
   { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
   { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNrm   },
   { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNrr   },
-  { X86::VFsANDNPSrr,X86::VFsANDNPDrr,X86::VPANDNrr   },
   { X86::VANDPSrm,   X86::VANDPDrm,   X86::VPANDrm    },
   { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDrr    },
-  { X86::VFsANDPSrr, X86::VFsANDPDrr, X86::VPANDrr    },
   { X86::VORPSrm,    X86::VORPDrm,    X86::VPORrm     },
   { X86::VORPSrr,    X86::VORPDrr,    X86::VPORrr     },
-  { X86::VFsORPSrr,  X86::VFsORPDrr,  X86::VPORrr     },
   { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORrm    },
   { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORrr    },
-  { X86::VFsXORPSrr, X86::VFsXORPDrr, X86::VPXORrr    },
   // AVX 256-bit support
   { X86::VMOVAPSYmr,   X86::VMOVAPDYmr,   X86::VMOVDQAYmr  },
   { X86::VMOVAPSYrm,   X86::VMOVAPDYrm,   X86::VMOVDQAYrm  },
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 0118503..3803671 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -948,10 +948,10 @@
 // Eventually, it would be nice to allow ConstantHoisting to merge constants
 // globally for potentially added savings.
 //
-def imm8_su : PatLeaf<(i8 imm), [{
+def imm8_su : PatLeaf<(i8 relocImm), [{
     return !shouldAvoidImmediateInstFormsForSize(N);
 }]>;
-def imm16_su : PatLeaf<(i16 imm), [{
+def imm16_su : PatLeaf<(i16 relocImm), [{
     return !shouldAvoidImmediateInstFormsForSize(N);
 }]>;
 def imm32_su : PatLeaf<(i32 relocImm), [{
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 070bf49..c9a6b4e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2777,39 +2777,6 @@
 // SSE 1 & 2 - Logical Instructions
 //===----------------------------------------------------------------------===//
 
-// Multiclass for scalars using the X86 logical operation aliases for FP.
-multiclass sse12_fp_packed_scalar_logical_alias<
-    bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
-  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-                FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
-                PS, VEX_4V;
-
-  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-                FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
-                PD, VEX_4V;
-
-  let Constraints = "$src1 = $dst" in {
-    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
-                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
-
-    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
-                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
-  }
-}
-
-let isCodeGenOnly = 1 in {
-  defm FsAND  : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
-                SSE_BIT_ITINS_P>;
-  defm FsOR   : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
-                SSE_BIT_ITINS_P>;
-  defm FsXOR  : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
-                SSE_BIT_ITINS_P>;
-
-  let isCommutable = 0 in
-    defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
-                  SSE_BIT_ITINS_P>;
-}
-
 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
 ///
 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
@@ -2965,6 +2932,43 @@
             (VANDNPDYrm VR256:$src1, addr:$src2)>;
 }
 
+let Predicates = [HasAVX] in {
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VANDPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VXORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VANDNPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+
+  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VANDPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VXORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VANDNPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
 let Predicates = [UseSSE1] in {
   def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
             (ANDPSrr VR128:$src1, VR128:$src2)>;
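The scalar-to-packed patterns added in this file (under HasAVX here and under UseSSE1/UseSSE2 below) select scalar FP logical nodes by bouncing the value through VR128 and using the packed ANDPS/ORPS/XORPS/ANDNPS (and the PD variants), replacing the removed FsAND/FsOR/FsXOR/FsANDN pseudo forms. Semantically each node is just a bitwise operation on the value's bit pattern; a minimal C++ sketch of what X86fand computes for f32 (illustrative only, not part of the patch):

    #include <cstdint>
    #include <cstring>

    float fand(float a, float b) {
      std::uint32_t ia, ib;
      std::memcpy(&ia, &a, sizeof ia);  // reinterpret the float bits
      std::memcpy(&ib, &b, sizeof ib);
      std::uint32_t ir = ia & ib;       // ANDPS performs this AND per lane
      float r;
      std::memcpy(&r, &ir, sizeof r);
      return r;
    }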
@@ -2983,6 +2987,24 @@
             (XORPSrm VR128:$src1, addr:$src2)>;
   def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
             (ANDNPSrm VR128:$src1, addr:$src2)>;
+
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ANDPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (XORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ANDNPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
 }
 
 let Predicates = [UseSSE2] in {
@@ -3003,6 +3025,24 @@
             (XORPDrm VR128:$src1, addr:$src2)>;
   def : Pat<(X86fandn VR128:$src1, (memopv2f64 addr:$src2)),
             (ANDNPDrm VR128:$src1, addr:$src2)>;
+
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ANDPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (XORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ANDNPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3346,8 +3386,8 @@
 /// the HW instructions are 2 operand / destructive.
 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           ValueType vt, ValueType ScalarVT,
-                          X86MemOperand x86memop, Operand vec_memop,
-                          ComplexPattern mem_cpat, Intrinsic Intr,
+                          X86MemOperand x86memop,
+                          Intrinsic Intr,
                           SDNode OpNode, Domain d, OpndItins itins,
                           Predicate target, string Suffix> {
   let hasSideEffects = 0 in {
@@ -3367,23 +3407,17 @@
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   let mayLoad = 1 in
-  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
+  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
   }
 
   let Predicates = [target] in {
-  def : Pat<(vt (OpNode mem_cpat:$src)),
-            (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
-                 (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
   // These are unary operations, but they are modeled as having 2 source operands
   // because the high elements of the destination are unchanged in SSE.
   def : Pat<(Intr VR128:$src),
             (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
-  def : Pat<(Intr (load addr:$src)),
-            (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
-                                      addr:$src), VR128))>;
   }
   // We don't want to fold scalar loads into these instructions unless
   // optimizing for size. This is because the folded instruction will have a
@@ -3393,16 +3427,15 @@
   // which has a clobber before the rcp, vs.
   // rcpss mem, %xmm0
   let Predicates = [target, OptForSize] in {
-    def : Pat<(Intr mem_cpat:$src),
+    def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
                (!cast<Instruction>(NAME#Suffix##m_Int)
-                      (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+                      (vt (IMPLICIT_DEF)), addr:$src2)>;
   }
 }
 
 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           ValueType vt, ValueType ScalarVT,
-                          X86MemOperand x86memop, Operand vec_memop,
-                          ComplexPattern mem_cpat,
+                          X86MemOperand x86memop,
                           Intrinsic Intr, SDNode OpNode, Domain d,
                           OpndItins itins, string Suffix> {
   let hasSideEffects = 0 in {
@@ -3420,7 +3453,7 @@
              []>, Sched<[itins.Sched.Folded]>;
   let mayLoad = 1 in
   def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
-                (ins VR128:$src1, vec_memop:$src2),
+                (ins VR128:$src1, x86memop:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
@@ -3441,21 +3474,18 @@
   }
   let Predicates = [HasAVX] in {
    def : Pat<(Intr VR128:$src),
-             (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)),
+             (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
                                  VR128:$src)>;
   }
   let Predicates = [HasAVX, OptForSize] in {
-    def : Pat<(Intr mem_cpat:$src),
+    def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
               (!cast<Instruction>("V"#NAME#Suffix##m_Int)
-                    (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+                    (vt (IMPLICIT_DEF)), addr:$src2)>;
   }
   let Predicates = [UseAVX, OptForSize] in {
     def : Pat<(ScalarVT (OpNode (load addr:$src))),
               (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
             addr:$src)>;
-    def : Pat<(vt (OpNode mem_cpat:$src)),
-              (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
-                                  mem_cpat:$src)>;
   }
 }
 
@@ -3534,11 +3564,10 @@
 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           OpndItins itins> {
   defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
-                      ssmem, sse_load_f32,
                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                       SSEPackedSingle, itins, UseSSE1, "SS">, XS;
   defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
-                      f32mem, ssmem, sse_load_f32,
+                      f32mem,
                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                       SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
 }
@@ -3546,11 +3575,10 @@
 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           OpndItins itins> {
   defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
-                         sdmem, sse_load_f64,
                          !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                          OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
   defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
-                         f64mem, sdmem, sse_load_f64,
+                         f64mem,
                          !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                          OpNode, SSEPackedDouble, itins, "SD">,
                          XD, VEX_4V, VEX_LIG;
@@ -4595,6 +4623,7 @@
 //===---------------------------------------------------------------------===//
 // Move Int Doubleword to Packed Double Int
 //
+let ExeDomain = SSEPackedInt in {
 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -4645,11 +4674,12 @@
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+} // ExeDomain = SSEPackedInt
 
 //===---------------------------------------------------------------------===//
 // Move Int Doubleword to Single Scalar
 //
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
   def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert GR32:$src))],
@@ -4669,11 +4699,12 @@
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
 
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int to Packed Double Int
 //
+let ExeDomain = SSEPackedInt in {
 def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
@@ -4695,6 +4726,7 @@
                        [(store (i32 (extractelt (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)],
                                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
 
 def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
         (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
@@ -4711,6 +4743,7 @@
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int first element to Doubleword Int
 //
+let ExeDomain = SSEPackedInt in {
 let SchedRW = [WriteMove] in {
 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                           "movq\t{$src, $dst|$dst, $src}",
@@ -4735,11 +4768,12 @@
 def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
 
 //===---------------------------------------------------------------------===//
 // Bitcast FR64 <-> GR64
 //
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
   let Predicates = [UseAVX] in
   def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                           "movq\t{$src, $dst|$dst, $src}",
@@ -4766,12 +4800,12 @@
                          "movq\t{$src, $dst|$dst, $src}",
                          [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                          IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
 
 //===---------------------------------------------------------------------===//
 // Move Scalar Single to Double Int
 //
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
   def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))],
@@ -4788,7 +4822,7 @@
                         "movd\t{$src, $dst|$dst, $src}",
                         [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
 
 let Predicates = [UseAVX] in {
   let AddedComplexity = 15 in {
@@ -6268,10 +6302,10 @@
 // SSE4.1 - Round Instructions
 //===----------------------------------------------------------------------===//
 
-multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
-                            X86MemOperand x86memop, RegisterClass RC,
-                            PatFrag mem_frag32, PatFrag mem_frag64,
-                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
+multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                           X86MemOperand x86memop, RegisterClass RC,
+                           PatFrag mem_frag32, PatFrag mem_frag64,
+                           Intrinsic V4F32Int, Intrinsic V2F64Int> {
 let ExeDomain = SSEPackedSingle in {
   // Intrinsic operation, reg.
   // Vector intrinsic operation, reg
@@ -6312,35 +6346,73 @@
 } // ExeDomain = SSEPackedDouble
 }
 
-multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
+                          string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+  def SSr : SS4AIi8<opcss, MRMSrcReg,
+        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SSm : SS4AIi8<opcss, MRMSrcMem,
+        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  def SDr : SS4AIi8<opcsd, MRMSrcReg,
+        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SDm : SS4AIi8<opcsd, MRMSrcMem,
+        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
+                           string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+  def SSr : SS4AIi8<opcss, MRMSrcReg,
+                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SSm : SS4AIi8<opcss, MRMSrcMem,
+                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  def SDr : SS4AIi8<opcsd, MRMSrcReg,
+                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SDm : SS4AIi8<opcsd, MRMSrcMem,
+                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                             string OpcodeStr,
                             Intrinsic F32Int,
                             Intrinsic F64Int, bit Is2Addr = 1> {
-let ExeDomain = GenericDomain in {
-  // Operation, reg.
-  let hasSideEffects = 0 in
-  def SSr : SS4AIi8<opcss, MRMSrcReg,
-      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
-      !if(Is2Addr,
-          !strconcat(OpcodeStr,
-              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-          !strconcat(OpcodeStr,
-              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-      []>, Sched<[WriteFAdd]>;
-
-  // Operation, mem.
-  let mayLoad = 1, hasSideEffects = 0 in
-  def SSm : SS4AIi8<opcss, MRMSrcMem,
-        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
-        !if(Is2Addr,
-            !strconcat(OpcodeStr,
-                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-            !strconcat(OpcodeStr,
-                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
-
-  // Intrinsic operation, reg.
-  let isCodeGenOnly = 1 in
+let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
   def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
         !if(Is2Addr,
@@ -6351,8 +6423,6 @@
         [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
         Sched<[WriteFAdd]>;
 
-  // Intrinsic operation, mem.
-  let isCodeGenOnly = 1 in
   def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
         !if(Is2Addr,
@@ -6364,30 +6434,6 @@
              (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
         Sched<[WriteFAddLd, ReadAfterLd]>;
 
-  // Operation, reg.
-  let hasSideEffects = 0 in
-  def SDr : SS4AIi8<opcsd, MRMSrcReg,
-        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
-        !if(Is2Addr,
-            !strconcat(OpcodeStr,
-                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-            !strconcat(OpcodeStr,
-                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-        []>, Sched<[WriteFAdd]>;
-
-  // Operation, mem.
-  let mayLoad = 1, hasSideEffects = 0 in
-  def SDm : SS4AIi8<opcsd, MRMSrcMem,
-        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
-        !if(Is2Addr,
-            !strconcat(OpcodeStr,
-                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-            !strconcat(OpcodeStr,
-                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
-
-  // Intrinsic operation, reg.
-  let isCodeGenOnly = 1 in
   def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
         !if(Is2Addr,
@@ -6398,8 +6444,6 @@
         [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
         Sched<[WriteFAdd]>;
 
-  // Intrinsic operation, mem.
-  let isCodeGenOnly = 1 in
   def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
         !if(Is2Addr,
@@ -6410,23 +6454,24 @@
         [(set VR128:$dst,
               (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
         Sched<[WriteFAddLd, ReadAfterLd]>;
-} // ExeDomain = GenericDomain
+} // ExeDomain = GenericDomain, isCodeGenOnly = 1
 }
 
 // FP round - roundss, roundps, roundsd, roundpd
 let Predicates = [HasAVX] in {
   // Intrinsic form
-  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
-                                  loadv4f32, loadv2f64,
-                                  int_x86_sse41_round_ps,
-                                  int_x86_sse41_round_pd>, VEX;
-  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
-                                  loadv8f32, loadv4f64,
-                                  int_x86_avx_round_ps_256,
-                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
-  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
-                                  int_x86_sse41_round_ss,
-                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+  defm VROUND  : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
+                                 loadv4f32, loadv2f64,
+                                 int_x86_sse41_round_ps,
+                                 int_x86_sse41_round_pd>, VEX;
+  defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
+                                 loadv8f32, loadv4f64,
+                                 int_x86_avx_round_ps_256,
+                                 int_x86_avx_round_pd_256>, VEX, VEX_L;
+  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround",
+                                 int_x86_sse41_round_ss,
+                                 int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
 }
 
 let Predicates = [UseAVX] in {
@@ -6498,34 +6543,37 @@
             (VROUNDYPDr VR256:$src, (i32 0xB))>;
 }
 
-defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
-                               memopv4f32, memopv2f64,
-                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND  : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
+                              memopv4f32, memopv2f64, int_x86_sse41_round_ps,
+                              int_x86_sse41_round_pd>;
+
+defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+
 let Constraints = "$src1 = $dst" in
-defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round",
                                int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
 
 let Predicates = [UseSSE41] in {
   def : Pat<(ffloor FR32:$src),
-            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+            (ROUNDSSr FR32:$src, (i32 0x9))>;
   def : Pat<(f64 (ffloor FR64:$src)),
-            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+            (ROUNDSDr FR64:$src, (i32 0x9))>;
   def : Pat<(f32 (fnearbyint FR32:$src)),
-            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+            (ROUNDSSr FR32:$src, (i32 0xC))>;
   def : Pat<(f64 (fnearbyint FR64:$src)),
-            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+            (ROUNDSDr FR64:$src, (i32 0xC))>;
   def : Pat<(f32 (fceil FR32:$src)),
-            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+            (ROUNDSSr FR32:$src, (i32 0xA))>;
   def : Pat<(f64 (fceil FR64:$src)),
-            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+            (ROUNDSDr FR64:$src, (i32 0xA))>;
   def : Pat<(f32 (frint FR32:$src)),
-            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+            (ROUNDSSr FR32:$src, (i32 0x4))>;
   def : Pat<(f64 (frint FR64:$src)),
-            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+            (ROUNDSDr FR64:$src, (i32 0x4))>;
   def : Pat<(f32 (ftrunc FR32:$src)),
-            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+            (ROUNDSSr FR32:$src, (i32 0xB))>;
   def : Pat<(f64 (ftrunc FR64:$src)),
-            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+            (ROUNDSDr FR64:$src, (i32 0xB))>;
 
   def : Pat<(v4f32 (ffloor VR128:$src)),
             (ROUNDPSr VR128:$src, (i32 0x9))>;
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 6303b6e..e2be735 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -591,19 +591,20 @@
 
 def ROR8ri   : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
                    "ror{b}\t{$src2, $dst|$dst, $src2}",
-                   [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+                   [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))],
+                   IIC_SR>;
 def ROR16ri  : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
                    "ror{w}\t{$src2, $dst|$dst, $src2}",
-                   [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
+                   [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))],
                    IIC_SR>, OpSize16;
 def ROR32ri  : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
                    "ror{l}\t{$src2, $dst|$dst, $src2}",
-                   [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
+                   [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))],
                    IIC_SR>, OpSize32;
 def ROR64ri  : RIi8<0xC1, MRM1r, (outs GR64:$dst),
                     (ins GR64:$src1, u8imm:$src2),
                     "ror{q}\t{$src2, $dst|$dst, $src2}",
-                    [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
+                    [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))],
                     IIC_SR>;
 
 // Rotate by 1
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 2dcb8ce..5e5369e 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1192,8 +1192,8 @@
   X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
 
-  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1326,8 +1326,8 @@
   X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
 
-  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1345,8 +1345,8 @@
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
                      X86ISD::FMSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
 
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1365,8 +1365,8 @@
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
                      X86ISD::FNMSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
                      X86ISD::VFIXUPIMM, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1404,8 +1404,8 @@
   X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
 
-  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index ee37ea7..727ff70 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -92,6 +92,10 @@
   if (TM.getCodeModel() == CodeModel::Large)
     return X86II::MO_NO_FLAG;
 
+  // Absolute symbols can be referenced directly.
+  if (GV && GV->isAbsoluteSymbolRef())
+    return X86II::MO_NO_FLAG;
+
   if (TM.shouldAssumeDSOLocal(M, GV))
     return classifyLocalReference(GV);
 
@@ -228,6 +232,9 @@
   else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
            isTargetKFreeBSD() || In64BitMode)
     stackAlignment = 16;
+
+  assert((!isPMULLDSlow() || hasSSE41()) &&
+         "Feature Slow PMULLD can only be set on a subtarget with SSE4.1");
 }
 
 void X86Subtarget::initializeEnvironment() {
@@ -275,6 +282,7 @@
   HasMWAITX = false;
   HasMPX = false;
   IsBTMemSlow = false;
+  IsPMULLDSlow = false;
   IsSHLDSlow = false;
   IsUAMem16Slow = false;
   IsUAMem32Slow = false;
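Regarding the new early return in classifyGlobalReference: an absolute symbol is a GlobalValue carrying !absolute_symbol range metadata, so its address is a plain link-time constant and needs no GOT/PLT reference flag. A hedged IR-level illustration (the symbol name and range are made up):

    // @abs_sym = external global i8, !absolute_symbol !0
    // !0 = !{i64 0, i64 4096}
    // classifyGlobalReference(&abs_sym, M) can now return X86II::MO_NO_FLAG
    // without consulting the DSO-local logic.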
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6940d1e..92c1621 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -178,6 +178,10 @@
   /// True if SHLD instructions are slow.
   bool IsSHLDSlow;
 
+  /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
+  /// PMULUDQ.
+  bool IsPMULLDSlow;
+
   /// True if unaligned memory accesses of 16-bytes are slow.
   bool IsUAMem16Slow;
 
@@ -452,6 +456,7 @@
   bool hasMWAITX() const { return HasMWAITX; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
+  bool isPMULLDSlow() const { return IsPMULLDSlow; }
   bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
   bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
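The IsPMULLDSlow bit added here is only meaningful together with SSE4.1 (PMULLD is an SSE4.1 instruction), which the new assert in X86Subtarget.cpp enforces. A hedged sketch of how a lowering hook might consult it; the helper names below are hypothetical and not part of this patch:

    // Sketch only, not the actual X86ISelLowering code:
    if (Subtarget.hasSSE41() && !Subtarget.isPMULLDSlow())
      return lowerMulToPMULLD(Op, DAG);   // hypothetical helper
    return lowerMulViaPMULUDQ(Op, DAG);   // hypothetical fallback built from
                                          // PMULUDQ/PMULLW/PMULHW sequences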
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 241f475..753eb4c 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -30,6 +30,7 @@
 #include "llvm/IR/Operator.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/TrailingObjects.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -206,6 +207,37 @@
   Constant *Mask;
 };
 
+/// A POD-like structure that we use to store a global reference together with
+/// its metadata types. In this pass we frequently need to query the set of
+/// metadata types referenced by a global, which at the IR level is an expensive
+/// operation involving a map lookup; this data structure helps to reduce the
+/// number of times we need to do this lookup.
+class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
+  GlobalObject *GO;
+  size_t NTypes;
+
+  friend TrailingObjects;
+  size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
+
+public:
+  static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+                                  ArrayRef<MDNode *> Types) {
+    auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
+        totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
+    GTM->GO = GO;
+    GTM->NTypes = Types.size();
+    std::uninitialized_copy(Types.begin(), Types.end(),
+                            GTM->getTrailingObjects<MDNode *>());
+    return GTM;
+  }
+  GlobalObject *getGlobal() const {
+    return GO;
+  }
+  ArrayRef<MDNode *> types() const {
+    return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
+  }
+};
+
 class LowerTypeTestsModule {
   Module &M;
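GlobalTypeMember stores its type MDNodes inline after the object via llvm::TrailingObjects, which is why instances are built through the static create() on a BumpPtrAllocator rather than through a constructor. A minimal usage sketch, mirroring how the pass constructs these later in this file:

    llvm::BumpPtrAllocator Alloc;
    llvm::SmallVector<llvm::MDNode *, 2> Types;
    GO.getMetadata(llvm::LLVMContext::MD_type, Types);  // GO: a GlobalObject
    auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types);
    for (llvm::MDNode *Type : GTM->types())  // no metadata map lookup needed
      (void)Type;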
 
@@ -233,20 +265,20 @@
 
   BitSetInfo
   buildBitSet(Metadata *TypeId,
-              const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
+              const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
   ByteArrayInfo *createByteArray(BitSetInfo &BSI);
   void allocateByteArrays();
   Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI,
                           Value *BitOffset);
-  void
-  lowerTypeTestCalls(ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
-                     const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
+  void lowerTypeTestCalls(
+      ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+      const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
   Value *
   lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI,
                   Constant *CombinedGlobal,
                   const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);
   void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
-                                       ArrayRef<GlobalVariable *> Globals);
+                                       ArrayRef<GlobalTypeMember *> Globals);
   unsigned getJumpTableEntrySize();
   Type *getJumpTableEntryType();
   void createJumpTableEntry(raw_ostream &OS, Function *Dest, unsigned Distance);
@@ -254,13 +286,13 @@
                             GlobalVariable *JumpTable, unsigned Distance);
   void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
   void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
-                                 ArrayRef<Function *> Functions);
+                                 ArrayRef<GlobalTypeMember *> Functions);
   void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
-                                    ArrayRef<Function *> Functions);
+                                    ArrayRef<GlobalTypeMember *> Functions);
   void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
-                                     ArrayRef<Function *> Functions);
+                                     ArrayRef<GlobalTypeMember *> Functions);
   void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
-                                   ArrayRef<GlobalObject *> Globals);
+                                   ArrayRef<GlobalTypeMember *> Globals);
 
   void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT);
   void moveInitializerToModuleConstructor(GlobalVariable *GV);
@@ -296,16 +328,14 @@
 /// Build a bit set for TypeId using the object layouts in
 /// GlobalLayout.
 BitSetInfo LowerTypeTestsModule::buildBitSet(
-    Metadata *TypeId, const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
+    Metadata *TypeId,
+    const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
   BitSetBuilder BSB;
 
   // Compute the byte offset of each address associated with this type
   // identifier.
-  SmallVector<MDNode *, 2> Types;
   for (auto &GlobalAndOffset : GlobalLayout) {
-    Types.clear();
-    GlobalAndOffset.first->getMetadata(LLVMContext::MD_type, Types);
-    for (MDNode *Type : Types) {
+    for (MDNode *Type : GlobalAndOffset.first->types()) {
       if (Type->getOperand(1) != TypeId)
         continue;
       uint64_t Offset =
@@ -521,16 +551,17 @@
 /// Given a disjoint set of type identifiers and globals, lay out the globals,
 /// build the bit sets and lower the llvm.type.test calls.
 void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
-    ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalVariable *> Globals) {
+    ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
   // Build a new global with the combined contents of the referenced globals.
   // This global is a struct whose even-indexed elements contain the original
   // contents of the referenced globals and whose odd-indexed elements contain
   // any padding required to align the next element to the next power of 2.
   std::vector<Constant *> GlobalInits;
   const DataLayout &DL = M.getDataLayout();
-  for (GlobalVariable *G : Globals) {
-    GlobalInits.push_back(G->getInitializer());
-    uint64_t InitSize = DL.getTypeAllocSize(G->getValueType());
+  for (GlobalTypeMember *G : Globals) {
+    GlobalVariable *GV = cast<GlobalVariable>(G->getGlobal());
+    GlobalInits.push_back(GV->getInitializer());
+    uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
 
     // Compute the amount of padding required.
     uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
@@ -554,7 +585,7 @@
   const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy);
 
   // Compute the offsets of the original globals within the new global.
-  DenseMap<GlobalObject *, uint64_t> GlobalLayout;
+  DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
   for (unsigned I = 0; I != Globals.size(); ++I)
     // Multiply by 2 to account for padding elements.
     GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2);
@@ -565,31 +596,36 @@
   // global from which we built the combined global, and replace references
   // to the original globals with references to the aliases.
   for (unsigned I = 0; I != Globals.size(); ++I) {
+    GlobalVariable *GV = cast<GlobalVariable>(Globals[I]->getGlobal());
+
     // Multiply by 2 to account for padding elements.
     Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
                                       ConstantInt::get(Int32Ty, I * 2)};
     Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
         NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
     if (LinkerSubsectionsViaSymbols) {
-      Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr);
+      GV->replaceAllUsesWith(CombinedGlobalElemPtr);
     } else {
-      assert(Globals[I]->getType()->getAddressSpace() == 0);
+      assert(GV->getType()->getAddressSpace() == 0);
       GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0,
-                                                Globals[I]->getLinkage(), "",
+                                                GV->getLinkage(), "",
                                                 CombinedGlobalElemPtr, &M);
-      GAlias->setVisibility(Globals[I]->getVisibility());
-      GAlias->takeName(Globals[I]);
-      Globals[I]->replaceAllUsesWith(GAlias);
+      GAlias->setVisibility(GV->getVisibility());
+      GAlias->takeName(GV);
+      GV->replaceAllUsesWith(GAlias);
     }
-    Globals[I]->eraseFromParent();
+    GV->eraseFromParent();
   }
 }
 
 void LowerTypeTestsModule::lowerTypeTestCalls(
     ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
-    const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {
+    const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
   Constant *CombinedGlobalIntAddr =
       ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy);
+  DenseMap<GlobalObject *, uint64_t> GlobalObjLayout;
+  for (auto &P : GlobalLayout)
+    GlobalObjLayout[P.first->getGlobal()] = P.second;
 
   // For each type identifier in this disjoint set...
   for (Metadata *TypeId : TypeIds) {
@@ -609,7 +645,7 @@
     for (CallInst *CI : TypeTestCallSites[TypeId]) {
       ++NumTypeTestCallsLowered;
       Value *Lowered =
-          lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalLayout);
+          lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalObjLayout);
       CI->replaceAllUsesWith(Lowered);
       CI->eraseFromParent();
     }
@@ -647,6 +683,7 @@
     case Triple::x86_64:
       return kX86JumpTableEntrySize;
     case Triple::arm:
+    case Triple::thumb:
     case Triple::aarch64:
       return kARMJumpTableEntrySize;
     default:
@@ -694,6 +731,8 @@
     OS << "int3\nint3\nint3\n";
   } else if (Arch == Triple::arm || Arch == Triple::aarch64) {
     OS << "b " << Name << "\n";
+  } else if (Arch == Triple::thumb) {
+    OS << "b.w " << Name << "\n";
   } else {
     report_fatal_error("Unsupported architecture for jump tables");
   }
@@ -718,8 +757,13 @@
   else if (!Dest->hasLocalLinkage())
     OS << ".globl " << Name << "\n";
   OS << ".type " << Name << ", function\n";
-  OS << Name << " = " << JumpTable->getName() << " + "
-     << (getJumpTableEntrySize() * Distance) << "\n";
+  if (Arch == Triple::thumb) {
+    OS << ".thumb_set " << Name << ", " << JumpTable->getName() << " + "
+       << (getJumpTableEntrySize() * Distance) << "\n";
+  } else {
+    OS << Name << " = " << JumpTable->getName() << " + "
+       << (getJumpTableEntrySize() * Distance) << "\n";
+  }
   OS << ".size " << Name << ", " << getJumpTableEntrySize() << "\n";
 }
 
@@ -730,9 +774,9 @@
 /// Given a disjoint set of type identifiers and functions, build the bit sets
 /// and lower the llvm.type.test calls, architecture dependently.
 void LowerTypeTestsModule::buildBitSetsFromFunctions(
-    ArrayRef<Metadata *> TypeIds, ArrayRef<Function *> Functions) {
+    ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
   if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm ||
-      Arch == Triple::aarch64)
+      Arch == Triple::thumb || Arch == Triple::aarch64)
     buildBitSetsFromFunctionsNative(TypeIds, Functions);
   else if (Arch == Triple::wasm32 || Arch == Triple::wasm64)
     buildBitSetsFromFunctionsWASM(TypeIds, Functions);
@@ -803,7 +847,7 @@
 /// Given a disjoint set of type identifiers and functions, build a jump table
 /// for the functions, build the bit sets and lower the llvm.type.test calls.
 void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
-    ArrayRef<Metadata *> TypeIds, ArrayRef<Function *> Functions) {
+    ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
   // Unlike the global bitset builder, the function bitset builder cannot
   // re-arrange functions in a particular order and base its calculations on the
   // layout of the functions' entry points, as we have no idea how large a
@@ -884,7 +928,7 @@
   assert(!Functions.empty());
 
   // Build a simple layout based on the regular layout of jump tables.
-  DenseMap<GlobalObject *, uint64_t> GlobalLayout;
+  DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
   unsigned EntrySize = getJumpTableEntrySize();
   for (unsigned I = 0; I != Functions.size(); ++I)
     GlobalLayout[Functions[I]] = I * EntrySize;
@@ -905,48 +949,48 @@
   // Build aliases pointing to offsets into the jump table, and replace
   // references to the original functions with references to the aliases.
   for (unsigned I = 0; I != Functions.size(); ++I) {
+    Function *F = cast<Function>(Functions[I]->getGlobal());
+
     // Need a name for the asm label. Normally, unnamed functions get temporary
     // asm labels in TargetLoweringObjectFile but we don't have access to that
     // here.
-    if (!Functions[I]->hasName())
-      Functions[I]->setName("unnamed");
-    if (LinkerSubsectionsViaSymbols || Functions[I]->isDeclarationForLinker()) {
+    if (!F->hasName())
+      F->setName("unnamed");
+    if (LinkerSubsectionsViaSymbols || F->isDeclarationForLinker()) {
       Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
           ConstantExpr::getGetElementPtr(
               JumpTableType, JumpTable,
               ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
                                    ConstantInt::get(IntPtrTy, I)}),
-          Functions[I]->getType());
+          F->getType());
 
-
-      if (Functions[I]->isWeakForLinker()) {
-        AsmOS << ".weak " << Functions[I]->getName() << "\n";
-        replaceWeakDeclarationWithJumpTablePtr(Functions[I],
-                                               CombinedGlobalElemPtr);
+      if (F->isWeakForLinker()) {
+        AsmOS << ".weak " << F->getName() << "\n";
+        replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
       } else {
-        Functions[I]->replaceAllUsesWith(CombinedGlobalElemPtr);
+        F->replaceAllUsesWith(CombinedGlobalElemPtr);
       }
     } else {
-      assert(Functions[I]->getType()->getAddressSpace() == 0);
+      assert(F->getType()->getAddressSpace() == 0);
 
-      createJumpTableAlias(AsmOS, Functions[I], JumpTable, I);
+      createJumpTableAlias(AsmOS, F, JumpTable, I);
 
       Function *DeclAlias =
-          Function::Create(cast<FunctionType>(Functions[I]->getValueType()),
+          Function::Create(cast<FunctionType>(F->getValueType()),
                            GlobalValue::ExternalLinkage, "", &M);
       // Since the alias (DeclAlias) is actually a declaration, it can not have
       // internal linkage. Compensate for that by giving it hidden visibility.
       // With this we end up with a GOT relocation against a local symbol.
-      DeclAlias->setVisibility(Functions[I]->hasLocalLinkage()
+      DeclAlias->setVisibility(F->hasLocalLinkage()
                                    ? GlobalValue::HiddenVisibility
-                                   : Functions[I]->getVisibility());
-      DeclAlias->takeName(Functions[I]);
+                                   : F->getVisibility());
+      DeclAlias->takeName(F);
       // Unnamed functions can not be added to llvm.used.
-      Functions[I]->setName(DeclAlias->getName() + ".cfi");
-      Functions[I]->replaceAllUsesWith(DeclAlias);
+      F->setName(DeclAlias->getName() + ".cfi");
+      F->replaceAllUsesWith(DeclAlias);
     }
-    if (!Functions[I]->isDeclarationForLinker())
-      Functions[I]->setLinkage(GlobalValue::InternalLinkage);
+    if (!F->isDeclarationForLinker())
+      F->setLinkage(GlobalValue::InternalLinkage);
   }
 
   // Try to emit the jump table at the end of the text segment.
@@ -954,20 +998,22 @@
   // FIXME: this magic section name seems to do the trick.
   AsmOS << ".section " << (ObjectFormat == Triple::MachO
                                ? "__TEXT,__text,regular,pure_instructions"
-                               : ".text.cfi, \"ax\", @progbits")
+                               : ".text.cfi, \"ax\", %progbits")
         << "\n";
   // Align the whole table by entry size.
   AsmOS << ".balign " << EntrySize << "\n";
+  if (Arch == Triple::thumb)
+    AsmOS << ".thumb_func\n";
   AsmOS << JumpTable->getName() << ":\n";
   for (unsigned I = 0; I != Functions.size(); ++I)
-    createJumpTableEntry(AsmOS, Functions[I], I);
+    createJumpTableEntry(AsmOS, cast<Function>(Functions[I]->getGlobal()), I);
 
   M.appendModuleInlineAsm(AsmOS.str());
 
   SmallVector<GlobalValue *, 16> Used;
   Used.reserve(Functions.size());
   for (auto *F : Functions)
-    Used.push_back(F);
+    Used.push_back(F->getGlobal());
   appendToUsed(M, Used);
 }
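For Thumb targets the function above now emits .thumb_func before the table label and b.w branches for each entry, and createJumpTableAlias uses .thumb_set so the aliases keep the Thumb bit. The accumulated module inline asm roughly takes the following shape (function and table names below are illustrative, not from this patch):

    // per-alias .weak/.globl/.type lines, then for each defined function e.g.
    //   .thumb_set f, <jump table symbol> + 0
    //   .thumb_set g, <jump table symbol> + 4
    // followed by the table itself:
    //   .section .text.cfi, "ax", %progbits
    //   .balign 4
    //   .thumb_func
    //   <jump table symbol>:
    //   b.w f.cfi
    //   b.w g.cfi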
 
@@ -978,13 +1024,15 @@
 /// Note: Dynamic linking is not supported, as the WebAssembly ABI has not yet
 /// been finalized.
 void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
-    ArrayRef<Metadata *> TypeIds, ArrayRef<Function *> Functions) {
+    ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
   assert(!Functions.empty());
 
   // Build consecutive monotonic integer ranges for each call target set
-  DenseMap<GlobalObject *, uint64_t> GlobalLayout;
+  DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
 
-  for (Function *F : Functions) {
+  for (GlobalTypeMember *GTM : Functions) {
+    Function *F = cast<Function>(GTM->getGlobal());
+
     // Skip functions that are not address taken, to avoid bloating the table
     if (!F->hasAddressTaken())
       continue;
@@ -996,7 +1044,7 @@
     F->setMetadata("wasm.index", MD);
 
     // Assign the counter value
-    GlobalLayout[F] = IndirectIndex++;
+    GlobalLayout[GTM] = IndirectIndex++;
   }
 
   // The indirect function table index space starts at zero, so pass a NULL
@@ -1006,7 +1054,7 @@
 }
 
 void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
-    ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalObject *> Globals) {
+    ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
   llvm::DenseMap<Metadata *, uint64_t> TypeIdIndices;
   for (unsigned I = 0; I != TypeIds.size(); ++I)
     TypeIdIndices[TypeIds[I]] = I;
@@ -1014,12 +1062,9 @@
   // For each type identifier, build a set of indices that refer to members of
   // the type identifier.
   std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
-  SmallVector<MDNode *, 2> Types;
   unsigned GlobalIndex = 0;
-  for (GlobalObject *GO : Globals) {
-    Types.clear();
-    GO->getMetadata(LLVMContext::MD_type, Types);
-    for (MDNode *Type : Types) {
+  for (GlobalTypeMember *GTM : Globals) {
+    for (MDNode *Type : GTM->types()) {
       // Type = { offset, type identifier }
       unsigned TypeIdIndex = TypeIdIndices[Type->getOperand(1)];
       TypeMembers[TypeIdIndex].insert(GlobalIndex);
@@ -1043,32 +1088,32 @@
     GLB.addFragment(MemSet);
 
   // Build the bitsets from this disjoint set.
-  if (Globals.empty() || isa<GlobalVariable>(Globals[0])) {
+  if (Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal())) {
     // Build a vector of global variables with the computed layout.
-    std::vector<GlobalVariable *> OrderedGVs(Globals.size());
+    std::vector<GlobalTypeMember *> OrderedGVs(Globals.size());
     auto OGI = OrderedGVs.begin();
     for (auto &&F : GLB.Fragments) {
       for (auto &&Offset : F) {
-        auto GV = dyn_cast<GlobalVariable>(Globals[Offset]);
+        auto GV = dyn_cast<GlobalVariable>(Globals[Offset]->getGlobal());
         if (!GV)
           report_fatal_error("Type identifier may not contain both global "
                              "variables and functions");
-        *OGI++ = GV;
+        *OGI++ = Globals[Offset];
       }
     }
 
     buildBitSetsFromGlobalVariables(TypeIds, OrderedGVs);
   } else {
     // Build a vector of functions with the computed layout.
-    std::vector<Function *> OrderedFns(Globals.size());
+    std::vector<GlobalTypeMember *> OrderedFns(Globals.size());
     auto OFI = OrderedFns.begin();
     for (auto &&F : GLB.Fragments) {
       for (auto &&Offset : F) {
-        auto Fn = dyn_cast<Function>(Globals[Offset]);
+        auto Fn = dyn_cast<Function>(Globals[Offset]->getGlobal());
         if (!Fn)
           report_fatal_error("Type identifier may not contain both global "
                              "variables and functions");
-        *OFI++ = Fn;
+        *OFI++ = Globals[Offset];
       }
     }
 
@@ -1093,22 +1138,37 @@
   // Equivalence class set containing type identifiers and the globals that
   // reference them. This is used to partition the set of type identifiers in
   // the module into disjoint sets.
-  typedef EquivalenceClasses<PointerUnion<GlobalObject *, Metadata *>>
+  typedef EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>
       GlobalClassesTy;
   GlobalClassesTy GlobalClasses;
 
-  // Verify the type metadata and build a mapping from type identifiers to their
-  // last observed index in the list of globals. This will be used later to
-  // deterministically order the list of type identifiers.
-  llvm::DenseMap<Metadata *, unsigned> TypeIdIndices;
+  // Verify the type metadata and build a few data structures to let us
+  // efficiently enumerate the type identifiers associated with a global:
+  // a list of GlobalTypeMembers (a GlobalObject stored alongside a vector
+  // of associated type metadata) and a mapping from type identifiers to their
+  // list of GlobalTypeMembers and last observed index in the list of globals.
+  // The indices will be used later to deterministically order the list of type
+  // identifiers.
+  BumpPtrAllocator Alloc;
+  struct TIInfo {
+    unsigned Index;
+    std::vector<GlobalTypeMember *> RefGlobals;
+  };
+  llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
   unsigned I = 0;
   SmallVector<MDNode *, 2> Types;
   for (GlobalObject &GO : M.global_objects()) {
     Types.clear();
     GO.getMetadata(LLVMContext::MD_type, Types);
+    if (Types.empty())
+      continue;
+
+    auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types);
     for (MDNode *Type : Types) {
       verifyTypeMDNode(&GO, Type);
-      TypeIdIndices[cast<MDNode>(Type)->getOperand(1)] = ++I;
+      auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
+      Info.Index = ++I;
+      Info.RefGlobals.push_back(GTM);
     }
   }
 
@@ -1136,14 +1196,9 @@
     GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
 
     // Add the referenced globals to the type identifier's equivalence class.
-    for (GlobalObject &GO : M.global_objects()) {
-      Types.clear();
-      GO.getMetadata(LLVMContext::MD_type, Types);
-      for (MDNode *Type : Types)
-        if (Type->getOperand(1) == BitSet)
-          CurSet = GlobalClasses.unionSets(
-              CurSet, GlobalClasses.findLeader(GlobalClasses.insert(&GO)));
-    }
+    for (GlobalTypeMember *GTM : TypeIdInfo[BitSet].RefGlobals)
+      CurSet = GlobalClasses.unionSets(
+          CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
   }
 
   if (GlobalClasses.empty())
@@ -1163,7 +1218,7 @@
     for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
          MI != GlobalClasses.member_end(); ++MI) {
       if ((*MI).is<Metadata *>())
-        MaxIndex = std::max(MaxIndex, TypeIdIndices[MI->get<Metadata *>()]);
+        MaxIndex = std::max(MaxIndex, TypeIdInfo[MI->get<Metadata *>()].Index);
     }
     Sets.emplace_back(I, MaxIndex);
   }
@@ -1177,20 +1232,20 @@
   for (const auto &S : Sets) {
     // Build the list of type identifiers in this disjoint set.
     std::vector<Metadata *> TypeIds;
-    std::vector<GlobalObject *> Globals;
+    std::vector<GlobalTypeMember *> Globals;
     for (GlobalClassesTy::member_iterator MI =
              GlobalClasses.member_begin(S.first);
          MI != GlobalClasses.member_end(); ++MI) {
       if ((*MI).is<Metadata *>())
         TypeIds.push_back(MI->get<Metadata *>());
       else
-        Globals.push_back(MI->get<GlobalObject *>());
+        Globals.push_back(MI->get<GlobalTypeMember *>());
     }
 
     // Order type identifiers by global index for determinism. This ordering is
     // stable as there is a one-to-one mapping between metadata and indices.
     std::sort(TypeIds.begin(), TypeIds.end(), [&](Metadata *M1, Metadata *M2) {
-      return TypeIdIndices[M1] < TypeIdIndices[M2];
+      return TypeIdInfo[M1].Index < TypeIdInfo[M2].Index;
     });
 
     // Build bitsets for this disjoint set.
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 7ef5f24..844cc0f 100644
--- a/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -293,6 +293,7 @@
   void buildTypeIdentifierMap(
       std::vector<VTableBits> &Bits,
       DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+  Constant *getPointerAtOffset(Constant *I, uint64_t Offset);
   bool
   tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
                             const std::set<TypeMemberInfo> &TypeMemberInfos,
@@ -382,6 +383,38 @@
   }
 }
 
+Constant *DevirtModule::getPointerAtOffset(Constant *I, uint64_t Offset) {
+  if (I->getType()->isPointerTy()) {
+    if (Offset == 0)
+      return I;
+    return nullptr;
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+
+  if (auto *C = dyn_cast<ConstantStruct>(I)) {
+    const StructLayout *SL = DL.getStructLayout(C->getType());
+    if (Offset >= SL->getSizeInBytes())
+      return nullptr;
+
+    unsigned Op = SL->getElementContainingOffset(Offset);
+    return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
+                              Offset - SL->getElementOffset(Op));
+  }
+  if (auto *C = dyn_cast<ConstantArray>(I)) {
+    ArrayType *VTableTy = C->getType();
+    uint64_t ElemSize = DL.getTypeAllocSize(VTableTy->getElementType());
+
+    unsigned Op = Offset / ElemSize;
+    if (Op >= C->getNumOperands())
+      return nullptr;
+
+    return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
+                              Offset % ElemSize);
+  }
+  return nullptr;
+}
+
 bool DevirtModule::tryFindVirtualCallTargets(
     std::vector<VirtualCallTarget> &TargetsForSlot,
     const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
@@ -389,22 +422,12 @@
     if (!TM.Bits->GV->isConstant())
       return false;
 
-    auto Init = dyn_cast<ConstantArray>(TM.Bits->GV->getInitializer());
-    if (!Init)
-      return false;
-    ArrayType *VTableTy = Init->getType();
-
-    uint64_t ElemSize =
-        M.getDataLayout().getTypeAllocSize(VTableTy->getElementType());
-    uint64_t GlobalSlotOffset = TM.Offset + ByteOffset;
-    if (GlobalSlotOffset % ElemSize != 0)
+    Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
+                                       TM.Offset + ByteOffset);
+    if (!Ptr)
       return false;
 
-    unsigned Op = GlobalSlotOffset / ElemSize;
-    if (Op >= Init->getNumOperands())
-      return false;
-
-    auto Fn = dyn_cast<Function>(Init->getOperand(Op)->stripPointerCasts());
+    auto Fn = dyn_cast<Function>(Ptr->stripPointerCasts());
     if (!Fn)
       return false;
 
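The getPointerAtOffset helper added above descends through a constant initializer: at a struct it recurses into the element containing the offset, at an array it divides by the element size, and it succeeds only if it lands exactly on a pointer at a residual offset of zero. A self-contained sketch of the same descent over a toy layout, with byte sizes hard-coded instead of coming from DataLayout:

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Toy constant tree: a node is either a pointer leaf or an aggregate whose
// children sit at known byte offsets.
struct Node {
  bool IsPointer;
  uint64_t Size;                                          // allocation size, bytes
  std::vector<std::pair<uint64_t, const Node *>> Fields;  // (offset, child)
};

// Mirrors the recursion above: succeed only when the offset lands exactly on
// a pointer leaf; anything else (padding, a non-pointer scalar, out of range)
// yields nullptr.
const Node *pointerAtOffset(const Node &N, uint64_t Offset) {
  if (N.IsPointer)
    return Offset == 0 ? &N : nullptr;
  for (const auto &F : N.Fields)
    if (Offset >= F.first && Offset < F.first + F.second->Size)
      return pointerAtOffset(*F.second, Offset - F.first);
  return nullptr;
}

int main() {
  Node Ptr{true, 8, {}};                         // a virtual function slot
  Node I64{false, 8, {}};                        // e.g. an offset-to-top word
  Node Arr{false, 16, {{0, &Ptr}, {8, &Ptr}}};   // [2 x i8*]
  Node VTable{false, 24, {{0, &I64}, {8, &Arr}}};
  std::cout << (pointerAtOffset(VTable, 16) != nullptr) << "\n"; // 1: second slot
  std::cout << (pointerAtOffset(VTable, 4) != nullptr) << "\n";  // 0: inside the i64
}

This is what lets tryFindVirtualCallTargets handle vtables whose initializer is a struct wrapping the function-pointer array rather than a bare ConstantArray.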
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index 4f6225f..251b387 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -39,6 +39,12 @@
   SmallVector<Instruction*, 128> Worklist;
   bool Changed = false;
   for (Instruction &I : instructions(F)) {
+    // If the instruction has side effects and no non-dbg uses,
+    // skip it. This way we avoid computing known bits on an instruction
+    // that will not help us.
+    if (I.mayHaveSideEffects() && I.use_empty())
+      continue;
+
     if (I.getType()->isIntegerTy() &&
         !DB.getDemandedBits(&I).getBoolValue()) {
       // For live instructions that have all dead bits, first make them dead by
@@ -50,7 +56,7 @@
       // undef, poison, etc.
       Value *Zero = ConstantInt::get(I.getType(), 0);
       ++NumSimplified;
-      I.replaceAllUsesWith(Zero);
+      I.replaceNonMetadataUsesWith(Zero);
       Changed = true;
     }
     if (!DB.isInstructionDead(&I))
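The BDCE hunk above adds an early continue: an instruction that has side effects and whose result is unused cannot be simplified by demanded-bits reasoning, so computing known bits for it is wasted work (and the switch to replaceNonMetadataUsesWith keeps debug-info uses from being rewritten to constants). A small sketch of that kind of pre-filter over a generic worklist; the Inst type and its fields are illustrative stand-ins, not LLVM's Instruction:

#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for an instruction visited by a worklist pass.
struct Inst {
  std::string Name;
  bool HasSideEffects;
  unsigned NumUses;
};

int main() {
  std::vector<Inst> Insts = {
      {"call void @log()", true, 0},  // side effects, result unused
      {"%a = add i32 ...", false, 2},
      {"store i32 ...", true, 0},
  };

  // Mirror the new early continue: a side-effecting instruction with no uses
  // can never be replaced, so skip it before any expensive analysis.
  for (const Inst &I : Insts) {
    if (I.HasSideEffects && I.NumUses == 0) {
      std::cout << "skip:    " << I.Name << "\n";
      continue;
    }
    std::cout << "analyze: " << I.Name << "\n";
  }
}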
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 81f273f..9485bfd 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -1700,7 +1700,10 @@
     if (isa<PHINode>(V))
       V->takeName(LI);
     if (Instruction *I = dyn_cast<Instruction>(V))
-      if (LI->getDebugLoc())
+      // If instruction I has debug info, then we should not update it.
+      // Also, if I has a null DebugLoc, then it is still potentially incorrect
+      // to propagate LI's DebugLoc because LI may not post-dominate I.
+      if (LI->getDebugLoc() && ValuesPerBlock.size() != 1)
         I->setDebugLoc(LI->getDebugLoc());
     if (V->getType()->getScalarType()->isPointerTy())
       MD->invalidateCachedPointerInfo(V);
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index 230558b..90c26e1 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -802,6 +802,7 @@
 
         // Move the instruction at the end of HoistPt.
         Instruction *Last = HoistPt->getTerminator();
+        MD->removeInstruction(Repl);
         Repl->moveBefore(Last);
 
         DFSNumber[Repl] = DFSNumber[Last]++;
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 32f4867..279c710 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -916,7 +916,8 @@
 
   // If this is an AND or OR with 0 or -1, it doesn't matter that the other
   // operand is overdefined.
-  if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
+  if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
+      I.getOpcode() == Instruction::Or) {
     LatticeVal *NonOverdefVal = nullptr;
     if (!V1State.isOverdefined())
       NonOverdefVal = &V1State;
@@ -927,8 +928,10 @@
       if (NonOverdefVal->isUnknown())
         return;
 
-      if (I.getOpcode() == Instruction::And) {
+      if (I.getOpcode() == Instruction::And ||
+          I.getOpcode() == Instruction::Mul) {
         // X and 0 = 0
+        // X * 0 = 0
         if (NonOverdefVal->getConstant()->isNullValue())
           return markConstant(IV, &I, NonOverdefVal->getConstant());
       } else {
@@ -1396,8 +1399,8 @@
               break;
         }
 
-        // undef >>a X -> all ones
-        markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+        // undef >>a X -> 0
+        markForcedConstant(&I, Constant::getNullValue(ITy));
         return true;
       case Instruction::LShr:
       case Instruction::Shl:
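The SCCP hunk above extends the existing and/or shortcut to mul: even when one operand is overdefined, a known constant zero on the other side still forces the result to zero, so the instruction can be marked constant instead of overdefined (the same hunk also changes the folding of an ashr of undef from all-ones to zero). A small sketch of the mul shortcut with a three-state lattice; the names mimic SCCP's but the types are stand-ins:

#include <cassert>
#include <iostream>

// Three-state lattice in the spirit of SCCP's LatticeVal.
enum class State { Unknown, Constant, Overdefined };

struct LatticeVal {
  State S;
  long long C; // meaningful only when S == Constant
};

LatticeVal visitMul(const LatticeVal &A, const LatticeVal &B) {
  if (A.S == State::Constant && B.S == State::Constant)
    return {State::Constant, A.C * B.C};
  const LatticeVal *NonOverdef = nullptr;
  if (A.S != State::Overdefined)
    NonOverdef = &A;
  else if (B.S != State::Overdefined)
    NonOverdef = &B;
  if (NonOverdef) {
    if (NonOverdef->S == State::Unknown)
      return {State::Unknown, 0};  // wait for more information
    if (NonOverdef->C == 0)
      return {State::Constant, 0}; // X * 0 = 0, even if X is overdefined
  }
  return {State::Overdefined, 0};
}

int main() {
  LatticeVal Over{State::Overdefined, 0}, Zero{State::Constant, 0};
  LatticeVal R = visitMul(Over, Zero);
  assert(R.S == State::Constant && R.C == 0);
  std::cout << "overdefined * 0 -> constant 0\n";
}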
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 1f9d085..887818b 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -3982,16 +3982,16 @@
   if (!IsSorted)
     std::sort(AS.begin(), AS.end());
 
-  /// \brief Describes the allocas introduced by rewritePartition
-  /// in order to migrate the debug info.
-  struct Piece {
+  /// Describes the allocas introduced by rewritePartition in order to migrate
+  /// the debug info.
+  struct Fragment {
     AllocaInst *Alloca;
     uint64_t Offset;
     uint64_t Size;
-    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+    Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
       : Alloca(AI), Offset(O), Size(S) {}
   };
-  SmallVector<Piece, 4> Pieces;
+  SmallVector<Fragment, 4> Fragments;
 
   // Rewrite each partition.
   for (auto &P : AS.partitions()) {
@@ -4002,7 +4002,7 @@
         uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
         // Don't include any padding.
         uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
-        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+        Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
       }
     }
     ++NumPartitions;
@@ -4019,32 +4019,34 @@
     auto *Expr = DbgDecl->getExpression();
     DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
     uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
-    for (auto Piece : Pieces) {
-      // Create a piece expression describing the new partition or reuse AI's
+    for (auto Fragment : Fragments) {
+      // Create a fragment expression describing the new partition or reuse AI's
       // expression if there is only one partition.
-      auto *PieceExpr = Expr;
-      if (Piece.Size < AllocaSize || Expr->isBitPiece()) {
+      auto *FragmentExpr = Expr;
+      if (Fragment.Size < AllocaSize || Expr->isFragment()) {
         // If this alloca is already a scalar replacement of a larger aggregate,
-        // Piece.Offset describes the offset inside the scalar.
-        uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
-        uint64_t Start = Offset + Piece.Offset;
-        uint64_t Size = Piece.Size;
-        if (Expr->isBitPiece()) {
-          uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
+        // Fragment.Offset describes the offset inside the scalar.
+        uint64_t Offset =
+            Expr->isFragment() ? Expr->getFragmentOffsetInBits() : 0;
+        uint64_t Start = Offset + Fragment.Offset;
+        uint64_t Size = Fragment.Size;
+        if (Expr->isFragment()) {
+          uint64_t AbsEnd =
+              Expr->getFragmentOffsetInBits() + Expr->getFragmentSizeInBits();
           if (Start >= AbsEnd)
             // No need to describe a SROAed padding.
             continue;
           Size = std::min(Size, AbsEnd - Start);
         }
-        PieceExpr = DIB.createBitPieceExpression(Start, Size);
+        FragmentExpr = DIB.createFragmentExpression(Start, Size);
       }
 
       // Remove any existing dbg.declare intrinsic describing the same alloca.
-      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Fragment.Alloca))
         OldDDI->eraseFromParent();
 
-      DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
-                        &AI);
+      DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr,
+                        DbgDecl->getDebugLoc(), &AI);
     }
   }
   return Changed;
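The SROA changes above are largely a rename from "piece" to "fragment", but the fragment arithmetic they carry is worth spelling out: when an alloca that is itself described by a fragment expression gets split again, each new fragment starts at the old fragment offset plus the partition offset, its size is clamped so it never runs past the end of the enclosing fragment, and partitions that land entirely in padding produce no fragment at all. A small arithmetic sketch of that clamping, using plain integers rather than DIExpression:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>

struct Fragment {
  uint64_t OffsetInBits;
  uint64_t SizeInBits;
};

// Compose the fragment a new partition should describe, given the fragment
// the original alloca already described; returns nothing for pure padding.
std::optional<Fragment> composeFragment(Fragment Enclosing, uint64_t PartOffset,
                                        uint64_t PartSize) {
  uint64_t Start = Enclosing.OffsetInBits + PartOffset;
  uint64_t AbsEnd = Enclosing.OffsetInBits + Enclosing.SizeInBits;
  if (Start >= AbsEnd)
    return std::nullopt; // SROAed padding
  return Fragment{Start, std::min(PartSize, AbsEnd - Start)};
}

int main() {
  Fragment Var{32, 64}; // the alloca already described bits [32, 96)
  if (auto F = composeFragment(Var, /*PartOffset=*/48, /*PartSize=*/32))
    std::cout << "fragment at " << F->OffsetInBits << ", size "
              << F->SizeInBits << "\n"; // at 80, size 16 (clamped at bit 96)
  if (!composeFragment(Var, /*PartOffset=*/96, /*PartSize=*/8))
    std::cout << "padding partition, no fragment emitted\n";
}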
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 6c02094..ee083f9 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -1350,7 +1350,7 @@
 /// Update inlined instructions' line numbers to
 /// to encode location where these instructions are inlined.
 static void fixupLineNumbers(Function *Fn, Function::iterator FI,
-                             Instruction *TheCall) {
+                             Instruction *TheCall, bool CalleeHasDebugInfo) {
   const DebugLoc &TheCallDL = TheCall->getDebugLoc();
   if (!TheCallDL)
     return;
@@ -1372,22 +1372,26 @@
   for (; FI != Fn->end(); ++FI) {
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
          BI != BE; ++BI) {
-      DebugLoc DL = BI->getDebugLoc();
-      if (!DL) {
-        // If the inlined instruction has no line number, make it look as if it
-        // originates from the call location. This is important for
-        // ((__always_inline__, __nodebug__)) functions which must use caller
-        // location for all instructions in their function body.
-
-        // Don't update static allocas, as they may get moved later.
-        if (auto *AI = dyn_cast<AllocaInst>(BI))
-          if (allocaWouldBeStaticInEntry(AI))
-            continue;
-
-        BI->setDebugLoc(TheCallDL);
-      } else {
-        BI->setDebugLoc(updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+      if (DebugLoc DL = BI->getDebugLoc()) {
+        BI->setDebugLoc(
+            updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+        continue;
       }
+
+      if (CalleeHasDebugInfo)
+        continue;
+
+      // If the inlined instruction has no line number, make it look as if it
+      // originates from the call location. This is important for
+      // ((__always_inline__, __nodebug__)) functions which must use caller
+      // location for all instructions in their function body.
+
+      // Don't update static allocas, as they may get moved later.
+      if (auto *AI = dyn_cast<AllocaInst>(BI))
+        if (allocaWouldBeStaticInEntry(AI))
+          continue;
+
+      BI->setDebugLoc(TheCallDL);
     }
   }
 }
@@ -1643,8 +1647,11 @@
     if (IFI.CG)
       UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI);
 
-    // Update inlined instructions' line number information.
-    fixupLineNumbers(Caller, FirstNewBlock, TheCall);
+    // For 'nodebug' functions, the associated DISubprogram is always null.
+    // Conservatively avoid propagating the callsite debug location to
+    // instructions inlined from a function whose DISubprogram is not null.
+    fixupLineNumbers(Caller, FirstNewBlock, TheCall,
+                     CalledFunc->getSubprogram() != nullptr);
 
     // Clone existing noalias metadata if necessary.
     CloneAliasScopeMetadata(CS, VMap);
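The inliner change above threads a CalleeHasDebugInfo flag into fixupLineNumbers: an instruction that already carries a location only gets its inlined-at chain rewritten, while a missing location is backfilled with the call-site location only when the callee has no DISubprogram at all (the always_inline, nodebug case); otherwise the instruction is deliberately left without a location, since inventing one could be wrong. A compact sketch of just that decision, with illustrative names standing in for DebugLoc and friends:

#include <iostream>

enum class Action { UpdateInlinedAt, UseCallSiteLoc, LeaveUnset };

// Decide what to do with one inlined instruction's debug location.
//   HasLoc             - the instruction already carries a location
//   CalleeHasDebugInfo - the callee has a DISubprogram (i.e. is not nodebug)
//   IsStaticAlloca     - static allocas are skipped; they may be moved later
Action fixupAction(bool HasLoc, bool CalleeHasDebugInfo, bool IsStaticAlloca) {
  if (HasLoc)
    return Action::UpdateInlinedAt; // rewrite the inlined-at chain
  if (CalleeHasDebugInfo || IsStaticAlloca)
    return Action::LeaveUnset;      // don't invent a location
  return Action::UseCallSiteLoc;    // nodebug callee: borrow the call site
}

int main() {
  std::cout << (fixupAction(false, true, false) == Action::LeaveUnset) << "\n";
  std::cout << (fixupAction(false, false, false) == Action::UseCallSiteLoc) << "\n";
  std::cout << (fixupAction(true, true, false) == Action::UpdateInlinedAt) << "\n";
}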
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 01a5579..6de0f34 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1102,26 +1102,26 @@
   if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
     ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
   if (ExtendedArg) {
-    // We're now only describing a subset of the variable. The piece we're
+    // We're now only describing a subset of the variable. The fragment we're
     // describing will always be smaller than the variable size, because
     // VariableSize == Size of Alloca described by DDI. Since SI stores
     // to the alloca described by DDI, if its first operand is an extend,
     // we're guaranteed that before extension, the value was narrower than
     // the size of the alloca, hence the size of the described variable.
     SmallVector<uint64_t, 3> Ops;
-    unsigned PieceOffset = 0;
-    // If this already is a bit piece, we drop the bit piece from the expression
-    // and record the offset.
-    if (DIExpr->isBitPiece()) {
+    unsigned FragmentOffset = 0;
+    // If this already is a bit fragment, we drop the bit fragment from the
+    // expression and record the offset.
+    if (DIExpr->isFragment()) {
       Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
-      PieceOffset = DIExpr->getBitPieceOffset();
+      FragmentOffset = DIExpr->getFragmentOffsetInBits();
     } else {
       Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
     }
-    Ops.push_back(dwarf::DW_OP_bit_piece);
-    Ops.push_back(PieceOffset); // Offset
+    Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+    Ops.push_back(FragmentOffset);
     const DataLayout &DL = DDI->getModule()->getDataLayout();
-    Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size
+    Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
     auto NewDIExpr = Builder.createExpression(Ops);
     if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
       Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e4cf928..a7f2005 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -518,6 +518,10 @@
   /// induction variable will first be truncated to the corresponding type.
   void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
 
+  /// Returns true if an instruction \p I should be scalarized instead of
+  /// vectorized for the chosen vectorization factor.
+  bool shouldScalarizeInstruction(Instruction *I) const;
+
   /// Returns true if we should generate a scalar version of \p IV.
   bool needsScalarInduction(Instruction *IV) const;
 
@@ -1907,6 +1911,15 @@
     return MinBWs;
   }
 
+  /// \returns True if it is more profitable to scalarize instruction \p I for
+  /// vectorization factor \p VF.
+  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+    auto Scalars = InstsToScalarize.find(VF);
+    assert(Scalars != InstsToScalarize.end() &&
+           "VF not yet analyzed for scalarization profitability");
+    return Scalars->second.count(I);
+  }
+
 private:
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -1949,6 +1962,29 @@
   /// to this type.
   MapVector<Instruction *, uint64_t> MinBWs;
 
+  /// A type representing the costs for instructions if they were to be
+  /// scalarized rather than vectorized. The entries are Instruction-Cost
+  /// pairs.
+  typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+
+  /// A map holding scalar costs for different vectorization factors. The
+  /// presence of a cost for an instruction in the mapping indicates that the
+  /// instruction will be scalarized when vectorizing with the associated
+  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+
+  /// Returns the expected difference in cost from scalarizing the expression
+  /// feeding a predicated instruction \p PredInst. The instructions to
+  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+  /// non-negative return value implies the expression will be scalarized.
+  /// Currently, only single-use chains are considered for scalarization.
+  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+                              unsigned VF);
+
+  /// Collects the instructions to scalarize for each predicated instruction in
+  /// the loop.
+  void collectInstsToScalarize(unsigned VF);
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -2183,12 +2219,17 @@
   VecInd->addIncoming(LastInduction, LoopVectorLatch);
 }
 
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+  return Legal->isScalarAfterVectorization(I) ||
+         Cost->isProfitableToScalarize(I, VF);
+}
+
 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
-  if (Legal->isScalarAfterVectorization(IV))
+  if (shouldScalarizeInstruction(IV))
     return true;
   auto isScalarInst = [&](User *U) -> bool {
     auto *I = cast<Instruction>(U);
-    return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I));
+    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
   };
   return any_of(IV->users(), isScalarInst);
 }
@@ -2229,7 +2270,7 @@
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && IV->getType() == Induction->getType() && Step &&
-      !Legal->isScalarAfterVectorization(EntryVal)) {
+      !shouldScalarizeInstruction(EntryVal)) {
     createVectorIntInductionPHI(ID, EntryVal);
     VectorizedIV = true;
   }
@@ -4648,10 +4689,11 @@
       continue;
 
     // Scalarize instructions that should remain scalar after vectorization.
-    if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
+    if (VF > 1 &&
+        !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
           isa<DbgInfoIntrinsic>(&I)) &&
-        Legal->isScalarAfterVectorization(&I)) {
-      scalarizeInstruction(&I);
+        shouldScalarizeInstruction(&I)) {
+      scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
       continue;
     }
 
@@ -6124,6 +6166,7 @@
     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
 
     Factor.Width = UserVF;
+    collectInstsToScalarize(UserVF);
     return Factor;
   }
 
@@ -6530,10 +6573,160 @@
   return RUs;
 }
 
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+
+  // If we aren't vectorizing the loop, or if we've already collected the
+  // instructions to scalarize, there's nothing to do. Collection may already
+  // have occurred if we have a user-selected VF and are now computing the
+  // expected cost for interleaving.
+  if (VF < 2 || InstsToScalarize.count(VF))
+    return;
+
+  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+  // not profitable to scalarize any instructions, the presence of VF in the
+  // map will indicate that we've analyzed it already.
+  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+  // Find all the instructions that are scalar with predication in the loop and
+  // determine if it would be better to not if-convert the blocks they are in.
+  // If so, we also record the instructions to scalarize.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!Legal->blockNeedsPredication(BB))
+      continue;
+    for (Instruction &I : *BB)
+      if (Legal->isScalarWithPredication(&I)) {
+        ScalarCostsTy ScalarCosts;
+        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+      }
+  }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+    unsigned VF) {
+
+  assert(!Legal->isUniformAfterVectorization(PredInst) &&
+         "Instruction marked uniform-after-vectorization will be predicated");
+
+  // Initialize the discount to zero, meaning that the scalar version and the
+  // vector version cost the same.
+  int Discount = 0;
+
+  // Holds instructions to analyze. The instructions we visit are mapped in
+  // ScalarCosts. Those instructions are the ones that would be scalarized if
+  // we find that the scalar version costs less.
+  SmallVector<Instruction *, 8> Worklist;
+
+  // Returns true if the given instruction can be scalarized.
+  auto canBeScalarized = [&](Instruction *I) -> bool {
+
+    // We only attempt to scalarize instructions forming a single-use chain
+    // from the original predicated block that would otherwise be vectorized.
+    // Although not strictly necessary, we give up on instructions we know will
+    // already be scalar to avoid traversing chains that are unlikely to be
+    // beneficial.
+    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+        Legal->isScalarAfterVectorization(I))
+      return false;
+
+    // If the instruction is scalar with predication, it will be analyzed
+    // separately. We ignore it within the context of PredInst.
+    if (Legal->isScalarWithPredication(I))
+      return false;
+
+    // If any of the instruction's operands are uniform after vectorization,
+    // the instruction cannot be scalarized. This prevents, for example, a
+    // masked load from being scalarized.
+    //
+    // We assume we will only emit a value for lane zero of an instruction
+    // marked uniform after vectorization, rather than VF identical values.
+    // Thus, if we scalarize an instruction that uses a uniform, we would
+    // create uses of values corresponding to the lanes we aren't emitting code
+    // for. This behavior can be changed by allowing getScalarValue to clone
+    // the lane zero values for uniforms rather than asserting.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get()))
+        if (Legal->isUniformAfterVectorization(J))
+          return false;
+
+    // Otherwise, we can scalarize the instruction.
+    return true;
+  };
+
+  // Returns true if an operand that cannot be scalarized must be extracted
+  // from a vector. We will account for this scalarization overhead below. Note
+  // that the non-void predicated instructions are placed in their own blocks,
+  // and their return values are inserted into vectors. Thus, an extract would
+  // still be required.
+  auto needsExtract = [&](Instruction *I) -> bool {
+    return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+  };
+
+  // Compute the expected cost discount from scalarizing the entire expression
+  // feeding the predicated instruction. We currently only consider expressions
+  // that are single-use instruction chains.
+  Worklist.push_back(PredInst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+
+    // If we've already analyzed the instruction, there's nothing to do.
+    if (ScalarCosts.count(I))
+      continue;
+
+    // Compute the cost of the vector instruction. Note that this cost already
+    // includes the scalarization overhead of the predicated instruction.
+    unsigned VectorCost = getInstructionCost(I, VF).first;
+
+    // Compute the cost of the scalarized instruction. This cost is the cost of
+    // the instruction as if it wasn't if-converted and instead remained in the
+    // predicated block. We will scale this cost by block probability after
+    // computing the scalarization overhead.
+    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+    // Compute the scalarization overhead of needed insertelement instructions
+    // and phi nodes.
+    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+      ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
+                                             false, TTI);
+      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+    }
+
+    // Compute the scalarization overhead of needed extractelement
+    // instructions. For each of the instruction's operands, if the operand can
+    // be scalarized, add it to the worklist; otherwise, account for the
+    // overhead.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get())) {
+        assert(VectorType::isValidElementType(J->getType()) &&
+               "Instruction has non-scalar type");
+        if (canBeScalarized(J))
+          Worklist.push_back(J);
+        else if (needsExtract(J))
+          ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
+                                                 false, true, TTI);
+      }
+
+    // Scale the total scalar cost by block probability.
+    ScalarCost /= getReciprocalPredBlockProb();
+
+    // Compute the discount. A non-negative discount means the vector version
+    // of the instruction costs more, and scalarizing would be beneficial.
+    Discount += VectorCost - ScalarCost;
+    ScalarCosts[I] = ScalarCost;
+  }
+
+  return Discount;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
 
+  // Collect the instructions (and their associated costs) that will be more
+  // profitable to scalarize.
+  collectInstsToScalarize(VF);
+
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     VectorizationCostTy BlockCost;
@@ -6641,6 +6834,9 @@
   if (Legal->isUniformAfterVectorization(I))
     VF = 1;
 
+  if (VF > 1 && isProfitableToScalarize(I, VF))
+    return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
   Type *VectorTy;
   unsigned C = getInstructionCost(I, VF, VectorTy);
 
@@ -7007,7 +7203,14 @@
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
 
-  // Insert values known to be scalar into VecValuesToIgnore.
+  // Insert values known to be scalar into VecValuesToIgnore. This is a
+  // conservative estimation of the values that will later be scalarized.
+  //
+  // FIXME: Even though an instruction is not scalar-after-vectorization, it may
+  //        still be scalarized. For example, we may find an instruction to be
+  //        more profitable for a given vectorization factor if it were to be
+  //        scalarized. But at this point, we haven't yet computed the
+  //        vectorization factor.
   for (auto *BB : TheLoop->getBlocks())
     for (auto &I : *BB)
       if (Legal->isScalarAfterVectorization(&I))
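collectInstsToScalarize and computePredInstDiscount above decide, per predicated instruction and per VF, whether keeping a single-use feeding chain scalar (and predicated) beats if-converting and vectorizing it: each chain member's vector cost is weighed against VF scalar copies scaled down by the predicated block's execution probability, plus insert/extract/phi overhead, and a non-negative total discount means scalarize. A stripped-down numeric sketch of that accounting over a linear chain; all costs are made-up inputs rather than TTI queries, and the insert/extract overhead is assumed folded into them:

#include <iostream>
#include <vector>

struct ChainInst {
  const char *Name;
  unsigned VectorCost; // cost if vectorized (includes predication overhead)
  unsigned ScalarCost; // cost of one scalar copy
};

// Discount for scalarizing a single-use chain that feeds a predicated
// instruction: non-negative means the scalar, predicated form is no worse.
int predInstDiscount(const std::vector<ChainInst> &Chain, unsigned VF,
                     unsigned ReciprocalPredBlockProb) {
  int Discount = 0;
  for (const ChainInst &I : Chain) {
    // VF scalar copies, executed only when the predicated block runs.
    unsigned Scalar = VF * I.ScalarCost / ReciprocalPredBlockProb;
    Discount += int(I.VectorCost) - int(Scalar);
  }
  return Discount;
}

int main() {
  // A predicated store fed by an address computation, VF = 4, and the
  // predicated block assumed to execute one iteration in two.
  std::vector<ChainInst> Chain = {{"gep", 1, 1}, {"predicated store", 8, 1}};
  int D = predInstDiscount(Chain, /*VF=*/4, /*ReciprocalPredBlockProb=*/2);
  std::cout << "discount = " << D
            << (D >= 0 ? " -> scalarize the chain\n" : " -> keep it vector\n");
}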
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d1b569d..867de25 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3870,10 +3870,11 @@
 
   unsigned Opcode0 = I0->getOpcode();
 
-  // FIXME: Register size should be a parameter to this function, so we can
-  // try different vectorization factors.
   unsigned Sz = R.getVectorElementSize(I0);
-  unsigned VF = R.getMinVecRegSize() / Sz;
+  unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+  unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+  if (MaxVF < 2)
+    return false;
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
@@ -3889,76 +3890,89 @@
   // Keep track of values that were deleted by vectorizing in the loop below.
   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
 
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    unsigned OpsWidth = 0;
-
-    if (i + VF > e)
-      OpsWidth = e - i;
-    else
-      OpsWidth = VF;
-
-    if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
-      break;
-
-    // Check that a previous iteration of this loop did not delete the Value.
-    if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+  unsigned NextInst = 0, MaxInst = VL.size();
+  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
+       VF /= 2) {
+    // No actual vectorization should happen if the number of parts is the same
+    // as the provided vectorization factor (i.e. the scalar type is used for
+    // vector code during codegen).
+    auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+    if (TTI->getNumberOfParts(VecTy) == VF)
       continue;
+    for (unsigned I = NextInst; I < MaxInst; ++I) {
+      unsigned OpsWidth = 0;
 
-    DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
-                 << "\n");
-    ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
+      if (I + VF > MaxInst)
+        OpsWidth = MaxInst - I;
+      else
+        OpsWidth = VF;
 
-    ArrayRef<Value *> BuildVectorSlice;
-    if (!BuildVector.empty())
-      BuildVectorSlice = BuildVector.slice(i, OpsWidth);
+      if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+        break;
 
-    R.buildTree(Ops, BuildVectorSlice);
-    // TODO: check if we can allow reordering for more cases.
-    if (AllowReorder && R.shouldReorder()) {
-      // Conceptually, there is nothing actually preventing us from trying to
-      // reorder a larger list. In fact, we do exactly this when vectorizing
-      // reductions. However, at this point, we only expect to get here from
-      // tryToVectorizePair().
-      assert(Ops.size() == 2);
-      assert(BuildVectorSlice.empty());
-      Value *ReorderedOps[] = { Ops[1], Ops[0] };
-      R.buildTree(ReorderedOps, None);
-    }
-    if (R.isTreeTinyAndNotFullyVectorizable())
-      continue;
+      // Check that a previous iteration of this loop did not delete the Value.
+      if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
+        continue;
 
-    R.computeMinimumValueSizes();
-    int Cost = R.getTreeCost();
+      DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+                   << "\n");
+      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
 
-    if (Cost < -SLPCostThreshold) {
-      DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
-      Value *VectorizedRoot = R.vectorizeTree();
+      ArrayRef<Value *> BuildVectorSlice;
+      if (!BuildVector.empty())
+        BuildVectorSlice = BuildVector.slice(I, OpsWidth);
 
-      // Reconstruct the build vector by extracting the vectorized root. This
-      // way we handle the case where some elements of the vector are undefined.
-      //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
-      if (!BuildVectorSlice.empty()) {
-        // The insert point is the last build vector instruction. The vectorized
-        // root will precede it. This guarantees that we get an instruction. The
-        // vectorized tree could have been constant folded.
-        Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
-        unsigned VecIdx = 0;
-        for (auto &V : BuildVectorSlice) {
-          IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                                      ++BasicBlock::iterator(InsertAfter));
-          Instruction *I = cast<Instruction>(V);
-          assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
-          Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
-              VectorizedRoot, Builder.getInt32(VecIdx++)));
-          I->setOperand(1, Extract);
-          I->removeFromParent();
-          I->insertAfter(Extract);
-          InsertAfter = I;
-        }
+      R.buildTree(Ops, BuildVectorSlice);
+      // TODO: check if we can allow reordering for more cases.
+      if (AllowReorder && R.shouldReorder()) {
+        // Conceptually, there is nothing actually preventing us from trying to
+        // reorder a larger list. In fact, we do exactly this when vectorizing
+        // reductions. However, at this point, we only expect to get here from
+        // tryToVectorizePair().
+        assert(Ops.size() == 2);
+        assert(BuildVectorSlice.empty());
+        Value *ReorderedOps[] = {Ops[1], Ops[0]};
+        R.buildTree(ReorderedOps, None);
       }
-      // Move to the next bundle.
-      i += VF - 1;
-      Changed = true;
+      if (R.isTreeTinyAndNotFullyVectorizable())
+        continue;
+
+      R.computeMinimumValueSizes();
+      int Cost = R.getTreeCost();
+
+      if (Cost < -SLPCostThreshold) {
+        DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+        Value *VectorizedRoot = R.vectorizeTree();
+
+        // Reconstruct the build vector by extracting the vectorized root. This
+        // way we handle the case where some elements of the vector are
+        // undefined.
+        //  (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
+        if (!BuildVectorSlice.empty()) {
+          // The insert point is the last build vector instruction. The
+          // vectorized root will precede it. This guarantees that we get an
+          // instruction. The vectorized tree could have been constant folded.
+          Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
+          unsigned VecIdx = 0;
+          for (auto &V : BuildVectorSlice) {
+            IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+                                        ++BasicBlock::iterator(InsertAfter));
+            Instruction *I = cast<Instruction>(V);
+            assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
+            Instruction *Extract =
+                cast<Instruction>(Builder.CreateExtractElement(
+                    VectorizedRoot, Builder.getInt32(VecIdx++)));
+            I->setOperand(1, Extract);
+            I->removeFromParent();
+            I->insertAfter(Extract);
+            InsertAfter = I;
+          }
+        }
+        // Move to the next bundle.
+        I += VF - 1;
+        NextInst = I + 1;
+        Changed = true;
+      }
     }
   }
 
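The SLP change above stops hard-coding a single vectorization factor: it now starts from the largest power of two that fits the flat list (bounded below by the register-derived minimum), slides a window of that width over the values, and halves the VF to retry whatever is left, resuming after the last bundle that actually vectorized. A minimal sketch of just that iteration order, with the tree-cost decision stubbed out by a placeholder predicate:

#include <algorithm>
#include <iostream>

// Largest power of two <= N (N >= 1).
unsigned powerOf2Floor(unsigned N) {
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

int main() {
  const unsigned NumValues = 7; // length of the flat list of scalars
  const unsigned MinVF = 2;     // from MinVecRegSize / element size
  unsigned MaxVF = std::max(powerOf2Floor(NumValues), MinVF);

  unsigned NextInst = 0;
  for (unsigned VF = MaxVF; NextInst + 1 < NumValues && VF >= MinVF; VF /= 2) {
    for (unsigned I = NextInst; I < NumValues; ++I) {
      unsigned OpsWidth = std::min(VF, NumValues - I);
      if (OpsWidth < 2 || (OpsWidth & (OpsWidth - 1)) != 0)
        break; // not a power-of-two bundle; retry at a smaller VF
      std::cout << "try [" << I << ", " << I + OpsWidth << ") at VF=" << VF << "\n";
      bool Profitable = (VF == 2); // stand-in for the real tree-cost check
      if (Profitable) {
        I += VF - 1;      // skip past the bundle we just "vectorized"
        NextInst = I + 1; // narrower VFs only look at what is left
      }
    }
  }
}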
diff --git a/test/Analysis/ConstantFolding/vectorgep-crash.ll b/test/Analysis/ConstantFolding/vectorgep-crash.ll
new file mode 100644
index 0000000..bcc96b2
--- /dev/null
+++ b/test/Analysis/ConstantFolding/vectorgep-crash.ll
@@ -0,0 +1,19 @@
+; RUN: opt -instcombine -S -o - %s | FileCheck %s
+; Tests that we don't crash upon encountering a vector GEP
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%Dual = type { %Dual.72, %Partials.73 }
+%Dual.72 = type { double, %Partials }
+%Partials = type { [2 x double] }
+%Partials.73 = type { [2 x %Dual.72] }
+
+; Function Attrs: sspreq
+define <8 x i64*> @"julia_axpy!_65480"(%Dual* %arg1, <8 x i64> %arg2) {
+top:
+; CHECK: %VectorGep14 = getelementptr inbounds %Dual, %Dual* %arg1, <8 x i64> %arg2, i32 1, i32 0, i64 0, i32 1, i32 0, i64 0
+  %VectorGep14 = getelementptr inbounds %Dual, %Dual* %arg1, <8 x i64> %arg2, i32 1, i32 0, i64 0, i32 1, i32 0, i64 0
+  %0 = bitcast <8 x double*> %VectorGep14 to <8 x i64*>
+  ret <8 x i64*> %0
+}
diff --git a/test/Analysis/CostModel/AArch64/gep.ll b/test/Analysis/CostModel/AArch64/gep.ll
index 99284cb..f3d83c1 100644
--- a/test/Analysis/CostModel/AArch64/gep.ll
+++ b/test/Analysis/CostModel/AArch64/gep.ll
@@ -125,7 +125,7 @@
 
 define i64 @test16(i64* %p, i32 %i) {
 ; CHECK-LABEL: test16
-; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64*
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 -64
   %v = load i64, i64* %a
   ret i64 %v
@@ -194,3 +194,99 @@
   %v = load i64, i64* %a
   ret i64 %v
 }
+
+define i8 @test25(i8* %p, i32 %i) {
+; CHECK-LABEL: test25
+; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
+  %a = getelementptr inbounds i8, i8* %p, i32 -128
+  %v = load i8, i8* %a
+  ret i8 %v
+}
+
+define i16 @test26(i16* %p, i32 %i) {
+; CHECK-LABEL: test26
+; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
+  %a = getelementptr inbounds i16, i16* %p, i32 -128
+  %v = load i16, i16* %a
+  ret i16 %v
+}
+
+define i32 @test27(i32* %p, i32 %i) {
+; CHECK-LABEL: test27
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
+  %a = getelementptr inbounds i32, i32* %p, i32 -128
+  %v = load i32, i32* %a
+  ret i32 %v
+}
+
+define i64 @test28(i64* %p, i32 %i) {
+; CHECK-LABEL: test28
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
+  %a = getelementptr inbounds i64, i64* %p, i32 -128
+  %v = load i64, i64* %a
+  ret i64 %v
+}
+
+define i8 @test29(i8* %p, i32 %i) {
+; CHECK-LABEL: test29
+; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
+  %a = getelementptr inbounds i8, i8* %p, i32 -256
+  %v = load i8, i8* %a
+  ret i8 %v
+}
+
+define i16 @test30(i16* %p, i32 %i) {
+; CHECK-LABEL: test30
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16*
+  %a = getelementptr inbounds i16, i16* %p, i32 -256
+  %v = load i16, i16* %a
+  ret i16 %v
+}
+
+define i32 @test31(i32* %p, i32 %i) {
+; CHECK-LABEL: test31
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
+  %a = getelementptr inbounds i32, i32* %p, i32 -256
+  %v = load i32, i32* %a
+  ret i32 %v
+}
+
+define i64 @test32(i64* %p, i32 %i) {
+; CHECK-LABEL: test32
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
+  %a = getelementptr inbounds i64, i64* %p, i32 -256
+  %v = load i64, i64* %a
+  ret i64 %v
+}
+
+define i8 @test33(i8* %p, i32 %i) {
+; CHECK-LABEL: test33
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8*
+  %a = getelementptr inbounds i8, i8* %p, i32 -512
+  %v = load i8, i8* %a
+  ret i8 %v
+}
+
+define i16 @test34(i16* %p, i32 %i) {
+; CHECK-LABEL: test34
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16*
+  %a = getelementptr inbounds i16, i16* %p, i32 -512
+  %v = load i16, i16* %a
+  ret i16 %v
+}
+
+define i32 @test35(i32* %p, i32 %i) {
+; CHECK-LABEL: test35
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
+  %a = getelementptr inbounds i32, i32* %p, i32 -512
+  %v = load i32, i32* %a
+  ret i32 %v
+}
+
+define i64 @test36(i64* %p, i32 %i) {
+; CHECK-LABEL: test36
+; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
+  %a = getelementptr inbounds i64, i64* %p, i32 -512
+  %v = load i64, i64* %a
+  ret i64 %v
+}
diff --git a/test/Assembler/diexpression.ll b/test/Assembler/diexpression.ll
index 31be86c..dd69c0e 100644
--- a/test/Assembler/diexpression.ll
+++ b/test/Assembler/diexpression.ll
@@ -7,10 +7,10 @@
 ; CHECK:      !0 = !DIExpression()
 ; CHECK-NEXT: !1 = !DIExpression(DW_OP_deref)
 ; CHECK-NEXT: !2 = !DIExpression(DW_OP_plus, 3)
-; CHECK-NEXT: !3 = !DIExpression(DW_OP_bit_piece, 3, 7)
-; CHECK-NEXT: !4 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_bit_piece, 3, 7)
+; CHECK-NEXT: !3 = !DIExpression(DW_OP_LLVM_fragment, 3, 7)
+; CHECK-NEXT: !4 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_LLVM_fragment, 3, 7)
 !0 = !DIExpression()
 !1 = !DIExpression(DW_OP_deref)
 !2 = !DIExpression(DW_OP_plus, 3)
-!3 = !DIExpression(DW_OP_bit_piece, 3, 7)
-!4 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_bit_piece, 3, 7)
+!3 = !DIExpression(DW_OP_LLVM_fragment, 3, 7)
+!4 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_LLVM_fragment, 3, 7)
diff --git a/test/Bitcode/DIExpression-4.0.ll b/test/Bitcode/DIExpression-4.0.ll
new file mode 100644
index 0000000..848de9a
--- /dev/null
+++ b/test/Bitcode/DIExpression-4.0.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-dis -o - %s.bc | FileCheck %s
+
+@g = common global i32 0, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!7, !8}
+
+!0 = distinct !DIGlobalVariable(name: "g", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true, expr: !6)
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang (llvm/trunk 288154)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, globals: !4)
+!2 = !DIFile(filename: "a.c", directory: "/")
+!3 = !{}
+!4 = !{!0}
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; Old-style DIExpression bitcode records using DW_OP_bit_piece should be
+; upgraded to DW_OP_LLVM_fragment.
+;
+; CHECK: !DIExpression(DW_OP_LLVM_fragment, 8, 32)
+!6 = !DIExpression(DW_OP_bit_piece, 8, 32)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/test/Bitcode/DIExpression-4.0.ll.bc b/test/Bitcode/DIExpression-4.0.ll.bc
new file mode 100644
index 0000000..2ec9a59
--- /dev/null
+++ b/test/Bitcode/DIExpression-4.0.ll.bc
Binary files differ
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll
index e2e91af..95b2ea2 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll
@@ -49,3 +49,10 @@
                       [3 x float]* %x4, double* %x5, i8* %x6, i8* %x7) {
   ret i8* %x0
 }
+
+; CHECK-LABEL: name: args_arr
+; CHECK: %[[ARG0:[0-9]+]](s64) = COPY %d0
+; CHECK: %d0 = COPY %[[ARG0]]
+define [1 x double] @args_arr([1 x double] %d0) {
+  ret [1 x double] %d0
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index d9e8004..8d1dbc2 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -1,6 +1,8 @@
 ; RUN: not llc -O0 -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 ; RUN: llc -O0 -global-isel -global-isel-abort=0 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=FALLBACK
-; RUN: llc -O0 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=FALLBACK_WITH_REPORT
+; RUN: llc -O0 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o %t.out 2> %t.err
+; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-OUT < %t.out
+; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-ERR < %t.err
 ; This file checks that the fallback path to selection dag works.
 ; The test is fragile in the sense that it must be updated to expose
 ; something that fails with global-isel.
@@ -16,11 +18,100 @@
 ; FALLBACK: ldr q0,
 ; FALLBACK-NEXT: bl __fixunstfti
 ;
-; FALLBACK_WITH_REPORT: warning: Instruction selection used fallback path for ABIi128
-; FALLBACK_WITH_REPORT: ldr q0,
-; FALLBACK_WITH_REPORT-NEXT: bl __fixunstfti
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for ABIi128
+; FALLBACK-WITH-REPORT-OUT-LABEL: ABIi128:
+; FALLBACK-WITH-REPORT-OUT: ldr q0,
+; FALLBACK-WITH-REPORT-OUT-NEXT: bl __fixunstfti
 define i128 @ABIi128(i128 %arg1) {
-  %farg1 =       bitcast i128 %arg1 to fp128 
+  %farg1 =       bitcast i128 %arg1 to fp128
   %res = fptoui fp128 %farg1 to i128
   ret i128 %res
 }
+
+; It happens that we don't handle ConstantArray instances yet during
+; translation. Any other constant would be fine too.
+
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for constant
+; FALLBACK-WITH-REPORT-OUT-LABEL: constant:
+; FALLBACK-WITH-REPORT-OUT: fmov d0, #1.0
+define [1 x double] @constant() {
+  ret [1 x double] [double 1.0]
+}
+
+  ; The key problem here is that we may fail to create an MBB referenced by a
+  ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things
+  ; happen.
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis
+; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis:
+define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) {
+  br i1 %tst, label %true, label %false
+
+end:
+  %res = phi i32 [%val, %true], [42, %false]
+  ret i32 %res
+
+true:
+  store atomic i32 42, i32* %addr seq_cst, align 4
+  br label %end
+
+false:
+  br label %end
+
+}
+
+  ; General legalizer inability to handle types whose size wasn't a power of 2.
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type
+; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type:
+define void @odd_type(i42* %addr) {
+  %val42 = load i42, i42* %addr
+  ret void
+}
+
+  ; RegBankSelect crashed when given invalid mappings, and AArch64's
+  ; implementation produces valid-but-nonsense mappings for G_SEQUENCE.
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for sequence_mapping
+; FALLBACK-WITH-REPORT-OUT-LABEL: sequence_mapping:
+define void @sequence_mapping([2 x i64] %in) {
+  ret void
+}
+
+  ; Legalizer was asserting when it encountered an unexpected default action.
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for legal_default
+; FALLBACK-WITH-REPORT-LABEL: legal_default:
+define void @legal_default(i64 %in) {
+  insertvalue [2 x i64] undef, i64 %in, 0
+  ret void
+}
+
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for debug_insts
+; FALLBACK-WITH-REPORT-LABEL: debug_insts:
+define void @debug_insts(i32 %in) #0 !dbg !7 {
+entry:
+  %in.addr = alloca i32, align 4
+  store i32 %in, i32* %in.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %in.addr, metadata !11, metadata !12), !dbg !13
+  ret void, !dbg !14
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 289075) (llvm/trunk 289080)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "tmp.c", directory: "/Users/tim/llvm/build")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 4.0.0 (trunk 289075) (llvm/trunk 289080)"}
+!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DILocalVariable(name: "in", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!12 = !DIExpression()
+!13 = !DILocation(line: 1, column: 14, scope: !7)
+!14 = !DILocation(line: 2, column: 1, scope: !7)
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir
index a1626f2..e362521 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir
@@ -2325,7 +2325,7 @@
 # CHECK: %0 = MOVi32imm 42
 body:             |
   bb.0:
-    %0(s32) = G_CONSTANT 42
+    %0(s32) = G_CONSTANT i32 42
 ...
 
 ---
@@ -2340,7 +2340,7 @@
 # CHECK: %0 = MOVi64imm 1234567890123
 body:             |
   bb.0:
-    %0(s64) = G_CONSTANT 1234567890123
+    %0(s64) = G_CONSTANT i64 1234567890123
 ...
 
 ---
@@ -2392,7 +2392,7 @@
   bb.0:
       liveins: %x0
     %0(p0) = COPY %x0
-    %1(s64) = G_CONSTANT 42
+    %1(s64) = G_CONSTANT i64 42
     %2(p0) = G_GEP %0, %1(s64)
 ...
 
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 8ee6f1d..8a5514d 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
 
 ; This file checks that the translation from llvm IR to generic MachineInstr
 ; is correct.
@@ -7,8 +7,9 @@
 
 ; Tests for add.
 ; CHECK-LABEL: name: addi64
-; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0
+; CHECK:      [[ARG1:%[0-9]+]](s64) = COPY %x0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %x1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s64) = G_ADD [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %x0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %x0 
@@ -20,6 +21,7 @@
 ; CHECK-LABEL: name: muli64
 ; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %x1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s64) = G_MUL [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %x0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %x0
@@ -51,11 +53,13 @@
 ; CHECK-LABEL: name: uncondbr
 ; CHECK: body:
 ;
-; Entry basic block.
-; CHECK: {{[0-9a-zA-Z._-]+}}:
+; ABI/constant lowering basic block.
+; CHECK: {{bb.[0-9]+}}:
+; IR-level entry basic block
+; CHECK: {{bb.[0-9]+}}:
 ;
 ; Make sure we have one successor and only one.
-; CHECK-NEXT: successors: %[[END:[0-9a-zA-Z._-]+]](0x80000000)
+; CHECK-NEXT: successors: %[[END:bb.[0-9]+]](0x80000000)
 ;
 ; Check that we emit the correct branch.
 ; CHECK: G_BR %[[END]]
@@ -73,15 +77,18 @@
 ; CHECK-LABEL: name: condbr
 ; CHECK: body:
 ;
-; Entry basic block.
-; CHECK: {{[0-9a-zA-Z._-]+}}:
+; ABI/constant lowering basic block.
+; CHECK: {{bb.[0-9]+}}:
+; CHECK: [[ADDR:%.*]](p0) = COPY %x0
+
+; IR-level entry basic block
+; CHECK: {{bb.[0-9]+}}:
 ;
 ; Make sure we have two successors
-; CHECK-NEXT: successors: %[[TRUE:[0-9a-zA-Z._-]+]](0x40000000),
-; CHECK:                  %[[FALSE:[0-9a-zA-Z._-]+]](0x40000000)
+; CHECK-NEXT: successors: %[[TRUE:bb.[0-9]+]](0x40000000),
+; CHECK:                  %[[FALSE:bb.[0-9]+]](0x40000000)
 ;
 ; Check that we emit the correct branch.
-; CHECK: [[ADDR:%.*]](p0) = COPY %x0
 ; CHECK: [[TST:%.*]](s1) = G_LOAD [[ADDR]](p0)
 ; CHECK: G_BRCOND [[TST]](s1), %[[TRUE]]
 ; CHECK: G_BR %[[FALSE]]
@@ -104,6 +111,7 @@
 ; CHECK-LABEL: name: ori64
 ; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %x1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s64) = G_OR [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %x0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %x0
@@ -115,6 +123,7 @@
 ; CHECK-LABEL: name: ori32
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_OR [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -127,6 +136,7 @@
 ; CHECK-LABEL: name: xori64
 ; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %x1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s64) = G_XOR [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %x0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %x0
@@ -138,6 +148,7 @@
 ; CHECK-LABEL: name: xori32
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_XOR [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -150,6 +161,7 @@
 ; CHECK-LABEL: name: andi64
 ; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %x1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s64) = G_AND [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %x0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %x0
@@ -161,6 +173,7 @@
 ; CHECK-LABEL: name: andi32
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_AND [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -173,6 +186,7 @@
 ; CHECK-LABEL: name: subi64
 ; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %x1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s64) = G_SUB [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %x0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %x0
@@ -184,6 +198,7 @@
 ; CHECK-LABEL: name: subi32
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_SUB [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -364,7 +379,7 @@
   ; rest of the entry block.
 ; CHECK-LABEL: name: constant_int
 ; CHECK: [[IN:%[0-9]+]](s32) = COPY %w0
-; CHECK: [[ONE:%[0-9]+]](s32) = G_CONSTANT 1
+; CHECK: [[ONE:%[0-9]+]](s32) = G_CONSTANT i32 1
 ; CHECK: G_BR
 
 ; CHECK: [[SUM1:%[0-9]+]](s32) = G_ADD [[IN]], [[ONE]]
@@ -383,8 +398,8 @@
 }
 
 ; CHECK-LABEL: name: constant_int_start
-; CHECK: [[TWO:%[0-9]+]](s32) = G_CONSTANT 2
-; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT 42
+; CHECK: [[TWO:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT i32 42
 ; CHECK: [[RES:%[0-9]+]](s32) = G_ADD [[TWO]], [[ANSWER]]
 define i32 @constant_int_start() {
   %res = add i32 2, 42
@@ -399,7 +414,7 @@
 }
 
 ; CHECK-LABEL: name: test_constant_inttoptr
-; CHECK: [[ONE:%[0-9]+]](s64) = G_CONSTANT 1
+; CHECK: [[ONE:%[0-9]+]](s64) = G_CONSTANT i64 1
 ; CHECK: [[PTR:%[0-9]+]](p0) = G_INTTOPTR [[ONE]]
 ; CHECK: %x0 = COPY [[PTR]]
 define i8* @test_constant_inttoptr() {
@@ -409,7 +424,7 @@
   ; This failed purely because the Constant -> VReg map was kept across
   ; functions, so reuse the "i64 1" from above.
 ; CHECK-LABEL: name: test_reused_constant
-; CHECK: [[ONE:%[0-9]+]](s64) = G_CONSTANT 1
+; CHECK: [[ONE:%[0-9]+]](s64) = G_CONSTANT i64 1
 ; CHECK: %x0 = COPY [[ONE]]
 define i64 @test_reused_constant() {
   ret i64 1
@@ -436,6 +451,7 @@
 ; CHECK-LABEL: name: test_shl
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_SHL [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -448,6 +464,7 @@
 ; CHECK-LABEL: name: test_lshr
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_LSHR [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -459,6 +476,7 @@
 ; CHECK-LABEL: name: test_ashr
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_ASHR [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -470,6 +488,7 @@
 ; CHECK-LABEL: name: test_sdiv
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_SDIV [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -481,6 +500,7 @@
 ; CHECK-LABEL: name: test_udiv
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_UDIV [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -492,6 +512,7 @@
 ; CHECK-LABEL: name: test_srem
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_SREM [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -503,6 +524,7 @@
 ; CHECK-LABEL: name: test_urem
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %w1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_UREM [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %w0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %w0
@@ -512,7 +534,7 @@
 }
 
 ; CHECK-LABEL: name: test_constant_null
-; CHECK: [[NULL:%[0-9]+]](p0) = G_CONSTANT 0
+; CHECK: [[NULL:%[0-9]+]](p0) = G_CONSTANT i64 0
 ; CHECK: %x0 = COPY [[NULL]]
 define i8* @test_constant_null() {
   ret i8* null
@@ -554,6 +576,7 @@
 ; CHECK: [[LHS:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[RHS:%[0-9]+]](p0) = COPY %x1
 ; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x2
+; CHECK: bb.1:
 ; CHECK: [[TST:%[0-9]+]](s1) = G_ICMP intpred(eq), [[LHS]](p0), [[RHS]]
 ; CHECK: G_STORE [[TST]](s1), [[ADDR]](p0)
 define void @ptr_comparison(i8* %a, i8* %b, i1* %addr) {
@@ -565,6 +588,7 @@
 ; CHECK-LABEL: name: test_fadd
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %s0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %s1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_FADD [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %s0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %s0
@@ -576,6 +600,7 @@
 ; CHECK-LABEL: name: test_fsub
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %s0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %s1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_FSUB [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %s0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %s0
@@ -587,6 +612,7 @@
 ; CHECK-LABEL: name: test_fmul
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %s0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %s1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_FMUL [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %s0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %s0
@@ -598,6 +624,7 @@
 ; CHECK-LABEL: name: test_fdiv
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %s0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %s1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_FDIV [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %s0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %s0
@@ -609,6 +636,7 @@
 ; CHECK-LABEL: name: test_frem
 ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %s0
 ; CHECK-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %s1
+; CHECK: bb.1:
 ; CHECK-NEXT: [[RES:%[0-9]+]](s32) = G_FREM [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: %s0 = COPY [[RES]]
 ; CHECK-NEXT: RET_ReallyLR implicit %s0
@@ -635,7 +663,7 @@
 ; CHECK: [[LHS:%[0-9]+]](s32) = COPY %w0
 ; CHECK: [[RHS:%[0-9]+]](s32) = COPY %w1
 ; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x2
-; CHECK: [[ZERO:%[0-9]+]](s1) = G_CONSTANT 0
+; CHECK: [[ZERO:%[0-9]+]](s1) = G_CONSTANT i1 false
 ; CHECK: [[VAL:%[0-9]+]](s32), [[OVERFLOW:%[0-9]+]](s1) = G_UADDE [[LHS]], [[RHS]], [[ZERO]]
 ; CHECK: [[RES:%[0-9]+]](s64) = G_SEQUENCE [[VAL]](s32), 0, [[OVERFLOW]](s1), 32
 ; CHECK: G_STORE [[RES]](s64), [[ADDR]](p0)
@@ -664,7 +692,7 @@
 ; CHECK: [[LHS:%[0-9]+]](s32) = COPY %w0
 ; CHECK: [[RHS:%[0-9]+]](s32) = COPY %w1
 ; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x2
-; CHECK: [[ZERO:%[0-9]+]](s1) = G_CONSTANT 0
+; CHECK: [[ZERO:%[0-9]+]](s1) = G_CONSTANT i1 false
 ; CHECK: [[VAL:%[0-9]+]](s32), [[OVERFLOW:%[0-9]+]](s1) = G_USUBE [[LHS]], [[RHS]], [[ZERO]]
 ; CHECK: [[RES:%[0-9]+]](s64) = G_SEQUENCE [[VAL]](s32), 0, [[OVERFLOW]](s1), 32
 ; CHECK: G_STORE [[RES]](s64), [[ADDR]](p0)
@@ -761,6 +789,17 @@
   ret i32 %res
 }
 
+; CHECK-LABEL: name: test_select_ptr
+; CHECK: [[TST:%[0-9]+]](s1) = COPY %w0
+; CHECK: [[LHS:%[0-9]+]](p0) = COPY %x1
+; CHECK: [[RHS:%[0-9]+]](p0) = COPY %x2
+; CHECK: [[RES:%[0-9]+]](p0) = G_SELECT [[TST]](s1), [[LHS]], [[RHS]]
+; CHECK: %x0 = COPY [[RES]]
+define i8* @test_select_ptr(i1 %tst, i8* %lhs, i8* %rhs) {
+  %res = select i1 %tst, i8* %lhs, i8* %rhs
+  ret i8* %res
+}
+
 ; CHECK-LABEL: name: test_fptosi
 ; CHECK: [[FPADDR:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[FP:%[0-9]+]](s32) = G_LOAD [[FPADDR]](p0)
@@ -896,13 +935,39 @@
 ; CHECK-LABEL: name: test_objectsize
 ; CHECK: [[ADDR0:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[ADDR1:%[0-9]+]](p0) = COPY %x1
-; CHECK: {{%[0-9]+}}(s64) = G_CONSTANT -1
-; CHECK: {{%[0-9]+}}(s64) = G_CONSTANT 0
-; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT -1
-; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT 0
+; CHECK: {{%[0-9]+}}(s64) = G_CONSTANT i64 -1
+; CHECK: {{%[0-9]+}}(s64) = G_CONSTANT i64 0
+; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT i32 -1
+; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT i32 0
   %size64.0 = call i64 @llvm.objectsize.i64(i8* %addr0, i1 0)
   %size64.intmin = call i64 @llvm.objectsize.i64(i8* %addr0, i1 1)
   %size32.0 = call i32 @llvm.objectsize.i32(i8* %addr0, i1 0)
   %size32.intmin = call i32 @llvm.objectsize.i32(i8* %addr0, i1 1)
   ret void
 }
+
+define void @test_large_const(i128* %addr) {
+; CHECK-LABEL: name: test_large_const
+; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x0
+; CHECK: [[VAL:%[0-9]+]](s128) = G_CONSTANT i128 42
+; CHECK: G_STORE [[VAL]](s128), [[ADDR]](p0)
+  store i128 42, i128* %addr
+  ret void
+}
+
+; When there was no formal argument handling (so the first BB was empty), we used
+; to insert the constants at the end of the block, even if they were encountered
+; after the block's terminators had been emitted. Also make sure the order is
+; correct.
+define i8* @test_const_placement() {
+; CHECK-LABEL: name: test_const_placement
+; CHECK: bb.0:
+; CHECK:   [[VAL_INT:%[0-9]+]](s32) = G_CONSTANT i32 42
+; CHECK:   [[VAL:%[0-9]+]](p0) = G_INTTOPTR [[VAL_INT]](s32)
+; CHECK: bb.1:
+; CHECK:   G_BR
+  br label %next
+
+next:
+  ret i8* inttoptr(i32 42 to i8*)
+}
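A note on the pattern behind the new checks in this file: the added "; CHECK: bb.1:" lines sit between the formal-argument COPYs and the first translated instruction, and test_const_placement expects its G_CONSTANT/G_INTTOPTR pair in bb.0. Presumably the IRTranslator now lowers arguments (and materializes constants) in a dedicated entry block, with the translated IR body starting at bb.1. A minimal sketch of the expected shape, with a purely illustrative function name and register numbers (not one of the tests above):

; CHECK-LABEL: name: example
; CHECK: bb.0:
; CHECK:   [[ARG:%[0-9]+]](s32) = COPY %w0
; CHECK:   [[ONE:%[0-9]+]](s32) = G_CONSTANT i32 1
; CHECK: bb.1:
; CHECK:   {{%[0-9]+}}(s32) = G_ADD [[ARG]], [[ONE]]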
diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator-ios.ll b/test/CodeGen/AArch64/GlobalISel/call-translator-ios.ll
index 42cc4cc..4e6b9ca 100644
--- a/test/CodeGen/AArch64/GlobalISel/call-translator-ios.ll
+++ b/test/CodeGen/AArch64/GlobalISel/call-translator-ios.ll
@@ -18,14 +18,14 @@
 }
 
 ; CHECK-LABEL: name: test_call_stack
-; CHECK: [[C42:%[0-9]+]](s8) = G_CONSTANT 42
-; CHECK: [[C12:%[0-9]+]](s8) = G_CONSTANT 12
+; CHECK: [[C42:%[0-9]+]](s8) = G_CONSTANT i8 42
+; CHECK: [[C12:%[0-9]+]](s8) = G_CONSTANT i8 12
 ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
-; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT 0
+; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 0
 ; CHECK: [[C42_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C42_OFFS]](s64)
 ; CHECK: G_STORE [[C42]](s8), [[C42_LOC]](p0) :: (store 1 into stack, align 0)
 ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
-; CHECK: [[C12_OFFS:%[0-9]+]](s64) = G_CONSTANT 1
+; CHECK: [[C12_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 1
 ; CHECK: [[C12_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C12_OFFS]](s64)
 ; CHECK: G_STORE [[C12]](s8), [[C12_LOC]](p0) :: (store 1 into stack + 1, align 0)
 ; CHECK: BL @test_stack_slots
diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/test/CodeGen/AArch64/GlobalISel/call-translator.ll
index 3b4f2c2..4c6fff6 100644
--- a/test/CodeGen/AArch64/GlobalISel/call-translator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/call-translator.ll
@@ -41,7 +41,7 @@
 
 ; CHECK-LABEL: name: test_multiple_args
 ; CHECK: [[IN:%[0-9]+]](s64) = COPY %x0
-; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT 42
+; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT i32 42
 ; CHECK: %w0 = COPY [[ANSWER]]
 ; CHECK: %x1 = COPY [[IN]]
 ; CHECK: BL @multiple_args_callee, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %w0, implicit %x1
@@ -146,30 +146,48 @@
 ; CHECK: fixedStack:
 ; CHECK-DAG:  - { id: [[STACK0:[0-9]+]], offset: 0, size: 8
 ; CHECK-DAG:  - { id: [[STACK8:[0-9]+]], offset: 8, size: 8
+; CHECK-DAG:  - { id: [[STACK16:[0-9]+]], offset: 16, size: 8
 ; CHECK: [[LHS_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
 ; CHECK: [[LHS:%[0-9]+]](s64) = G_LOAD [[LHS_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
 ; CHECK: [[RHS_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
 ; CHECK: [[RHS:%[0-9]+]](s64) = G_LOAD [[RHS_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK8]], align 0)
+; CHECK: [[ADDR_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
+; CHECK: [[ADDR:%[0-9]+]](p0) = G_LOAD [[ADDR_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK16]], align 0)
 ; CHECK: [[SUM:%[0-9]+]](s64) = G_ADD [[LHS]], [[RHS]]
-; CHECK: %x0 = COPY [[SUM]](s64)
-define i64 @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs) {
+; CHECK: G_STORE [[SUM]](s64), [[ADDR]](p0)
+define void @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs, i64* %addr) {
   %sum = add i64 %lhs, %rhs
-  ret i64 %sum
+  store i64 %sum, i64* %addr
+  ret void
 }
 
 ; CHECK-LABEL: name: test_call_stack
-; CHECK: [[C42:%[0-9]+]](s64) = G_CONSTANT 42
-; CHECK: [[C12:%[0-9]+]](s64) = G_CONSTANT 12
+; CHECK: [[C42:%[0-9]+]](s64) = G_CONSTANT i64 42
+; CHECK: [[C12:%[0-9]+]](s64) = G_CONSTANT i64 12
+; CHECK: [[PTR:%[0-9]+]](p0) = G_CONSTANT i64 0
 ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
-; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT 0
+; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 0
 ; CHECK: [[C42_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C42_OFFS]](s64)
 ; CHECK: G_STORE [[C42]](s64), [[C42_LOC]](p0) :: (store 8 into stack, align 0)
 ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
-; CHECK: [[C12_OFFS:%[0-9]+]](s64) = G_CONSTANT 8
+; CHECK: [[C12_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 8
 ; CHECK: [[C12_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C12_OFFS]](s64)
 ; CHECK: G_STORE [[C12]](s64), [[C12_LOC]](p0) :: (store 8 into stack + 8, align 0)
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[PTR_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 16
+; CHECK: [[PTR_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[PTR_OFFS]](s64)
+; CHECK: G_STORE [[PTR]](p0), [[PTR_LOC]](p0) :: (store 8 into stack + 16, align 0)
 ; CHECK: BL @test_stack_slots
 define void @test_call_stack() {
-  call i64 @test_stack_slots([8 x i64] undef, i64 42, i64 12)
+  call void @test_stack_slots([8 x i64] undef, i64 42, i64 12, i64* null)
+  ret void
+}
+
+; CHECK-LABEL: name: test_mem_i1
+; CHECK: fixedStack:
+; CHECK-NEXT: - { id: [[SLOT:[0-9]+]], offset: 0, size: 1, alignment: 16, isImmutable: true, isAliased: false }
+; CHECK: [[ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[SLOT]]
+; CHECK: {{%[0-9]+}}(s1) = G_LOAD [[ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[SLOT]], align 0)
+define void @test_mem_i1([8 x i64], i1 %in) {
   ret void
 }
diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
index 47a085e..c177390 100644
--- a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
@@ -9,14 +9,15 @@
 ; CHECK: name: bar
 ; CHECK: body:
 ; CHECK:   bb.0:
-; CHECK:     successors: %bb.2{{.*}}%bb.1
+; CHECK:   bb.1:
+; CHECK:     successors: %[[GOOD:bb.[0-9]+]]{{.*}}%[[BAD:bb.[0-9]+]]
 ; CHECK:     EH_LABEL
 ; CHECK:     %w0 = COPY
 ; CHECK:     BL @foo, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %w0, implicit-def %w0
 ; CHECK:     {{%[0-9]+}}(s32) = COPY %w0
 ; CHECK:     EH_LABEL
 
-; CHECK:   bb.1
+; CHECK:   [[BAD]] (landing-pad):
 ; CHECK:     EH_LABEL
 ; CHECK:     [[PTR:%[0-9]+]](p0) = COPY %x0
 ; CHECK:     [[SEL:%[0-9]+]](p0) = COPY %x1
@@ -25,8 +26,8 @@
 ; CHECK:     %x0 = COPY [[PTR_RET]]
 ; CHECK:     %w1 = COPY [[SEL_RET]]
 
-; CHECK:   bb.2:
-; CHECK:     [[SEL:%[0-9]+]](s32) = G_CONSTANT 1
+; CHECK:   [[GOOD]]:
+; CHECK:     [[SEL:%[0-9]+]](s32) = G_CONSTANT i32 1
 ; CHECK:     {{%[0-9]+}}(s128) = G_INSERT {{%[0-9]+}}(s128), [[SEL]](s32), 64
 
 define { i8*, i32 } @bar() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
index c151ee6..252e60c 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
@@ -35,7 +35,7 @@
     ; CHECK-LABEL: name: test_scalar_add_big
     ; CHECK-NOT: G_EXTRACT
     ; CHECK-NOT: G_SEQUENCE
-    ; CHECK-DAG: [[CARRY0_32:%.*]](s32) = G_CONSTANT 0
+    ; CHECK-DAG: [[CARRY0_32:%.*]](s32) = G_CONSTANT i32 0
     ; CHECK-DAG: [[CARRY0:%[0-9]+]](s1) = G_TRUNC [[CARRY0_32]]
     ; CHECK: [[RES_LO:%.*]](s64), [[CARRY:%.*]](s1) = G_UADDE %0, %2, [[CARRY0]]
     ; CHECK: [[RES_HI:%.*]](s64), {{%.*}}(s1) = G_UADDE %1, %3, [[CARRY]]
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir b/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
index 69a1ceb..56a7d47 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
@@ -27,22 +27,22 @@
 body: |
   bb.0.entry:
     ; CHECK-LABEL: name: test_constant
-    ; CHECK: [[TMP:%[0-9]+]](s32) = G_CONSTANT 0
+    ; CHECK: [[TMP:%[0-9]+]](s32) = G_CONSTANT i32 0
     ; CHECK: %0(s1) = G_TRUNC [[TMP]]
-    ; CHECK: [[TMP:%[0-9]+]](s32) = G_CONSTANT 42
+    ; CHECK: [[TMP:%[0-9]+]](s32) = G_CONSTANT i32 42
     ; CHECK: %1(s8) = G_TRUNC [[TMP]]
-    ; CHECK: [[TMP:%[0-9]+]](s32) = G_CONSTANT 65535
+    ; CHECK: [[TMP:%[0-9]+]](s32) = G_CONSTANT i32 -1
     ; CHECK: %2(s16) = G_TRUNC [[TMP]]
-    ; CHECK: %3(s32) = G_CONSTANT -1
-    ; CHECK: %4(s64) = G_CONSTANT 1
-    ; CHECK: %5(s64) = G_CONSTANT 0
+    ; CHECK: %3(s32) = G_CONSTANT i32 -1
+    ; CHECK: %4(s64) = G_CONSTANT i64 1
+    ; CHECK: %5(s64) = G_CONSTANT i64 0
 
-    %0(s1) = G_CONSTANT 0
-    %1(s8) = G_CONSTANT 42
-    %2(s16) = G_CONSTANT 65535
-    %3(s32) = G_CONSTANT -1
-    %4(s64) = G_CONSTANT 1
-    %5(s64) = G_CONSTANT 0
+    %0(s1) = G_CONSTANT i1 0
+    %1(s8) = G_CONSTANT i8 42
+    %2(s16) = G_CONSTANT i16 65535
+    %3(s32) = G_CONSTANT i32 -1
+    %4(s64) = G_CONSTANT i64 1
+    %5(s64) = G_CONSTANT i64 0
 ...
 
 ---
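The common thread in these G_CONSTANT updates (here and in the irtranslator tests above) is that the immediate is now written as a typed ConstantInt operand instead of a bare integer. A minimal before/after MIR sketch, with illustrative virtual register numbers:

    ; old form: bare immediate
    %0(s32) = G_CONSTANT 42
    ; new form: the operand carries its IR type
    %0(s32) = G_CONSTANT i32 42
    %1(s1) = G_CONSTANT i1 false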
diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
index 2334b5e..14dbc7c 100644
--- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
+++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
@@ -5,7 +5,7 @@
 define %type* @first_offset_const(%type* %addr) {
 ; CHECK-LABEL: name: first_offset_const
 ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0
-; CHECK: [[OFFSET:%[0-9]+]](s64) = G_CONSTANT 32
+; CHECK: [[OFFSET:%[0-9]+]](s64) = G_CONSTANT i64 32
 ; CHECK: [[RES:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET]](s64)
 ; CHECK: %x0 = COPY [[RES]](p0)
 
@@ -27,7 +27,7 @@
 ; CHECK-LABEL: name: first_offset_variable
 ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[IDX:%[0-9]+]](s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT 32
+; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 32
 ; CHECK: [[OFFSET:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX]]
 ; CHECK: [[STEP0:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET]](s64)
 ; CHECK: [[RES:%[0-9]+]](p0) = COPY [[STEP0]](p0)
@@ -41,7 +41,7 @@
 ; CHECK-LABEL: name: first_offset_ext
 ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[IDX32:%[0-9]+]](s32) = COPY %w1
-; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT 32
+; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 32
 ; CHECK: [[IDX64:%[0-9]+]](s64) = G_SEXT [[IDX32]](s32)
 ; CHECK: [[OFFSET:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX64]]
 ; CHECK: [[STEP0:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET]](s64)
@@ -57,9 +57,9 @@
 ; CHECK-LABEL: name: const_then_var
 ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[IDX:%[0-9]+]](s64) = COPY %x1
-; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT 272
+; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 272
 ; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT 4
+; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 4
 ; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX]]
 ; CHECK: [[BASE2:%[0-9]+]](p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
 ; CHECK: [[RES:%[0-9]+]](p0) = COPY [[BASE2]](p0)
@@ -73,10 +73,10 @@
 ; CHECK-LABEL: name: var_then_const
 ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[IDX:%[0-9]+]](s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT 64
+; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 64
 ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX]]
 ; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_CONSTANT 40
+; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_CONSTANT i64 40
 ; CHECK: [[BASE2:%[0-9]+]](p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
 ; CHECK: %x0 = COPY [[BASE2]](p0)
 
diff --git a/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
index 3948c04..bda025a 100644
--- a/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
+++ b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
@@ -1,4 +1,3 @@
-# RUN: rm -f %S/arm64-regress-opt-cmp.s
 # RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s 2>&1 | FileCheck %s
 # CHECK: %1 = ANDWri {{.*}}
 # CHECK-NEXT: %wzr = SUBSWri {{.*}}
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index 3c7a2c1..a41d307 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -4,7 +4,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
 define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -67,7 +67,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
 define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -86,7 +86,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
+; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -106,7 +106,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
 define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -126,7 +126,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
index 5444b74..13e4192 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -533,6 +533,27 @@
   ret i16 %r
 }
 
+; GCN-LABEL: @constant_add_i16(
+; VI: ret i16 3
+define i16 @constant_add_i16() {
+  %r = add i16 1, 2
+  ret i16 %r
+}
+
+; GCN-LABEL: @constant_add_nsw_i16(
+; VI: ret i16 3
+define i16 @constant_add_nsw_i16() {
+  %r = add nsw i16 1, 2
+  ret i16 %r
+}
+
+; GCN-LABEL: @constant_add_nuw_i16(
+; VI: ret i16 3
+define i16 @constant_add_nuw_i16() {
+  %r = add nuw i16 1, 2
+  ret i16 %r
+}
+
 ; GCN-LABEL: @add_nsw_i16(
 ; SI: %r = add nsw i16 %a, %b
 ; SI-NEXT: ret i16 %r
@@ -806,6 +827,13 @@
   ret i16 %r
 }
 
+; GCN-LABEL: @constant_lshr_exact_i16(
+; VI: ret i16 2
+define i16 @constant_lshr_exact_i16(i16 %a, i16 %b) {
+  %r = lshr exact i16 4, 1
+  ret i16 %r
+}
+
 ; GCN-LABEL: @and_i16(
 ; SI: %r = and i16 %a, %b
 ; SI-NEXT: ret i16 %r
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 8de539a..1511e13 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -252,8 +252,8 @@
 ; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding:
 ; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding:
 
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: [0x01,0x00,0x60,0xe0
 define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
@@ -274,7 +274,9 @@
 ; R600-CHECK: MOV
 ; R600-CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
-; SI: v_mov_b32_e32 v3
+;
+; A total of 5 bytes should be allocated and used.
+; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
 define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index 83313ed..24874ee 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -8,13 +8,15 @@
 ; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
 
+
 ; GCN: ; BB#1
 ; GCNNOOPT: v_readlane_b32
 ; GCNNOOPT: v_readlane_b32
 ; GCN: buffer_store_dword
-; GCN: s_endpgm
+; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; TODO: This waitcnt can be eliminated
 
-; GCN: {{^}}[[END]]
+; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
 define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
   %cmp = icmp ne i32 %val, 0
@@ -35,9 +37,10 @@
 ; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
 
 ; GCN: buffer_store_dword
-; GCN: s_endpgm
+; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; TODO: This waitcnt can be eliminated
 
-; GCN: {{^}}[[END]]
+; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
 define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
   %cmp0 = icmp ne i1 %val, 0
diff --git a/test/CodeGen/AMDGPU/br_cc.f16.ll b/test/CodeGen/AMDGPU/br_cc.f16.ll
index 6cf3fda..b758471 100644
--- a/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -12,9 +12,10 @@
 ; GCN: s_cbranch_vccnz
 
 ; GCN: one{{$}}
-; SI:  v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
-; GCN: buffer_store_short v[[A_F16]]
-; GCN: s_endpgm
+; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
+; SI: s_branch
+; VI: buffer_store_short
+; VI: s_endpgm
 
 ; GCN: two{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
@@ -46,18 +47,21 @@
 
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
-; VI:  v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: s_cbranch_vccnz
+; SI: s_cbranch_vccz
 
-; GCN: one{{$}}
-; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
-; GCN: buffer_store_short v[[A_F16]]
-; GCN: s_endpgm
+; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
+; VI: s_cbranch_vccnz
+
+; VI: one{{$}}
+; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}}
 
 ; GCN: two{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
-; GCN: buffer_store_short v[[B_F16]]
-; GCN: s_endpgm
+
+; SI: one{{$}}
+; SI: buffer_store_short v[[A_F16]]
+; SI: s_endpgm
+
 define void @br_cc_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
@@ -82,13 +86,11 @@
 
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI:  v_cmp_nge_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
+; VI:  v_cmp_ngt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
 ; GCN: s_cbranch_vccnz
 
 ; GCN: one{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
-; GCN: buffer_store_short v[[A_F16]]
-; GCN: s_endpgm
 
 ; GCN: two{{$}}
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 92debd8..3950540 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -475,14 +475,13 @@
 
 ; GCN-LABEL: {{^}}long_branch_hang:
 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc1 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: s_branch  [[SHORTB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
 
-; GCN-NEXT: [[LONG_BR_0]]:
 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
 ; GCN: s_setpc_b64
 
-; GCN: [[SHORTB]]:
+; GCN-NEXT: [[LONG_BR_0]]:
 ; GCN-DAG: v_cmp_lt_i32
 ; GCN-DAG: v_cmp_gt_i32
 ; GCN: s_cbranch_vccnz
@@ -492,7 +491,6 @@
 
 ; GCN: [[LONG_BR_DEST0]]
 ; GCN: v_cmp_ne_u32_e32
-; GCN-NEXT: ; implicit-def
 ; GCN-NEXT: s_cbranch_vccz
 ; GCN: s_setpc_b64
 
diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll
index 59b9425..49af1595 100644
--- a/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -28,8 +28,8 @@
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 
 ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
 
@@ -51,9 +51,9 @@
 ; Same frame index is used multiple times in the store
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[K]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN: buffer_store_dword [[ZERO]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 define void @stored_fi_to_self() #0 {
   %tmp = alloca i32*
 
@@ -65,15 +65,14 @@
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_self_offset:
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
+; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
 
 ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
 define void @stored_fi_to_self_offset() #0 {
   %tmp0 = alloca [512 x i32]
   %tmp1 = alloca i32*
@@ -90,16 +89,15 @@
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_fi:
-; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
 
 ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[FI1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
 
 ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
-; GCN: buffer_store_dword [[FI2]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 define void @stored_fi_to_fi() #0 {
   %tmp0 = alloca i32*
   %tmp1 = alloca i32*
@@ -129,9 +127,9 @@
 
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
 
 ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
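The scratch-access updates in this file (and in amdgpu.private-memory.ll and insert_vector_elt.ll below) all have the same shape: frame-index stores and loads no longer materialize a zero base address in a VGPR and use the offen addressing form; they are now expected to use the off form with an immediate offset. Roughly, with illustrative register numbers and offset:

    ; previously expected
    v_mov_b32_e32 v1, 0
    buffer_store_dword v0, v1, s[8:11], s3 offen offset:4
    ; now expected
    buffer_store_dword v0, off, s[8:11], s3 offset:4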
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 65464cd..a96035f 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
@@ -11,18 +13,20 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; SI: v_div_scale_f32
-; SI-DAG: v_div_scale_f32
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
 
-; SI-DAG: v_rcp_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_mul_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_div_fmas_f32
-; SI: v_div_fixup_f32
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
 define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b
@@ -30,12 +34,37 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}fdiv_f32_denormals:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; GCN-NOT: s_setreg
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN-NOT: s_setreg
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+entry:
+  %fdiv = fdiv float %a, %b
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
-; SI: v_cndmask_b32
-; SI: v_mul_f32
-; SI: v_rcp_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
+; GCN: v_cndmask_b32
+; GCN: v_mul_f32
+; GCN: v_rcp_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
 define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
@@ -45,9 +74,9 @@
 
 ; Use correct fdiv
 ; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
-; SI: v_fma_f32
-; SI: v_div_fmas_f32
-; SI: v_div_fixup_f32
+; GCN: v_fma_f32
+; GCN: v_div_fmas_f32
+; GCN: v_div_fixup_f32
 define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
@@ -56,10 +85,10 @@
 }
 
 ; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv fast float %a, %b
@@ -71,10 +100,10 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv fast float %a, %b
@@ -86,10 +115,10 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
-; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv arcp float %a, %b
@@ -103,10 +132,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
-; SI: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
+; GCN: v_div_scale_f32
 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv <2 x float> %a, %b
@@ -115,8 +144,8 @@
 }
 
 ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
-; SI: v_cmp_gt_f32
-; SI: v_cmp_gt_f32
+; GCN: v_cmp_gt_f32
+; GCN: v_cmp_gt_f32
 define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
@@ -130,8 +159,8 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv fast <2 x float> %a, %b
@@ -145,8 +174,8 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b
@@ -164,10 +193,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
-; SI: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
+; GCN: v_div_fixup_f32
 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -187,10 +216,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -210,10 +239,10 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
-; SI: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
 define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
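The rewritten fdiv checks spell out the whole f32 division expansion and, for functions that do not already enable f32 denormals, expect the compiler to bracket it with s_setreg writes to the MODE register (the *_denormals tests, attribute #2, check GCN-NOT: s_setreg instead). The bracketing pair, taken from the checks above (the field written is presumably the f32 denormal-mode bits):

    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3   ; enable denormals around the expansion
    ; ... v_fma_f32 / v_mul_f32_e32 expansion ...
    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0   ; restore the default mode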
diff --git a/test/CodeGen/AMDGPU/global_smrd.ll b/test/CodeGen/AMDGPU/global_smrd.ll
new file mode 100644
index 0000000..2089089
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global_smrd.ll
@@ -0,0 +1,126 @@
+; RUN: llc -O2 -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
+
+; uniform loads
+; CHECK-LABEL: @uniform_load
+; CHECK: s_load_dwordx4
+; CHECK-NOT: flat_load_dword
+
+define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)*  %arg1) {
+bb:
+  %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
+  %tmp3 = fadd float %tmp2, 0.000000e+00
+  %tmp4 = getelementptr inbounds float, float addrspace(1)* %arg, i64 1
+  %tmp5 = load float, float addrspace(1)* %tmp4, align 4, !tbaa !8
+  %tmp6 = fadd float %tmp3, %tmp5
+  %tmp7 = getelementptr inbounds float, float addrspace(1)* %arg, i64 2
+  %tmp8 = load float, float addrspace(1)* %tmp7, align 4, !tbaa !8
+  %tmp9 = fadd float %tmp6, %tmp8
+  %tmp10 = getelementptr inbounds float, float addrspace(1)* %arg, i64 3
+  %tmp11 = load float, float addrspace(1)* %tmp10, align 4, !tbaa !8
+  %tmp12 = fadd float %tmp9, %tmp11
+  %tmp13 = getelementptr inbounds float, float addrspace(1)* %arg1
+  store float %tmp12, float addrspace(1)* %tmp13, align 4, !tbaa !8
+  ret void
+}
+
+; non-uniform loads
+; CHECK-LABEL: @non-uniform_load
+; CHECK: flat_load_dword
+; CHECK-NOT: s_load_dwordx4
+
+define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 {
+bb:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
+  %tmp3 = load float, float addrspace(1)* %tmp2, align 4, !tbaa !8
+  %tmp4 = fadd float %tmp3, 0.000000e+00
+  %tmp5 = add i32 %tmp, 1
+  %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp5
+  %tmp7 = load float, float addrspace(1)* %tmp6, align 4, !tbaa !8
+  %tmp8 = fadd float %tmp4, %tmp7
+  %tmp9 = add i32 %tmp, 2
+  %tmp10 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp9
+  %tmp11 = load float, float addrspace(1)* %tmp10, align 4, !tbaa !8
+  %tmp12 = fadd float %tmp8, %tmp11
+  %tmp13 = add i32 %tmp, 3
+  %tmp14 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp13
+  %tmp15 = load float, float addrspace(1)* %tmp14, align 4, !tbaa !8
+  %tmp16 = fadd float %tmp12, %tmp15
+  %tmp17 = getelementptr inbounds float, float addrspace(1)* %arg1, i32 %tmp
+  store float %tmp16, float addrspace(1)* %tmp17, align 4, !tbaa !8
+  ret void
+}
+
+
+; uniform load dominated by no-alias store - scalarize
+; CHECK-LABEL: @no_memdep_alias_arg
+; CHECK: flat_store_dword
+; CHECK: s_load_dword [[SVAL:s[0-9]+]]
+; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+
+define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  store i32 0, i32 addrspace(1)* %out0
+  %val = load i32, i32 addrspace(1)* %in
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+; uniform load dominated by alias store - vector
+; CHECK-LABEL: {{^}}memdep:
+; CHECK: flat_store_dword
+; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  store i32 0, i32 addrspace(1)* %out0
+  %val = load i32, i32 addrspace(1)* %in
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+; uniform load from global array
+; CHECK-LABEL:  @global_array
+; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]]
+; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
+; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
+; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+
+@A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
+
+define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
+entry:
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
+  %1 = load i32, i32 addrspace(1)* %0, align 4
+  store i32 %1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; uniform load from global array dominated by alias store
+; CHECK-LABEL:  @global_array_alias_store
+; CHECK: flat_store_dword
+; CHECK: v_mov_b32_e32 v[[ADDR_LO:[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[ADDR_HI:[0-9]+]], s{{[0-9]+}}
+; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
+  store i32 12, i32 addrspace(1) * %gep
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
+  %1 = load i32, i32 addrspace(1)* %0, align 4
+  store i32 %1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #1 = { nounwind readnone }
+
+!8 = !{!9, !9, i64 0}
+!9 = !{!"float", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/test/CodeGen/AMDGPU/global_smrd_cfg.ll
new file mode 100644
index 0000000..a6a0415
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global_smrd_cfg.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: %bb11
+
+; Load from %arg in the loop body has an aliasing store
+
+; CHECK: flat_load_dword
+
+; CHECK-LABEL: %bb20
+; CHECK: flat_store_dword
+
+; #####################################################################
+
+; CHECK-LABEL: %bb22
+
+; Load from %arg has an aliasing store in the loop
+
+; CHECK: flat_load_dword
+
+; #####################################################################
+
+; Load from %arg1 has a no-alias store in the loop - arg1[i+1] never aliases arg1[i]
+
+; CHECK: s_load_dword
+
+define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4, !tbaa !0
+  %tmp5 = icmp sgt i32 %tmp4, 0
+  br i1 %tmp5, label %bb6, label %bb8
+
+bb6:                                              ; preds = %bb
+  br label %bb11
+
+bb7:                                              ; preds = %bb22
+  br label %bb8
+
+bb8:                                              ; preds = %bb7, %bb
+  %tmp9 = phi i32 [ 0, %bb ], [ %tmp30, %bb7 ]
+  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp
+  store i32 %tmp9, i32 addrspace(1)* %tmp10, align 4, !tbaa !0
+  ret void
+
+bb11:                                             ; preds = %bb22, %bb6
+  %tmp12 = phi i32 [ %tmp30, %bb22 ], [ 0, %bb6 ]
+  %tmp13 = phi i32 [ %tmp25, %bb22 ], [ 0, %bb6 ]
+  %tmp14 = srem i32 %tmp13, %arg2
+  %tmp15 = sext i32 %tmp14 to i64
+  %tmp16 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp15
+  %tmp17 = load i32, i32 addrspace(1)* %tmp16, align 4, !tbaa !0
+  %tmp18 = icmp sgt i32 %tmp17, 100
+  %tmp19 = sext i32 %tmp13 to i64
+  br i1 %tmp18, label %bb20, label %bb22
+
+bb20:                                             ; preds = %bb11
+  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp19
+  store i32 0, i32 addrspace(1)* %tmp21, align 4, !tbaa !0
+  br label %bb22
+
+bb22:                                             ; preds = %bb20, %bb11
+  %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp19
+  %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4, !tbaa !0
+  %tmp25 = add nuw nsw i32 %tmp13, 1
+  %tmp26 = sext i32 %tmp25 to i64
+  %tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp26
+  %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4, !tbaa !0
+  %tmp29 = add i32 %tmp24, %tmp12
+  %tmp30 = add i32 %tmp29, %tmp28
+  %tmp31 = icmp eq i32 %tmp25, %tmp4
+  br i1 %tmp31, label %bb7, label %bb11
+}
+
+attributes #0 = { "target-cpu"="fiji" }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"int", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 3a93330..528e12b 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -506,11 +506,13 @@
 bb1:
   %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
   %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out
   br label %bb7
 
 bb4:
   %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
   %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out
   br label %bb7
 
 bb7:
@@ -554,11 +556,13 @@
 bb1:                                              ; preds = %bb
   %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
   %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
   br label %bb7
 
 bb4:                                              ; preds = %bb
   %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
   %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
   br label %bb7
 
 bb7:                                              ; preds = %bb4, %bb1
@@ -745,6 +749,8 @@
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 8ff09ff..7351665 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -207,12 +207,10 @@
 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
 
-; GCN: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}}
-
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, [[BASE_FI]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 ; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
 ; GCN: s_waitcnt
@@ -234,7 +232,7 @@
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 
 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
@@ -253,9 +251,9 @@
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 
 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
@@ -280,7 +278,7 @@
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:3
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 
 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
@@ -394,15 +392,15 @@
 
 ; Stack store
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
 
 ; Write element
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 
 ; Stack reload
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
 
 ; Store result
 ; GCN: buffer_store_dwordx4
@@ -419,21 +417,17 @@
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
 ; GCN-DAG: SCRATCH_RSRC_DWORD
 
-; FIXME: Should be able to eliminate this?
-
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
-
-; GCN-DAG: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
 
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
 
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index dcb2ddb..b1f20fd 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -5,6 +5,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 
 ; FUNC-LABEL: {{^}}i8_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
@@ -24,6 +25,7 @@
 }
 
 ; FUNC-LABEL: {{^}}i8_zext_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -42,6 +44,7 @@
 }
 
 ; FUNC-LABEL: {{^}}i8_sext_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -60,6 +63,7 @@
 }
 
 ; FUNC-LABEL: {{^}}i16_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
@@ -79,6 +83,7 @@
 }
 
 ; FUNC-LABEL: {{^}}i16_zext_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -97,6 +102,7 @@
 }
 
 ; FUNC-LABEL: {{^}}i16_sext_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -115,6 +121,7 @@
 }
 
 ; FUNC-LABEL: {{^}}i32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -126,6 +133,7 @@
 }
 
 ; FUNC-LABEL: {{^}}f32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -137,6 +145,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v2i8_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; MESA-GCN: buffer_load_ubyte
@@ -150,6 +159,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v2i16_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; MESA-GCN: buffer_load_ushort
@@ -163,6 +173,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v2i32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
@@ -175,6 +186,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v2f32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
@@ -187,6 +199,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v3i8_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
@@ -203,6 +216,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v3i16_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
@@ -218,6 +232,7 @@
   ret void
 }
 ; FUNC-LABEL: {{^}}v3i32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
@@ -231,6 +246,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v3f32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
@@ -244,6 +260,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v4i8_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
@@ -263,6 +280,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v4i16_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
@@ -282,6 +300,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v4i32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
@@ -296,6 +315,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v4f32_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
@@ -310,6 +330,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v8i8_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
@@ -340,6 +361,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v8i16_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
@@ -371,6 +393,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v8i32_arg:
+; HSA-VI: kernarg_segment_alignment = 5
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -389,6 +412,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v8f32_arg:
+; HSA-VI: kernarg_segment_alignment = 5
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -405,6 +429,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v16i8_arg:
+; HSA-VI: kernarg_segment_alignment = 4
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
@@ -460,6 +485,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v16i16_arg:
+; HSA-VI: kernarg_segment_alignment = 5
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
@@ -515,6 +541,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v16i32_arg:
+; HSA-VI: kernarg_segment_alignment = 6
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -541,6 +568,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v16f32_arg:
+; HSA-VI: kernarg_segment_alignment = 6
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
diff --git a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
index 8dbec18..078d633 100644
--- a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -1,8 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; FIXME: Enabling critical edge splitting will fix this.
-; XFAIL: *
-
 ; Make sure that m0 is not reinitialized in the loop.
 
 ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
@@ -12,7 +9,9 @@
 ; GCN: s_mov_b32 m0, -1
 
 ; GCN: BB0_2:
+; GCN-NOT: m0
 ; GCN: ds_read_b32
+; GCN-NOT: m0
 ; GCN: buffer_store_dword
 
 ; GCN: s_cbranch_scc0 BB0_2
diff --git a/test/CodeGen/AMDGPU/llvm.SI.export.ll b/test/CodeGen/AMDGPU/llvm.SI.export.ll
new file mode 100644
index 0000000..23a32dc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.export.ll
@@ -0,0 +1,237 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #0
+
+; GCN-LABEL: {{^}}test_export_zeroes:
+; GCN: exp mrt0 off, off, off, off{{$}}
+; GCN: exp mrt0 off, off, off, off done{{$}}
+define void @test_export_zeroes() #0 {
+
+  call void @llvm.SI.export(i32 0, i32 0, i32 0, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
+  call void @llvm.SI.export(i32 0, i32 0, i32 1, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+; FIXME: Should not set up registers for the unused source registers.
+
+; GCN-LABEL: {{^}}test_export_en_src0:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
+define void @test_export_en_src0() #0 {
+  call void @llvm.SI.export(i32 1, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src1:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
+define void @test_export_en_src1() #0 {
+  call void @llvm.SI.export(i32 2, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src2:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
+define void @test_export_en_src2() #0 {
+  call void @llvm.SI.export(i32 4, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src3:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
+define void @test_export_en_src3() #0 {
+  call void @llvm.SI.export(i32 8, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src1:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
+define void @test_export_en_src0_src1() #0 {
+  call void @llvm.SI.export(i32 3, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src2:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
+define void @test_export_en_src0_src2() #0 {
+  call void @llvm.SI.export(i32 5, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src3:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
+; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
+define void @test_export_en_src0_src3() #0 {
+  call void @llvm.SI.export(i32 9, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 9, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src1_src2_src3:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_en_src0_src1_src2_src3() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_mrt7:
+; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
+define void @test_export_mrt7() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_z:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_z() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_null:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_null() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_reserved10:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_reserved10() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_reserved11:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_reserved11() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_pos0:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_pos0() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_pos3:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_pos3() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_param0:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_param0() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_param31:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define void @test_export_param31() #0 {
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_vm:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
+define void @test_export_vm() #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
+  ret void
+}
+
+attributes #0 = { nounwind "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
index 74f70e5..9613a50 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -14,7 +14,9 @@
   %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %3)
   %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %3)
   %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %3)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %p1_1)
+  %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %3)
+  %w = fadd float %p1_1, %const
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w)
   ret void
 }
 
@@ -24,6 +26,8 @@
 ; Function Attrs: nounwind readnone
 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
 
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
+
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
index 92c592b..d49fa2b 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -7,16 +7,11 @@
 ;
 ; CHECK-LABEL: {{^}}main:
 
-; CHECK: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}}
-
-; FIXME: add 0?
-; CHECK-DAG: v_add_i32_e32 [[ADD_K0:v[0-9]+]], vcc, 0x140, [[BASE_FI]]
-
 ; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
-; CHECK-DAG: buffer_store_dword {{v[0-9]+}}, [[ADD_K0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-
 ; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
-; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, [[BASE_FI]], [[BYTES]]
+
+; TODO: add 0?
+; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
 
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-offset.ll b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
new file mode 100644
index 0000000..256f6e8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
@@ -0,0 +1,35 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+
+; Allocate two stack slots of 2052 bytes each, requiring a total of 4104 bytes.
+; Extracting the last element of each requires an offset that does not fit into
+; the offset field of MUBUF instructions, so a new base register is needed.
+; Previously this register was not created, leading to an assertion failure.
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_store_dword
+; CHECK: buffer_store_dword
+; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword
+define amdgpu_gs float @main(float %v1, float %v2, i32 %idx1, i32 %idx2) {
+main_body:
+  %m1 = alloca [513 x float]
+  %m2 = alloca [513 x float]
+
+  %gep1.store = getelementptr [513 x float], [513 x float]* %m1, i32 0, i32 %idx1
+  store float %v1, float* %gep1.store
+
+  %gep2.store = getelementptr [513 x float], [513 x float]* %m2, i32 0, i32 %idx2
+  store float %v2, float* %gep2.store
+
+; This used to use a base reg equal to 0.
+  %gep1.load = getelementptr [513 x float], [513 x float]* %m1, i32 0, i32 0
+  %out1 = load float, float* %gep1.load
+
+; This used to attempt to re-use the base reg at 0, generating an out-of-bounds instruction offset.
+  %gep2.load = getelementptr [513 x float], [513 x float]* %m2, i32 0, i32 512
+  %out2 = load float, float* %gep2.load
+
+  %r = fadd float %out1, %out2
+  ret float %r
+}
diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll
index acf295d..de9a8f7 100644
--- a/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/test/CodeGen/AMDGPU/private-element-size.ll
@@ -10,27 +10,27 @@
 ; HSA-ELT4: private_element_size = 1
 
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
 
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
@@ -59,29 +59,29 @@
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
 
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 
 
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56
 
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
@@ -89,14 +89,14 @@
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:36{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:44{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:52{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:48{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}}
 
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
@@ -130,16 +130,16 @@
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
 
 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
@@ -166,16 +166,16 @@
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
 
 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
@@ -202,27 +202,27 @@
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
 
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll
index 0408413..515203f 100644
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}vgpr:
 ; GCN: v_mov_b32_e32 v1, v0
 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
 define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
@@ -19,7 +19,8 @@
 
 ; GCN-LABEL: {{^}}vgpr_literal:
 ; GCN: v_mov_b32_e32 v4, v0
-; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
+; GCN: exp mrt0 v4, v4, v4, v4 done compr vm
+
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
@@ -43,7 +44,6 @@
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v6
 ; GCN-NOT: s_endpgm
-attributes #0 = { "InitialPSInputAddr"="0" }
 define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
   %i0 = extractelement <2 x i32> %4, i32 0
   %i1 = extractelement <2 x i32> %4, i32 1
@@ -209,7 +209,7 @@
 
 ; GCN-LABEL: {{^}}both:
 ; GCN: v_mov_b32_e32 v1, v0
-; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
 ; GCN-DAG: s_add_i32 s0, s3, 2
 ; GCN-DAG: s_mov_b32 s1, s2
@@ -231,7 +231,8 @@
 
 ; GCN-LABEL: {{^}}structure_literal:
 ; GCN: v_mov_b32_e32 v3, v0
-; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
+; GCN: exp mrt0 v3, v3, v3, v3 done compr vm
+
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: s_mov_b32 s0, 2
 ; GCN-DAG: s_mov_b32 s1, 3
@@ -242,3 +243,5 @@
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
   ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
 }
+
+attributes #0 = { nounwind "InitialPSInputAddr"="0" }
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index ff01306..37083fb 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -478,5 +478,30 @@
   br label %bb1
 }
 
+; GCN-LABEL: {{^}}phi_imm_in_sgprs
+; GCN: s_movk_i32 [[A:s[0-9]+]], 0x400
+; GCN: s_movk_i32 [[B:s[0-9]+]], 0x400
+; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]:
+; GCN: s_xor_b32 [[B]], [[B]], [[A]]
+; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
+define void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i.add, %loop]
+  %offset = phi i32 [1024, %entry], [%offset.xor, %loop]
+  %offset.xor = xor i32 %offset, 1024
+  %offset.i = add i32 %offset.xor, %i
+  %ptr = getelementptr i32, i32 addrspace(3)* %out, i32 %offset.i
+  store i32 0, i32 addrspace(3)* %ptr
+  %i.add = add i32 %i, 1
+  %cmp = icmp ult i32 %i.add, %cond
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index bb3f949..d5d2f6b 100644
--- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ;
 ;
 ; Most SALU instructions ignore control flow, so we need to make sure
@@ -9,7 +9,9 @@
 ; about instructions in different blocks overwriting each other.
 ; SI-LABEL: {{^}}sgpr_if_else_salu_br:
 ; SI: s_add
-; SI: s_add
+; SI: s_branch
+
+; SI: s_sub
 
 define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
@@ -17,6 +19,45 @@
   br i1 %0, label %if, label %else
 
 if:
+  %1 = sub i32 %b, %c
+  br label %endif
+
+else:
+  %2 = add i32 %d, %e
+  br label %endif
+
+endif:
+  %3 = phi i32 [%1, %if], [%2, %else]
+  %4 = add i32 %3, %a
+  store i32 %4, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}sgpr_if_else_salu_br_opt:
+; SI: s_cmp_lg_u32
+; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]]
+
+; SI: ; BB#1: ; %else
+; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xe
+; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xf
+; SI-NOT: add
+; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; SI: [[IF]]: ; %if
+; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-NOT: add
+
+; SI: [[ENDIF]]: ; %endif
+; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
+; SI: buffer_store_dword
+; SI-NEXT: s_endpgm
+define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+  %0 = icmp eq i32 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
   %1 = add i32 %b, %c
   br label %endif
 
@@ -67,7 +108,7 @@
 ; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
 
-; SI: BB2_2:
+; SI: BB{{[0-9]+}}_2:
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
 ; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index da270c5..e65f1e2 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; This test checks that no VGPR to SGPR copies are created by the register
 ; allocator.
@@ -223,8 +223,15 @@
 ; an assertion failure.
 
 ; CHECK-LABEL: {{^}}sample_v3:
-; CHECK: image_sample
-; CHECK: image_sample
+; CHECK: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 11
+; CHECK: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 13
+; CHECK: s_branch
+
+; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 5
+; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 7
+
+; CHECK: BB{{[0-9]+_[0-9]+}}:
+; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}}
 ; CHECK: exp
 ; CHECK: s_endpgm
 define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
@@ -241,14 +248,14 @@
   br i1 %tmp27, label %if, label %else
 
 if:                                               ; preds = %entry
-  %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> zeroinitializer, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 11, i32 13>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %val.if.0 = extractelement <4 x float> %val.if, i32 0
   %val.if.1 = extractelement <4 x float> %val.if, i32 1
   %val.if.2 = extractelement <4 x float> %val.if, i32 2
   br label %endif
 
 else:                                             ; preds = %entry
-  %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1, i32 0>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 5, i32 7>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %val.else.0 = extractelement <4 x float> %val.else, i32 0
   %val.else.1 = extractelement <4 x float> %val.else, i32 1
   %val.else.2 = extractelement <4 x float> %val.else, i32 2
@@ -317,9 +324,15 @@
 
 ; This test checks that image_sample resource descriptors aren't loaded into
 ; vgprs.  The verifier will fail if this happens.
-; CHECK-LABEL:{{^}}sample_rsrc:
-; CHECK: image_sample
-; CHECK: image_sample
+; CHECK-LABEL:{{^}}sample_rsrc
+
+; CHECK: s_cmp_eq_u32
+; CHECK: s_cbranch_scc0 [[END:BB[0-9]+_[0-9]+]]
+
+; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}}
+
+; [[END]]:
+; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}}
 ; CHECK: s_endpgm
 define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
index ee344b2..33f5e98 100644
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -106,7 +106,7 @@
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: ; BB#2:
-; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 
 ; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
@@ -158,7 +158,7 @@
 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
 
 ; CHECK-NEXT: ; BB#2:
-; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 
 ; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
@@ -308,10 +308,8 @@
 ; CHECK: s_mov_b64 exec, 0
 
 ; CHECK: [[SKIPKILL]]:
-; CHECK: v_cmp_nge_f32
-; CHECK-NEXT: s_cbranch_vccz [[UNREACHABLE:BB[0-9]+_[0-9]+]]
-
-; CHECK: [[UNREACHABLE]]:
+; CHECK: v_cmp_nge_f32_e32 vcc
+; CHECK-NEXT: BB#3: ; %bb5
 ; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
 define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
 bb:
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
new file mode 100644
index 0000000..065bac3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -0,0 +1,169 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI-NEXT: buffer_store_short [[ADD]]
+define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
+  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %add = sub i16 %a, %b
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16_constant:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]]
+; VI-NEXT: buffer_store_short [[ADD]]
+define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %add = sub i16 %a, 123
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16_neg_constant:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]]
+; VI-NEXT: buffer_store_short [[ADD]]
+define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %add = sub i16 %a, -845
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16_inline_63:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffc1, [[A]]
+; VI-NEXT: buffer_store_short [[ADD]]
+define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %add = sub i16 %a, 63
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI-NEXT: buffer_store_dword [[ADD]]
+define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
+  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %add = sub i16 %a, %b
+  %ext = zext i16 %add to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
+  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %add = sub i16 %a, %b
+  %ext = zext i16 %add to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
+; VI-NEXT: buffer_store_dword [[SEXT]]
+define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep.in0
+  %b = load i16, i16 addrspace(1)* %gep.in1
+  %add = sub i16 %a, %b
+  %ext = sext i16 %add to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
+; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep.in0
+  %b = load i16, i16 addrspace(1)* %gep.in1
+  %add = sub i16 %a, %b
+  %ext = sext i16 %add to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+@lds = addrspace(3) global [512 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
+; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
+define void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+  %size = call i32 @llvm.amdgcn.groupstaticsize()
+  %size.trunc = trunc i32 %size to i16
+  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %add = sub i16 %a, %size.trunc
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.groupstaticsize() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
index a5d1cd2..a0060bd 100644
--- a/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -197,15 +197,15 @@
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
 
-; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN: buffer_store_dword [[TWO]]
+; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2
 ; GCN: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 
 ; GCN: [[IF_LABEL]]:
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: buffer_store_dword [[ONE]]
+; GCN-NEXT: v_mov_b32_e32 [[IMM_REG]], 1
 
-; GCN: [[ENDIF_LABEL]]:
+; GCN-NEXT: [[ENDIF_LABEL]]:
+; GCN: buffer_store_dword [[IMM_REG]]
+
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: buffer_store_dword [[THREE]]
 ; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/unify-metadata.ll b/test/CodeGen/AMDGPU/unify-metadata.ll
new file mode 100644
index 0000000..2b8eec8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unify-metadata.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple=amdgcn--amdhsa -amdgpu-unify-metadata -S < %s | FileCheck -check-prefix=ALL %s
+
+; This test checks that we have a single metadata value after linking several
+; modules for records such as opencl.ocl.version, llvm.ident and similar.
+
+; ALL-DAG: !opencl.ocl.version = !{![[OCL_VER:[0-9]+]]}
+; ALL-DAG: !llvm.ident = !{![[LLVM_IDENT:[0-9]+]]}
+; ALL-DAG: !opencl.used.extensions = !{![[USED_EXT:[0-9]+]]}
+; ALL-DAG: ![[OCL_VER]] = !{i32 1, i32 2}
+; ALL-DAG: ![[LLVM_IDENT]] = !{!"clang version 4.0 "}
+; ALL-DAG: ![[USED_EXT]] = !{!"cl_images", !"cl_khr_fp16", !"cl_doubles"}
+
+define void @test() {
+   ret void
+}
+
+!opencl.ocl.version = !{!1, !0, !0, !0}
+!llvm.ident = !{!2, !2, !2, !2}
+!opencl.used.extensions = !{!3, !3, !4, !5}
+
+!0 = !{i32 2, i32 0}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 4.0 "}
+!3 = !{!"cl_images", !"cl_khr_fp16"}
+!4 = !{!"cl_images", !"cl_doubles"}
+!5 = !{}
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index 681b9be..df7cf09 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -34,10 +34,10 @@
 ; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}}
 ; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}}
 
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}}
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
diff --git a/test/CodeGen/ARM/deprecated-asm.s b/test/CodeGen/ARM/deprecated-asm.s
new file mode 100644
index 0000000..7318e6a
--- /dev/null
+++ b/test/CodeGen/ARM/deprecated-asm.s
@@ -0,0 +1,43 @@
+// REQUIRES: asserts
+// RUN: llvm-mc < %s  -triple=armv4t-linux-gnueabi -filetype=obj -o %t.o -no-deprecated-warn -stats 2>&1 | FileCheck %s
+// RUN: llvm-mc < %s  -triple=armv4t-linux-gnueabi -filetype=obj -o %t.o 2>&1 | FileCheck %s -check-prefix=WARN
+
+	.text
+	.syntax unified
+	.eabi_attribute	67, "2.09"	@ Tag_conformance
+	.cpu	arm7tdmi
+	.eabi_attribute	6, 2	@ Tag_CPU_arch
+	.eabi_attribute	8, 1	@ Tag_ARM_ISA_use
+	.eabi_attribute	17, 1	@ Tag_ABI_PCS_GOT_use
+	.eabi_attribute	20, 1	@ Tag_ABI_FP_denormal
+	.eabi_attribute	21, 1	@ Tag_ABI_FP_exceptions
+	.eabi_attribute	23, 3	@ Tag_ABI_FP_number_model
+	.eabi_attribute	34, 0	@ Tag_CPU_unaligned_access
+	.eabi_attribute	24, 1	@ Tag_ABI_align_needed
+	.eabi_attribute	25, 1	@ Tag_ABI_align_preserved
+	.eabi_attribute	38, 1	@ Tag_ABI_FP_16bit_format
+	.eabi_attribute	18, 4	@ Tag_ABI_PCS_wchar_t
+	.eabi_attribute	26, 2	@ Tag_ABI_enum_size
+	.eabi_attribute	14, 0	@ Tag_ABI_PCS_R9_use
+	.file	"t.c"
+	.globl	foo
+	.p2align	2
+	.type	foo,%function
+foo:                                    @ @foo
+	.fnstart
+@ BB#0:                                 @ %entry
+	mov	r0, #0
+	bx	lr
+        stmia   r4!, {r12-r14}
+.Lfunc_end0:
+.Ltmp0:
+	.size	foo, .Ltmp0-foo
+	.cantunwind
+	.fnend
+
+
+
+// CHECK: Statistic
+// CHECK-NOT: warning
+
+// WARN: warning
diff --git a/test/CodeGen/AVR/error-srcreg-destreg-same.ll b/test/CodeGen/AVR/error-srcreg-destreg-same.ll
deleted file mode 100644
index 8227635..0000000
--- a/test/CodeGen/AVR/error-srcreg-destreg-same.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -march=avr | FileCheck %s
-; XFAIL: *
-
-; This occurs when compiling Rust libcore.
-;
-; Assertion failed:
-; (DstReg != SrcReg && "SrcReg and DstReg cannot be the same")
-;   lib/Target/AVR/AVRExpandPseudoInsts.cpp, line 817
-;
-; https://github.com/avr-llvm/llvm/issues/229
-
-; CHECK-LABEL: rust_eh_personality
-declare void @rust_eh_personality()
-
-; CHECK-LABEL: __udivmoddi4
-define void @__udivmoddi4(i64 %arg, i64 %arg1) personality i32 (...)* bitcast (void ()* @rust_eh_personality to i32 (...)*) {
-entry-block:
-  %tmp = lshr i64 %arg, 32
-  %tmp2 = trunc i64 %tmp to i32
-  %tmp3 = trunc i64 %arg to i32
-  %tmp4 = add i64 %arg1, -1
-  br label %bb135
-
-bb133.loopexit:
-  ret void
-
-bb135:
-  %carry.0120 = phi i64 [ 0, %entry-block ], [ %phitmp, %bb135 ]
-  %q.sroa.12.1119 = phi i32 [ %tmp3, %entry-block ], [ %q.sroa.12.0.extract.trunc, %bb135 ]
-  %q.sroa.0.1118 = phi i32 [ 0, %entry-block ], [ %q.sroa.0.0.extract.trunc, %bb135 ]
-  %r.sroa.0.1116 = phi i32 [ %tmp2, %entry-block ], [ undef, %bb135 ]
-  %r.sroa.0.0.insert.ext62 = zext i32 %r.sroa.0.1116 to i64
-  %r.sroa.0.0.insert.insert64 = or i64 0, %r.sroa.0.0.insert.ext62
-  %tmp5 = shl nuw nsw i64 %r.sroa.0.0.insert.ext62, 1
-  %q.sroa.12.0.insert.ext101 = zext i32 %q.sroa.12.1119 to i64
-  %q.sroa.12.0.insert.shift102 = shl nuw i64 %q.sroa.12.0.insert.ext101, 32
-  %q.sroa.0.0.insert.ext87 = zext i32 %q.sroa.0.1118 to i64
-  %q.sroa.0.0.insert.insert89 = or i64 %q.sroa.12.0.insert.shift102, %q.sroa.0.0.insert.ext87
-  %tmp6 = lshr i64 %q.sroa.12.0.insert.ext101, 31
-  %tmp7 = lshr i64 %r.sroa.0.0.insert.insert64, 31
-  %tmp8 = shl nuw nsw i64 %q.sroa.0.0.insert.ext87, 1
-  %tmp9 = or i64 %tmp8, %carry.0120
-  %q.sroa.0.0.extract.trunc = trunc i64 %tmp9 to i32
-  %tmp10 = lshr i64 %q.sroa.0.0.insert.insert89, 31
-  %q.sroa.12.0.extract.trunc = trunc i64 %tmp10 to i32
-  %r.sroa.13.0.insert.shift72 = shl i64 %tmp7, 32
-  %.masked114 = and i64 %tmp5, 4294967294
-  %r.sroa.0.0.insert.ext57 = or i64 %tmp6, %.masked114
-  %r.sroa.0.0.insert.insert59 = or i64 %r.sroa.0.0.insert.ext57, %r.sroa.13.0.insert.shift72
-  %tmp11 = sub i64 %tmp4, %r.sroa.0.0.insert.insert59
-  %tmp12 = ashr i64 %tmp11, 63
-  %phitmp = and i64 %tmp12, 1
-  %tmp13 = icmp ult i32 undef, 32
-  br i1 %tmp13, label %bb135, label %bb133.loopexit
-}
-
diff --git a/test/CodeGen/AVR/expand-integer-failure.ll b/test/CodeGen/AVR/expand-integer-failure.ll
index cc022c5..99be3c8 100644
--- a/test/CodeGen/AVR/expand-integer-failure.ll
+++ b/test/CodeGen/AVR/expand-integer-failure.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -march=avr | FileCheck %s
-; XFAIL: *
 
 ; Causes an assertion error
 ; Assertion failed: (Lo.getValueType() == TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
@@ -20,4 +19,3 @@
 else:
   ret void
 }
-
diff --git a/test/CodeGen/AVR/progmem-extended.ll b/test/CodeGen/AVR/progmem-extended.ll
index 4dec871..c4c474e 100644
--- a/test/CodeGen/AVR/progmem-extended.ll
+++ b/test/CodeGen/AVR/progmem-extended.ll
@@ -1,4 +1,7 @@
 ; RUN: llc < %s -march=avr -mattr=movw,lpmx | FileCheck %s
+; XFAIL: *
+
+; Wide LPM is currently unimplemented in the pseudo expansion pass.
 
 ; Tests the extended LPM instructions (LPMW, LPM Rd, Z+).
 
diff --git a/test/CodeGen/AVR/pseudo/ADCWRdRr.mir b/test/CodeGen/AVR/pseudo/ADCWRdRr.mir
new file mode 100644
index 0000000..475d5b3
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ADCWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit add with carry pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_adcwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_adcwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_adcwrdrr
+
+    ; CHECK:       %r14 = ADCRdRr %r14, %r20, implicit-def %sreg, implicit %sreg
+    ; CHECK-NEXT:  %r15 = ADCRdRr %r15, %r21, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = ADCWRdRr %r15r14, %r21r20, implicit-def %sreg, implicit %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/ADDWRdRr.mir b/test/CodeGen/AVR/pseudo/ADDWRdRr.mir
new file mode 100644
index 0000000..2205feb
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ADDWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit add pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_addwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_addwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_addwrdrr
+
+    ; CHECK:       %r14 = ADDRdRr %r14, %r20, implicit-def %sreg
+    ; CHECK-NEXT:  %r15 = ADCRdRr %r15, %r21, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = ADDWRdRr %r15r14, %r21r20, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
new file mode 100644
index 0000000..5af8db1
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit ANDI pseudo instruction.
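+# The immediate 64175 is 0xFAAF, so the CHECK lines below expect it to be split
+# into a low byte of 175 (0xAF) and a high byte of 250 (0xFA), one ANDIRdK each.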
+
+--- |
+  target triple = "avr--"
+  define void @test_andiwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_andiwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_andiwrdrr
+
+    ; CHECK:      %r20 = ANDIRdK %r20, 175, implicit-def dead %sreg
+    ; CHECK-NEXT: %r21 = ANDIRdK %r21, 250, implicit-def %sreg
+
+    %r21r20 = ANDIWRdK %r17r16, 64175, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/ANDWRdRr.mir b/test/CodeGen/AVR/pseudo/ANDWRdRr.mir
new file mode 100644
index 0000000..c9458e9
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ANDWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit AND pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_andwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_andwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_andwrdrr
+
+    ; CHECK:      %r14 = ANDRdRr %r14, %r20, implicit-def dead %sreg
+    ; CHECK-NEXT: %r15 = ANDRdRr %r15, %r21, implicit-def %sreg
+
+    %r15r14 = ANDWRdRr %r15r14, %r21r20, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/ASRWRd.mir b/test/CodeGen/AVR/pseudo/ASRWRd.mir
new file mode 100644
index 0000000..3e80956
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ASRWRd.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      %r15 = ASRRd %r15, implicit-def %sreg
+    ; CHECK-NEXT: %r14 = RORRd %r14, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = ASRWRd %r15r14, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/COMWRd.mir b/test/CodeGen/AVR/pseudo/COMWRd.mir
new file mode 100644
index 0000000..282d601
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/COMWRd.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit COM pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_comwrd() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_comwrd
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_comwrd
+
+    ; CHECK:      %r14 = COMRd %r14, implicit-def dead %sreg
+    ; CHECK-NEXT: %r15 = COMRd %r15, implicit-def %sreg
+
+    %r15r14 = COMWRd %r9r8, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/CPCWRdRr.mir b/test/CodeGen/AVR/pseudo/CPCWRdRr.mir
new file mode 100644
index 0000000..2081aa0
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/CPCWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit CPCW pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_cpcwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_cpcwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_cpcwrdrr
+
+    ; CHECK:      CPCRdRr %r20, %r22, implicit-def %sreg, implicit killed %sreg
+    ; CHECK-NEXT: CPCRdRr %r21, %r23, implicit-def %sreg, implicit killed %sreg
+
+    CPCWRdRr %r21r20, %r23r22, implicit-def %sreg, implicit %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/CPWRdRr.mir b/test/CodeGen/AVR/pseudo/CPWRdRr.mir
new file mode 100644
index 0000000..7e25e7f
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/CPWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit CPW pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_cpwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_cpwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_cpwrdrr
+
+    ; CHECK:      CPRdRr %r14, %r20, implicit-def %sreg
+    ; CHECK-NEXT: CPCRdRr %r15, %r21, implicit-def %sreg, implicit killed %sreg
+
+    CPWRdRr %r15r14, %r21r20, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/EORWRdRr.mir b/test/CodeGen/AVR/pseudo/EORWRdRr.mir
new file mode 100644
index 0000000..8769c12
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/EORWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit EOR pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_eorwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_eorwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_eorwrdrr
+
+    ; CHECK:      %r14 = EORRdRr %r14, %r20, implicit-def dead %sreg
+    ; CHECK-NEXT: %r15 = EORRdRr %r15, %r21, implicit-def %sreg
+
+    %r15r14 = EORWRdRr %r15r14, %r21r20, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/FRMIDX.mir b/test/CodeGen/AVR/pseudo/FRMIDX.mir
new file mode 100644
index 0000000..47a9397
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/FRMIDX.mir
@@ -0,0 +1,25 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# TODO: Write this test.
+# This instruction isn't expanded by the pseudo expansion pass, but
+# rather by AVRRegisterInfo::eliminateFrameIndex.
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+registers:
+  - { id: 0, class: _ }
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    %r29r28 = FRMIDX %r31r30, 0, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/INWRdA.mir b/test/CodeGen/AVR/pseudo/INWRdA.mir
new file mode 100644
index 0000000..a801598
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/INWRdA.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      %r14 = INRdA 31
+    ; CHECK-NEXT: %r15 = INRdA 32
+
+    %r15r14 = INWRdA 31
+...
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
new file mode 100644
index 0000000..dd95abd
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0  %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit 'LDDWRdPtrQ' pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_lddwrdptrq() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_lddwrdptrq
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_lddwrdptrq
+
+    ; CHECK:      ldd     r30, Y+10
+    ; CHECK-NEXT: ldd     r31, Y+11
+
+    early-clobber %r31r30 = LDDWRdPtrQ %r29r28, 10
+...
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdYQ.mir b/test/CodeGen/AVR/pseudo/LDDWRdYQ.mir
new file mode 100644
index 0000000..bb1ef8e
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDDWRdYQ.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0  %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit 'LDDWRdYQ' pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_lddwrdyq() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_lddwrdyq
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_lddwrdyq
+
+    ; CHECK:      ldd     r30, Y+1
+    ; CHECK-NEXT: ldd     r31, Y+2
+
+    early-clobber %r31r30 = LDDWRdYQ %r29r28, 1
+...
diff --git a/test/CodeGen/AVR/pseudo/LDIWRdK.mir b/test/CodeGen/AVR/pseudo/LDIWRdK.mir
new file mode 100644
index 0000000..23d16d9
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDIWRdK.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit LDIWRdK pseudo instruction.
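+# The immediate 2559 is 0x09FF, so the CHECK lines below expect a low-byte LDI
+# of 255 and a high-byte LDI of 9.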
+
+--- |
+  target triple = "avr--"
+  define void @test_ldiwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_ldiwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_ldiwrdrr
+
+    ; CHECK:      %r30 = LDIRdK 255
+    ; CHECK-NEXT: %r31 = LDIRdK 9
+
+    %r31r30 = LDIWRdK 2559
+...
diff --git a/test/CodeGen/AVR/pseudo/LDSWRdK.mir b/test/CodeGen/AVR/pseudo/LDSWRdK.mir
new file mode 100644
index 0000000..aa48836
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDSWRdK.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit LDSWRdK pseudo instruction.
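+# The CHECK lines below expect the wide load to become two byte loads from the
+# consecutive addresses 2559 and 2560.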
+
+--- |
+  target triple = "avr--"
+  define void @test_ldswrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_ldswrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_ldswrdrr
+
+    ; CHECK:      %r30 = LDSRdK 2559
+    ; CHECK-NEXT: %r31 = LDSRdK 2560
+
+    %r31r30 = LDSWRdK 2559
+...
diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtr.mir b/test/CodeGen/AVR/pseudo/LDWRdPtr.mir
new file mode 100644
index 0000000..aaf9f18
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDWRdPtr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit LDWRdPtr pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_ldwrdptr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_ldwrdptr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_ldwrdptr
+
+    ; CHECK:                    %r0 = LDRdPtr %r31r30
+    ; CHECK-NEXT: early-clobber %r1 = LDDRdPtrQ %r31r30, 1
+
+    %r1r0 = LDWRdPtr %r31r30
+...
diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir b/test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir
new file mode 100644
index 0000000..f304cc2
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit LDWRdPtrPd pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_ldwrdptrpd() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_ldwrdptrpd
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_ldwrdptrpd
+
+    ; CHECK:      early-clobber %r1, early-clobber %r31r30 = LDRdPtrPd killed %r31r30
+    ; CHECK-NEXT: early-clobber %r0, early-clobber %r31r30 = LDRdPtrPd killed %r31r30
+
+    %r1r0, %r31r30 = LDWRdPtrPd %r31r30
+...
diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir b/test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir
new file mode 100644
index 0000000..9153be0
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit LDWRdPtrPi pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_ldwrdptrpi() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_ldwrdptrpi
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_ldwrdptrpi
+
+    ; CHECK:      early-clobber %r0, early-clobber %r31r30 = LDRdPtrPi killed %r31r30
+    ; CHECK-NEXT: early-clobber %r1, early-clobber %r31r30 = LDRdPtrPi killed %r31r30
+
+    %r1r0, %r31r30 = LDWRdPtrPi %r31r30
+...
diff --git a/test/CodeGen/AVR/pseudo/LSLWRd.mir b/test/CodeGen/AVR/pseudo/LSLWRd.mir
new file mode 100644
index 0000000..4419398
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LSLWRd.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      %r14 = LSLRd %r14, implicit-def %sreg
+    ; CHECK-NEXT: %r15 = ROLRd %r15, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = LSLWRd %r15r14, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/LSRWRd.mir b/test/CodeGen/AVR/pseudo/LSRWRd.mir
new file mode 100644
index 0000000..f5ffb93
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/LSRWRd.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      %r15 = LSRRd %r15, implicit-def %sreg
+    ; CHECK-NEXT: %r14 = RORRd %r14, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = LSRWRd %r15r14, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/ORIWRdK.mir b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
new file mode 100644
index 0000000..92bc367
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit ORI pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_oriwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_oriwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_oriwrdrr
+
+    ; CHECK:      %r20 = ORIRdK %r20, 175, implicit-def dead %sreg
+    ; CHECK-NEXT: %r21 = ORIRdK %r21, 250, implicit-def %sreg
+
+    %r21r20 = ORIWRdK %r17r16, 64175, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/ORWRdRr.mir b/test/CodeGen/AVR/pseudo/ORWRdRr.mir
new file mode 100644
index 0000000..f7a377e
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ORWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit OR pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_orwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_orwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_orwrdrr
+
+    ; CHECK:      %r14 = ORRdRr %r14, %r20, implicit-def dead %sreg
+    ; CHECK-NEXT: %r15 = ORRdRr %r15, %r21, implicit-def %sreg
+
+    %r15r14 = ORWRdRr %r15r14, %r21r20, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/OUTWARr.mir b/test/CodeGen/AVR/pseudo/OUTWARr.mir
new file mode 100644
index 0000000..85e9f52
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/OUTWARr.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      OUTARr 32, %r15
+    ; CHECK-NEXT: OUTARr 31, %r14
+
+    OUTWARr 31, %r15r14
+...
diff --git a/test/CodeGen/AVR/pseudo/POPWRd.mir b/test/CodeGen/AVR/pseudo/POPWRd.mir
new file mode 100644
index 0000000..6794742
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/POPWRd.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:       %r29 = POPRd implicit-def %sp, implicit %sp
+    ; CHECK-NEXT:  %r28 = POPRd implicit-def %sp, implicit %sp
+
+    %r29r28 = POPWRd implicit-def %sp, implicit %sp
+...
diff --git a/test/CodeGen/AVR/pseudo/PUSHWRr.mir b/test/CodeGen/AVR/pseudo/PUSHWRr.mir
new file mode 100644
index 0000000..9392086
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/PUSHWRr.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      PUSHRr %r28, implicit-def %sp, implicit %sp
+    ; CHECK-NEXT: PUSHRr %r29, implicit-def %sp, implicit %sp
+
+    PUSHWRr %r29r28, implicit-def %sp, implicit %sp
+...
diff --git a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
new file mode 100644
index 0000000..9152c6d
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit subtraction with carry pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_sbciwrdk() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_sbciwrdk
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_sbciwrdk
+
+    ; CHECK:      %r20 = SBCIRdK %r20, 175, implicit-def %sreg, implicit killed %sreg
+    ; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg
+
+    %r21r20 = SBCIWRdK %r17r16, 64175, implicit-def %sreg, implicit %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/SBCWRdRr.mir b/test/CodeGen/AVR/pseudo/SBCWRdRr.mir
new file mode 100644
index 0000000..9159906
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/SBCWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit subtraction with carry pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_sbcwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_sbcwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_sbcwrdrr
+
+    ; CHECK:      %r14 = SBCRdRr %r14, %r20, implicit-def %sreg
+    ; CHECK-NEXT: %r15 = SBCRdRr %r15, %r21, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = SBCWRdRr %r15r14, %r21r20, implicit-def %sreg, implicit %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/SEXT.mir b/test/CodeGen/AVR/pseudo/SEXT.mir
new file mode 100644
index 0000000..069eb88
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/SEXT.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      %r14 = MOVRdRr %r31
+    ; CHECK-NEXT: %r15 = MOVRdRr %r31
+    ; CHECK-NEXT: %r15 = LSLRd killed %r15, implicit-def %sreg
+    ; CHECK-NEXT: %r15 = SBCRdRr killed %r15, killed %r15, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = SEXT %r31, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/STDWPtrQRr.mir b/test/CodeGen/AVR/pseudo/STDWPtrQRr.mir
new file mode 100644
index 0000000..ff2fdb9
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/STDWPtrQRr.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo  %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      STDPtrQRr %r29r28, 10, %r0
+    ; CHECK-NEXT: STDPtrQRr %r29r28, 11, %r1
+
+    STDWPtrQRr %r29r28, 10, %r1r0
+...
diff --git a/test/CodeGen/AVR/pseudo/STSWKRr.mir b/test/CodeGen/AVR/pseudo/STSWKRr.mir
new file mode 100644
index 0000000..ccf8522
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/STSWKRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit STSWKRr pseudo instruction.
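+# The CHECK lines below expect two byte stores, with the high byte written to
+# address 2560 before the low byte is written to 2559.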
+
+--- |
+  target triple = "avr--"
+  define void @test_stswkrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_stswkrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_stswkrr
+
+    ; CHECK:      STSKRr 2560, %r31
+    ; CHECK-NEXT: STSKRr 2559, %r30
+
+    STSWKRr 2559, %r31r30
+...
diff --git a/test/CodeGen/AVR/pseudo/STWPtrPdRr.mir b/test/CodeGen/AVR/pseudo/STWPtrPdRr.mir
new file mode 100644
index 0000000..0d0d9e9
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/STWPtrPdRr.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      early-clobber %r31r30 = STPtrPdRr killed %r31r30, %r29, 52
+    ; CHECK-NEXT: early-clobber %r31r30 = STPtrPdRr killed %r31r30, %r28, 52
+
+    %r31r30 = STWPtrPdRr %r31r30, %r29r28, 52
+...
diff --git a/test/CodeGen/AVR/pseudo/STWPtrPiRr.mir b/test/CodeGen/AVR/pseudo/STWPtrPiRr.mir
new file mode 100644
index 0000000..a436d9b
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/STWPtrPiRr.mir
@@ -0,0 +1,22 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      early-clobber %r31r30 = STPtrPiRr killed %r31r30, %r28, 52
+    ; CHECK-NEXT: early-clobber %r31r30 = STPtrPiRr killed %r31r30, %r29, 52
+
+    %r31r30 = STWPtrPiRr %r31r30, %r29r28, 52
+...
diff --git a/test/CodeGen/AVR/pseudo/STWPtrRr.mir b/test/CodeGen/AVR/pseudo/STWPtrRr.mir
new file mode 100644
index 0000000..f85f4f8
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/STWPtrRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit STWPtrRr pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_stwptrrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_stwptrrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_stwptrrr
+
+    ; CHECK:      STPtrRr %r31r30, %r16
+    ; CHECK-NEXT: STDPtrQRr %r31r30, 1, %r17
+
+    STWPtrRr %r31r30, %r17r16
+...
diff --git a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
new file mode 100644
index 0000000..95c68c0
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit subtraction pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_subiwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_subiwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_subiwrdrr
+
+    ; CHECK:      %r20 = SUBIRdK %r20, 175, implicit-def %sreg
+    ; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg
+
+    %r21r20 = SUBIWRdK %r17r16, 64175, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/SUBWRdRr.mir b/test/CodeGen/AVR/pseudo/SUBWRdRr.mir
new file mode 100644
index 0000000..9892cf5
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/SUBWRdRr.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test checks the expansion of the 16-bit subtraction pseudo instruction.
+
+--- |
+  target triple = "avr--"
+  define void @test_subwrdrr() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_subwrdrr
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test_subwrdrr
+
+    ; CHECK:      %r14 = SUBRdRr %r14, %r20, implicit-def %sreg
+    ; CHECK-NEXT: %r15 = SBCRdRr %r15, %r21, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = SUBWRdRr %r15r14, %r21r20, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/ZEXT.mir b/test/CodeGen/AVR/pseudo/ZEXT.mir
new file mode 100644
index 0000000..069eb88
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/ZEXT.mir
@@ -0,0 +1,24 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+  target triple = "avr--"
+  define void @test() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test
+body: |
+  bb.0.entry:
+
+    ; CHECK-LABEL: test
+
+    ; CHECK:      %r14 = MOVRdRr %r31
+    ; CHECK-NEXT: %r15 = MOVRdRr %r31
+    ; CHECK-NEXT: %r15 = LSLRd killed %r15, implicit-def %sreg
+    ; CHECK-NEXT: %r15 = SBCRdRr killed %r15, killed %r15, implicit-def %sreg, implicit killed %sreg
+
+    %r15r14 = SEXT %r31, implicit-def %sreg
+...
diff --git a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir b/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir
new file mode 100644
index 0000000..1fe7ce3
--- /dev/null
+++ b/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir
@@ -0,0 +1,33 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+# This test ensures that the pseudo expander can correctly handle the case
+# where we are expanding a 16-bit LDD instruction where the source and
+# destination registers are the same.
+#
+# The instruction itself is earlyclobber and so ISel will never produce an
+# instruction like this, but the stack slot loading can and will.
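+#
+# As the CHECK lines below show, the expected expansion stages each byte through
+# r0: load the low byte from Y+1 into r0 and push it, load the high byte from
+# Y+2 into r0 and move it into r29, then pop the saved low byte into r28.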
+
+--- |
+  target triple = "avr--"
+  define void @test_lddw() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_lddw
+registers:
+  - { id: 0, class: _ }
+body: |
+  ; CHECK-LABEL: bb.0.entry
+  bb.0.entry:
+
+    ; CHECK-NEXT: early-clobber %r0 = LDDRdPtrQ %r29r28, 1
+    ; CHECK-NEXT: PUSHRr %r0, implicit-def %sp, implicit %sp
+    ; CHECK-NEXT: early-clobber %r0 = LDDRdPtrQ %r29r28, 2
+    ; CHECK-NEXT: MOVRdRr %r29, %r0
+    ; CHECK-NEXT: POPRd %r28, implicit-def %sp, implicit %sp
+
+    early-clobber %r29r28 = LDDWRdYQ %r29r28, 1
+...
diff --git a/test/CodeGen/MIR/AMDGPU/insert-waits-exp.mir b/test/CodeGen/MIR/AMDGPU/insert-waits-exp.mir
new file mode 100644
index 0000000..9aaa374
--- /dev/null
+++ b/test/CodeGen/MIR/AMDGPU/insert-waits-exp.mir
@@ -0,0 +1,63 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
+--- |
+  define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+    %a = load volatile float, float addrspace(1)* undef
+    %b = load volatile float, float addrspace(1)* undef
+    %c = load volatile float, float addrspace(1)* undef
+    %d = load volatile float, float addrspace(1)* undef
+    call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d)
+    ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
+  }
+
+  declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+  attributes #0 = { readnone }
+  attributes #1 = { nounwind }
+
+...
+---
+
+# CHECK-LABEL: name: exp_done_waitcnt{{$}}
+# CHECK: EXP_DONE
+# CHECK-NEXT: S_WAITCNT 3855
+# CHECK: %vgpr0 = V_MOV_B32
+# CHECK: %vgpr1 = V_MOV_B32
+# CHECK: %vgpr2 = V_MOV_B32
+# CHECK: %vgpr3 = V_MOV_B32
+name:            exp_done_waitcnt
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.2):
+    %sgpr3 = S_MOV_B32 61440
+    %sgpr2 = S_MOV_B32 -1
+    %vgpr0 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %vgpr2 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    EXP_DONE 0, killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3, -1, -1, 15, implicit %exec
+    %vgpr0 = V_MOV_B32_e32 1056964608, implicit %exec
+    %vgpr1 = V_MOV_B32_e32 1065353216, implicit %exec
+    %vgpr2 = V_MOV_B32_e32 1073741824, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 1082130432, implicit %exec
+    SI_RETURN killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3
+
+...
diff --git a/test/CodeGen/MIR/AMDGPU/movrels-bug.mir b/test/CodeGen/MIR/AMDGPU/movrels-bug.mir
index 6493cc8..9c330bc 100644
--- a/test/CodeGen/MIR/AMDGPU/movrels-bug.mir
+++ b/test/CodeGen/MIR/AMDGPU/movrels-bug.mir
@@ -25,7 +25,7 @@
     %m0 = S_MOV_B32 undef %sgpr0
     %vgpr1 = V_MOVRELS_B32_e32 undef %vgpr1, implicit %m0, implicit %exec, implicit killed %vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
     %vgpr4 = V_MAC_F32_e32 undef %vgpr0, undef %vgpr0, undef %vgpr4, implicit %exec
-    EXP 15, 12, 0, 1, 0, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, implicit %exec
+    EXP_DONE 15, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, 0, 0, 12, implicit %exec
     S_ENDPGM
 
 ...
diff --git a/test/CodeGen/MSP430/umulo-16.ll b/test/CodeGen/MSP430/umulo-16.ll
new file mode 100644
index 0000000..bd42137
--- /dev/null
+++ b/test/CodeGen/MSP430/umulo-16.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=msp430 | FileCheck %s
+target datalayout = "e-m:e-p:16:16-i32:16:32-a:16-n8:16"
+target triple = "msp430"
+
+define void @foo(i16 %arg) unnamed_addr {
+entry-block:
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %entry-block
+  unreachable
+
+bb3:                                              ; preds = %entry-block
+  %0 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 %arg)
+; CHECK: call
+  %1 = extractvalue { i16, i1 } %0, 1
+  %2 = call i1 @llvm.expect.i1(i1 %1, i1 false)
+  br i1 %2, label %panic, label %bb5
+
+bb5:                                              ; preds = %bb3
+  unreachable
+
+panic:                                            ; preds = %bb3
+  unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare i1 @llvm.expect.i1(i1, i1) #0
+
+; Function Attrs: nounwind readnone
+declare { i16, i1 } @llvm.umul.with.overflow.i16(i16, i16) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll
index 79bd7f2..ab799eb 100644
--- a/test/CodeGen/Mips/ehframe-indirect.ll
+++ b/test/CodeGen/Mips/ehframe-indirect.ll
@@ -4,8 +4,6 @@
 ; RUN:     FileCheck -check-prefixes=ALL,LINUX,LINUX-O32,O32 %s
 ; RUN: llc -mtriple=mips64el-linux-gnu -target-abi=n32 < %s -asm-verbose -relocation-model=pic | \
 ; RUN:     FileCheck -check-prefixes=ALL,LINUX,LINUX-N32,N32 %s
-; RUN: llc -mtriple=mips64el-linux-android -target-abi=n32 < %s -asm-verbose -relocation-model=pic | \
-; RUN:     FileCheck -check-prefixes=ALL,LINUX,LINUX-N32,N32 %s
 ; RUN: llc -mtriple=mips64el-linux-gnu < %s -asm-verbose -relocation-model=pic | \
 ; RUN:     FileCheck -check-prefixes=ALL,LINUX,LINUX-N64,N64 %s
 ; RUN: llc -mtriple=mips64el-linux-android < %s -asm-verbose -relocation-model=pic | \
diff --git a/test/CodeGen/Mips/fp16-promote.ll b/test/CodeGen/Mips/fp16-promote.ll
index f060f6a..e3d3a0a 100644
--- a/test/CodeGen/Mips/fp16-promote.ll
+++ b/test/CodeGen/Mips/fp16-promote.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false -mtriple=mipsel-linux-gnueabi -relocation-model=pic < %s | FileCheck %s -check-prefix=CHECK-LIBCALL
+; RUN: llc -asm-verbose=false -mtriple=mipsel-linux-gnu -relocation-model=pic < %s | FileCheck %s -check-prefix=CHECK-LIBCALL
 
 ; CHECK-LIBCALL-LABEL: test_fadd:
 ; CHECK-LIBCALL: %call16(__gnu_h2f_ieee)
diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll
new file mode 100644
index 0000000..7babd3f
--- /dev/null
+++ b/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -0,0 +1,4858 @@
+; RUN: llc -mcpu=pwr9 -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-unknown \
+; RUN:   < %s | FileCheck %s -check-prefix=P9BE -implicit-check-not frsp
+; RUN: llc -mcpu=pwr9 -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
+; RUN:   < %s | FileCheck %s -check-prefix=P9LE -implicit-check-not frsp
+; RUN: llc -mcpu=pwr8 -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-unknown \
+; RUN:   < %s | FileCheck %s -check-prefix=P8BE -implicit-check-not frsp
+; RUN: llc -mcpu=pwr8 -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
+; RUN:   < %s | FileCheck %s -check-prefix=P8LE -implicit-check-not frsp
+
+; This test case comes from the following C test case (included as it may be
+; slightly more readable than the LLVM IR).
+
+;/*  This test case provides various ways of building vectors to ensure we
+;    produce optimal code for all cases. The cases are (for each type):
+;    - All zeros
+;    - All ones
+;    - Splat of a constant
+;    - From different values already in registers
+;    - From different constants
+;    - From different values in memory
+;    - Splat of a value in register
+;    - Splat of a value in memory
+;    - Inserting element into existing vector
+;    - Inserting element from existing vector into existing vector
+;
+;    With conversions (float <-> int)
+;    - Splat of a constant
+;    - From different values already in registers
+;    - From different constants
+;    - From different values in memory
+;    - Splat of a value in register
+;    - Splat of a value in memory
+;    - Inserting element into existing vector
+;    - Inserting element from existing vector into existing vector
+;*/
+;
+;/*=================================== int ===================================*/
+;// P8: xxlxor                                                                //
+;// P9: xxlxor                                                                //
+;vector int allZeroi() {                                                      //
+;  return (vector int)0;                                                      //
+;}                                                                            //
+;// P8: vspltisb -1                                                           //
+;// P9: xxspltisb 255                                                         //
+;vector int allOnei() {                                                       //
+;  return (vector int)-1;                                                     //
+;}                                                                            //
+;// P8: vspltisw 1                                                            //
+;// P9: vspltisw 1                                                            //
+;vector int spltConst1i() {                                                   //
+;  return (vector int)1;                                                      //
+;}                                                                            //
+;// P8: vspltisw -15; vsrw                                                    //
+;// P9: vspltisw -15; vsrw                                                    //
+;vector int spltConst16ki() {                                                 //
+;  return (vector int)((1<<15) - 1);                                          //
+;}                                                                            //
+;// P8: vspltisw -16; vsrw                                                    //
+;// P9: vspltisw -16; vsrw                                                    //
+;vector int spltConst32ki() {                                                 //
+;  return (vector int)((1<<16) - 1);                                          //
+;}                                                                            //
+;// P8: 4 x mtvsrwz, 2 x xxmrgh, vmrgow                                       //
+;// P9: 2 x mtvsrdd, vmrgow                                                   //
+;vector int fromRegsi(int a, int b, int c, int d) {                           //
+;  return (vector int){ a, b, c, d };                                         //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (or even lxv)                                                    //
+;vector int fromDiffConstsi() {                                               //
+;  return (vector int) { 242, -113, 889, 19 };                                //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx                                                                  //
+;vector int fromDiffMemConsAi(int *arr) {                                     //
+;  return (vector int) { arr[0], arr[1], arr[2], arr[3] };                    //
+;}                                                                            //
+;// P8: 2 x lxvd2x, 2 x xxswapd, vperm                                        //
+;// P9: 2 x lxvx, vperm                                                       //
+;vector int fromDiffMemConsDi(int *arr) {                                     //
+;  return (vector int) { arr[3], arr[2], arr[1], arr[0] };                    //
+;}                                                                            //
+;// P8: sldi 2, lxvd2x, xxswapd                                               //
+;// P9: sldi 2, lxvx                                                          //
+;vector int fromDiffMemVarAi(int *arr, int elem) {                            //
+;  return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] };  //
+;}                                                                            //
+;// P8: sldi 2, 2 x lxvd2x, 2 x xxswapd, vperm                                //
+;// P9: sldi 2, 2 x lxvx, vperm                                               //
+;vector int fromDiffMemVarDi(int *arr, int elem) {                            //
+;  return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] };  //
+;}                                                                            //
+;// P8: 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow                             //
+;// P9: 4 x lwz, 2 x mtvsrdd, vmrgow                                          //
+;vector int fromRandMemConsi(int *arr) {                                      //
+;  return (vector int) { arr[4], arr[18], arr[2], arr[88] };                  //
+;}                                                                            //
+;// P8: sldi 2, 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow                     //
+;// P9: sldi 2, add, 4 x lwz, 2 x mtvsrdd, vmrgow                             //
+;vector int fromRandMemVari(int *arr, int elem) {                             //
+;  return (vector int) { arr[elem+4], arr[elem+1], arr[elem+2], arr[elem+8] };//
+;}                                                                            //
+;// P8: mtvsrwz, xxspltw                                                      //
+;// P9: mtvsrws                                                               //
+;vector int spltRegVali(int val) {                                            //
+;  return (vector int) val;                                                   //
+;}                                                                            //
+;// P8: lxsiwax, xxspltw                                                      //
+;// P9: lxvwsx                                                                //
+;vector int spltMemVali(int *ptr) {                                           //
+;  return (vector int)*ptr;                                                   //
+;}                                                                            //
+;// P8: vspltisw                                                              //
+;// P9: vspltisw                                                              //
+;vector int spltCnstConvftoi() {                                              //
+;  return (vector int) 4.74f;                                                 //
+;}                                                                            //
+;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws                         //
+;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvdpsxws                         //
+;vector int fromRegsConvftoi(float a, float b, float c, float d) {            //
+;  return (vector int) { a, b, c, d };                                        //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector int fromDiffConstsConvftoi() {                                        //
+;  return (vector int) { 24.46f, 234.f, 988.19f, 422.39f };                   //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd, xvcvspsxws                                           //
+;// P9: lxvx, xvcvspsxws                                                      //
+;vector int fromDiffMemConsAConvftoi(float *ptr) {                            //
+;  return (vector int) { ptr[0], ptr[1], ptr[2], ptr[3] };                    //
+;}                                                                            //
+;// P8: 2 x lxvd2x, 2 x xxswapd, vperm, xvcvspsxws                            //
+;// P9: 2 x lxvx, vperm, xvcvspsxws                                           //
+;vector int fromDiffMemConsDConvftoi(float *ptr) {                            //
+;  return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] };                    //
+;}                                                                            //
+;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws             //
+;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws              //
+;// Note: if the consecutive loads learns to handle pre-inc, this can be:     //
+;//       sldi 2, load, xvcvspuxws                                            //
+;vector int fromDiffMemVarAConvftoi(float *arr, int elem) {                   //
+;  return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] };  //
+;}                                                                            //
+;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws             //
+;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws              //
+;// Note: if the consecutive loads learns to handle pre-inc, this can be:     //
+;//       sldi 2, 2 x load, vperm, xvcvspuxws                                 //
+;vector int fromDiffMemVarDConvftoi(float *arr, int elem) {                   //
+;  return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] };  //
+;}                                                                            //
+;// P8: xscvdpsxws, xxspltw                                                   //
+;// P9: xscvdpsxws, xxspltw                                                   //
+;vector int spltRegValConvftoi(float val) {                                   //
+;  return (vector int) val;                                                   //
+;}                                                                            //
+;// P8: lxsspx, xscvdpsxws, xxspltw                                           //
+;// P9: lxvwsx, xvcvspsxws                                                    //
+;vector int spltMemValConvftoi(float *ptr) {                                  //
+;  return (vector int)*ptr;                                                   //
+;}                                                                            //
+;// P8: vspltisw                                                              //
+;// P9: vspltisw                                                              //
+;vector int spltCnstConvdtoi() {                                              //
+;  return (vector int) 4.74;                                                  //
+;}                                                                            //
+;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws                         //
+;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws                         //
+;vector int fromRegsConvdtoi(double a, double b, double c, double d) {        //
+;  return (vector int) { a, b, c, d };                                        //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector int fromDiffConstsConvdtoi() {                                        //
+;  return (vector int) { 24.46, 234., 988.19, 422.39 };                       //
+;}                                                                            //
+;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew,      //
+;//     xvcvspsxws                                                            //
+;// P9: 2 x lxvx, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew,        //
+;//     xvcvspsxws                                                            //
+;vector int fromDiffMemConsAConvdtoi(double *ptr) {                           //
+;  return (vector int) { ptr[0], ptr[1], ptr[2], ptr[3] };                    //
+;}                                                                            //
+;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws              //
+;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws                //
+;vector int fromDiffMemConsDConvdtoi(double *ptr) {                           //
+;  return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] };                    //
+;}                                                                            //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws       //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws         //
+;vector int fromDiffMemVarAConvdtoi(double *arr, int elem) {                  //
+;  return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] };  //
+;}                                                                            //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws       //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws         //
+;vector int fromDiffMemVarDConvdtoi(double *arr, int elem) {                  //
+;  return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] };  //
+;}                                                                            //
+;// P8: xscvdpsxws, xxspltw                                                   //
+;// P9: xscvdpsxws, xxspltw                                                   //
+;vector int spltRegValConvdtoi(double val) {                                  //
+;  return (vector int) val;                                                   //
+;}                                                                            //
+;// P8: lxsdx, xscvdpsxws, xxspltw                                            //
+;// P9: lxssp, xscvdpsxws, xxspltw                                            //
+;vector int spltMemValConvdtoi(double *ptr) {                                 //
+;  return (vector int)*ptr;                                                   //
+;}                                                                            //
+;/*=================================== int ===================================*/
+;/*=============================== unsigned int ==============================*/
+;// P8: xxlxor                                                                //
+;// P9: xxlxor                                                                //
+;vector unsigned int allZeroui() {                                            //
+;  return (vector unsigned int)0;                                             //
+;}                                                                            //
+;// P8: vspltisb -1                                                           //
+;// P9: xxspltisb 255                                                         //
+;vector unsigned int allOneui() {                                             //
+;  return (vector unsigned int)-1;                                            //
+;}                                                                            //
+;// P8: vspltisw 1                                                            //
+;// P9: vspltisw 1                                                            //
+;vector unsigned int spltConst1ui() {                                         //
+;  return (vector unsigned int)1;                                             //
+;}                                                                            //
+;// P8: vspltisw -15; vsrw                                                    //
+;// P9: vspltisw -15; vsrw                                                    //
+;vector unsigned int spltConst16kui() {                                       //
+;  return (vector unsigned int)((1<<15) - 1);                                 //
+;}                                                                            //
+;// P8: vspltisw -16; vsrw                                                    //
+;// P9: vspltisw -16; vsrw                                                    //
+;vector unsigned int spltConst32kui() {                                       //
+;  return (vector unsigned int)((1<<16) - 1);                                 //
+;}                                                                            //
+;// P8: 4 x mtvsrwz, 2 x xxmrghd, vmrgow                                      //
+;// P9: 2 x mtvsrdd, vmrgow                                                   //
+;vector unsigned int fromRegsui(unsigned int a, unsigned int b,               //
+;                              unsigned int c, unsigned int d) {              //
+;  return (vector unsigned int){ a, b, c, d };                                //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (or even lxv)                                                    //
+;vector unsigned int fromDiffConstsui() {                                     //
+;  return (vector unsigned int) { 242, -113, 889, 19 };                       //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx                                                                  //
+;vector unsigned int fromDiffMemConsAui(unsigned int *arr) {                  //
+;  return (vector unsigned int) { arr[0], arr[1], arr[2], arr[3] };           //
+;}                                                                            //
+;// P8: 2 x lxvd2x, 2 x xxswapd, vperm                                        //
+;// P9: 2 x lxvx, vperm                                                       //
+;vector unsigned int fromDiffMemConsDui(unsigned int *arr) {                  //
+;  return (vector unsigned int) { arr[3], arr[2], arr[1], arr[0] };           //
+;}                                                                            //
+;// P8: sldi 2, lxvd2x, xxswapd                                               //
+;// P9: sldi 2, lxvx                                                          //
+;vector unsigned int fromDiffMemVarAui(unsigned int *arr, int elem) {         //
+;  return (vector unsigned int) { arr[elem], arr[elem+1],                     //
+;                                 arr[elem+2], arr[elem+3] };                 //
+;}                                                                            //
+;// P8: sldi 2, 2 x lxvd2x, 2 x xxswapd, vperm                                //
+;// P9: sldi 2, 2 x lxvx, vperm                                               //
+;vector unsigned int fromDiffMemVarDui(unsigned int *arr, int elem) {         //
+;  return (vector unsigned int) { arr[elem], arr[elem-1],                     //
+;                                 arr[elem-2], arr[elem-3] };                 //
+;}                                                                            //
+;// P8: 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow                             //
+;// P9: 4 x lwz, 2 x mtvsrdd, vmrgow                                          //
+;vector unsigned int fromRandMemConsui(unsigned int *arr) {                   //
+;  return (vector unsigned int) { arr[4], arr[18], arr[2], arr[88] };         //
+;}                                                                            //
+;// P8: sldi 2, 4 x lwz, 4 x mtvsrwz, 2 x xxmrghd, vmrgow                     //
+;// P9: sldi 2, add, 4 x lwz, 2 x mtvsrdd, vmrgow                             //
+;vector unsigned int fromRandMemVarui(unsigned int *arr, int elem) {          //
+;  return (vector unsigned int) { arr[elem+4], arr[elem+1],                   //
+;                                 arr[elem+2], arr[elem+8] };                 //
+;}                                                                            //
+;// P8: mtvsrwz, xxspltw                                                      //
+;// P9: mtvsrws                                                               //
+;vector unsigned int spltRegValui(unsigned int val) {                         //
+;  return (vector unsigned int) val;                                          //
+;}                                                                            //
+;// P8: lxsiwax, xxspltw                                                      //
+;// P9: lxvwsx                                                                //
+;vector unsigned int spltMemValui(unsigned int *ptr) {                        //
+;  return (vector unsigned int)*ptr;                                          //
+;}                                                                            //
+;// P8: vspltisw                                                              //
+;// P9: vspltisw                                                              //
+;vector unsigned int spltCnstConvftoui() {                                    //
+;  return (vector unsigned int) 4.74f;                                        //
+;}                                                                            //
+;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws                         //
+;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws                         //
+;vector unsigned int fromRegsConvftoui(float a, float b, float c, float d) {  //
+;  return (vector unsigned int) { a, b, c, d };                               //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector unsigned int fromDiffConstsConvftoui() {                              //
+;  return (vector unsigned int) { 24.46f, 234.f, 988.19f, 422.39f };          //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd, xvcvspuxws                                           //
+;// P9: lxvx, xvcvspuxws                                                      //
+;vector unsigned int fromDiffMemConsAConvftoui(float *ptr) {                  //
+;  return (vector unsigned int) { ptr[0], ptr[1], ptr[2], ptr[3] };           //
+;}                                                                            //
+;// P8: 2 x lxvd2x, 2 x xxswapd, vperm, xvcvspuxws                            //
+;// P9: 2 x lxvx, vperm, xvcvspuxws                                           //
+;vector unsigned int fromDiffMemConsDConvftoui(float *ptr) {                  //
+;  return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] };           //
+;}                                                                            //
+;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws      //
+;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws         //
+;// Note: if the consecutive loads learn to handle pre-inc, this can be:      //
+;//       sldi 2, load, xvcvspuxws                                            //
+;vector unsigned int fromDiffMemVarAConvftoui(float *arr, int elem) {         //
+;  return (vector unsigned int) { arr[elem], arr[elem+1],                     //
+;                                 arr[elem+2], arr[elem+3] };                 //
+;}                                                                            //
+;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws      //
+;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws         //
+;// Note: if the consecutive loads learn to handle pre-inc, this can be:      //
+;//       sldi 2, 2 x load, vperm, xvcvspuxws                                 //
+;vector unsigned int fromDiffMemVarDConvftoui(float *arr, int elem) {         //
+;  return (vector unsigned int) { arr[elem], arr[elem-1],                     //
+;                                 arr[elem-2], arr[elem-3] };                 //
+;}                                                                            //
+;// P8: xscvdpuxws, xxspltw                                                   //
+;// P9: xscvdpuxws, xxspltw                                                   //
+;vector unsigned int spltRegValConvftoui(float val) {                         //
+;  return (vector unsigned int) val;                                          //
+;}                                                                            //
+;// P8: lxsspx, xscvdpuxws, xxspltw                                           //
+;// P9: lxvwsx, xvcvspuxws                                                    //
+;vector unsigned int spltMemValConvftoui(float *ptr) {                        //
+;  return (vector unsigned int)*ptr;                                          //
+;}                                                                            //
+;// P8: vspltisw                                                              //
+;// P9: vspltisw                                                              //
+;vector unsigned int spltCnstConvdtoui() {                                    //
+;  return (vector unsigned int) 4.74;                                         //
+;}                                                                            //
+;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws                         //
+;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws                         //
+;vector unsigned int fromRegsConvdtoui(double a, double b,                    //
+;                                      double c, double d) {                  //
+;  return (vector unsigned int) { a, b, c, d };                               //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector unsigned int fromDiffConstsConvdtoui() {                              //
+;  return (vector unsigned int) { 24.46, 234., 988.19, 422.39 };              //
+;}                                                                            //
+;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew,      //
+;//     xvcvspuxws                                                            //
+;// P9: 2 x lxvx, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws          //
+;vector unsigned int fromDiffMemConsAConvdtoui(double *ptr) {                 //
+;  return (vector unsigned int) { ptr[0], ptr[1], ptr[2], ptr[3] };           //
+;}                                                                            //
+;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws              //
+;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws                //
+;vector unsigned int fromDiffMemConsDConvdtoui(double *ptr) {                 //
+;  return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] };           //
+;}                                                                            //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws       //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws         //
+;vector unsigned int fromDiffMemVarAConvdtoui(double *arr, int elem) {        //
+;  return (vector unsigned int) { arr[elem], arr[elem+1],                     //
+;                                 arr[elem+2], arr[elem+3] };                 //
+;}                                                                            //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws       //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws         //
+;vector unsigned int fromDiffMemVarDConvdtoui(double *arr, int elem) {        //
+;  return (vector unsigned int) { arr[elem], arr[elem-1],                     //
+;                                 arr[elem-2], arr[elem-3] };                 //
+;}                                                                            //
+;// P8: xscvdpuxws, xxspltw                                                   //
+;// P9: xscvdpuxws, xxspltw                                                   //
+;vector unsigned int spltRegValConvdtoui(double val) {                        //
+;  return (vector unsigned int) val;                                          //
+;}                                                                            //
+;// P8: lxsdx, xscvdpuxws, xxspltw                                            //
+;// P9: lfd, xscvdpuxws, xxspltw                                              //
+;vector unsigned int spltMemValConvdtoui(double *ptr) {                       //
+;  return (vector unsigned int)*ptr;                                          //
+;}                                                                            //
+;/*=============================== unsigned int ==============================*/
+;/*=============================== long long =================================*/
+;// P8: xxlxor                                                                //
+;// P9: xxlxor                                                                //
+;vector long long allZeroll() {                                               //
+;  return (vector long long)0;                                                //
+;}                                                                            //
+;// P8: vspltisb -1                                                           //
+;// P9: xxspltib 255                                                          //
+;vector long long allOnell() {                                                //
+;  return (vector long long)-1;                                               //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;vector long long spltConst1ll() {                                            //
+;  return (vector long long)1;                                                //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;vector long long spltConst16kll() {                                          //
+;  return (vector long long)((1<<15) - 1);                                    //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;vector long long spltConst32kll() {                                          //
+;  return (vector long long)((1<<16) - 1);                                    //
+;}                                                                            //
+;// P8: 2 x mtvsrd, xxmrghd                                                   //
+;// P9: mtvsrdd                                                               //
+;vector long long fromRegsll(long long a, long long b) {                      //
+;  return (vector long long){ a, b };                                         //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (or even lxv)                                                    //
+;vector long long fromDiffConstsll() {                                        //
+;  return (vector long long) { 242, -113 };                                   //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx                                                                  //
+;vector long long fromDiffMemConsAll(long long *arr) {                        //
+;  return (vector long long) { arr[0], arr[1] };                              //
+;}                                                                            //
+;// P8: lxvd2x                                                                //
+;// P9: lxvx, xxswapd (maybe just use lxvd2x)                                 //
+;vector long long fromDiffMemConsDll(long long *arr) {                        //
+;  return (vector long long) { arr[3], arr[2] };                              //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x, xxswapd                                               //
+;// P9: sldi 3, lxvx                                                          //
+;vector long long fromDiffMemVarAll(long long *arr, int elem) {               //
+;  return (vector long long) { arr[elem], arr[elem+1] };                      //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x                                                        //
+;// P9: sldi 3, lxvx, xxswapd (maybe just use lxvd2x)                         //
+;vector long long fromDiffMemVarDll(long long *arr, int elem) {               //
+;  return (vector long long) { arr[elem], arr[elem-1] };                      //
+;}                                                                            //
+;// P8: 2 x ld, 2 x mtvsrd, xxmrghd                                           //
+;// P9: 2 x ld, mtvsrdd                                                       //
+;vector long long fromRandMemConsll(long long *arr) {                         //
+;  return (vector long long) { arr[4], arr[18] };                             //
+;}                                                                            //
+;// P8: sldi 3, add, 2 x ld, 2 x mtvsrd, xxmrghd                              //
+;// P9: sldi 3, add, 2 x ld, mtvsrdd                                          //
+;vector long long fromRandMemVarll(long long *arr, int elem) {                //
+;  return (vector long long) { arr[elem+4], arr[elem+1] };                    //
+;}                                                                            //
+;// P8: mtvsrd, xxspltd                                                       //
+;// P9: mtvsrdd                                                               //
+;vector long long spltRegValll(long long val) {                               //
+;  return (vector long long) val;                                             //
+;}                                                                            //
+;// P8: lxvdsx                                                                //
+;// P9: lxvdsx                                                                //
+;vector long long spltMemValll(long long *ptr) {                              //
+;  return (vector long long)*ptr;                                             //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;vector long long spltCnstConvftoll() {                                       //
+;  return (vector long long) 4.74f;                                           //
+;}                                                                            //
+;// P8: xxmrghd, xvcvdpsxds                                                   //
+;// P9: xxmrghd, xvcvdpsxds                                                   //
+;vector long long fromRegsConvftoll(float a, float b) {                       //
+;  return (vector long long) { a, b };                                        //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector long long fromDiffConstsConvftoll() {                                 //
+;  return (vector long long) { 24.46f, 234.f };                               //
+;}                                                                            //
+;// P8: 2 x lxsspx, xxmrghd, xvcvdpsxds                                       //
+;// P9: 2 x lxssp, xxmrghd, xvcvdpsxds                                        //
+;vector long long fromDiffMemConsAConvftoll(float *ptr) {                     //
+;  return (vector long long) { ptr[0], ptr[1] };                              //
+;}                                                                            //
+;// P8: 2 x lxsspx, xxmrghd, xvcvdpsxds                                       //
+;// P9: 2 x lxssp, xxmrghd, xvcvdpsxds                                        //
+;vector long long fromDiffMemConsDConvftoll(float *ptr) {                     //
+;  return (vector long long) { ptr[3], ptr[2] };                              //
+;}                                                                            //
+;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpsxds                            //
+;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpsxds                               //
+;vector long long fromDiffMemVarAConvftoll(float *arr, int elem) {            //
+;  return (vector long long) { arr[elem], arr[elem+1] };                      //
+;}                                                                            //
+;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpsxds                            //
+;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpsxds                               //
+;vector long long fromDiffMemVarDConvftoll(float *arr, int elem) {            //
+;  return (vector long long) { arr[elem], arr[elem-1] };                      //
+;}                                                                            //
+;// P8: xscvdpsxds, xxspltd                                                   //
+;// P9: xscvdpsxds, xxspltd                                                   //
+;vector long long spltRegValConvftoll(float val) {                            //
+;  return (vector long long) val;                                             //
+;}                                                                            //
+;// P8: lxsspx, xscvdpsxds, xxspltd                                           //
+;// P9: lfs, xscvdpsxds, xxspltd                                              //
+;vector long long spltMemValConvftoll(float *ptr) {                           //
+;  return (vector long long)*ptr;                                             //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;vector long long spltCnstConvdtoll() {                                       //
+;  return (vector long long) 4.74;                                            //
+;}                                                                            //
+;// P8: xxmrghd, xvcvdpsxds                                                   //
+;// P9: xxmrghd, xvcvdpsxds                                                   //
+;vector long long fromRegsConvdtoll(double a, double b) {                     //
+;  return (vector long long) { a, b };                                        //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector long long fromDiffConstsConvdtoll() {                                 //
+;  return (vector long long) { 24.46, 234. };                                 //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd, xvcvdpsxds                                           //
+;// P9: lxvx, xvcvdpsxds                                                      //
+;vector long long fromDiffMemConsAConvdtoll(double *ptr) {                    //
+;  return (vector long long) { ptr[0], ptr[1] };                              //
+;}                                                                            //
+;// P8: lxvd2x, xvcvdpsxds                                                    //
+;// P9: lxvx, xxswapd, xvcvdpsxds                                             //
+;vector long long fromDiffMemConsDConvdtoll(double *ptr) {                    //
+;  return (vector long long) { ptr[3], ptr[2] };                              //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x, xxswapd, xvcvdpsxds                                   //
+;// P9: sldi 3, lxvx, xvcvdpsxds                                              //
+;vector long long fromDiffMemVarAConvdtoll(double *arr, int elem) {           //
+;  return (vector long long) { arr[elem], arr[elem+1] };                      //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x, xvcvdpsxds                                            //
+;// P9: sldi 3, lxvx, xxswapd, xvcvdpsxds                                     //
+;vector long long fromDiffMemVarDConvdtoll(double *arr, int elem) {           //
+;  return (vector long long) { arr[elem], arr[elem-1] };                      //
+;}                                                                            //
+;// P8: xscvdpsxds, xxspltd                                                   //
+;// P9: xscvdpsxds, xxspltd                                                   //
+;vector long long spltRegValConvdtoll(double val) {                           //
+;  return (vector long long) val;                                             //
+;}                                                                            //
+;// P8: lxvdsx, xvcvdpsxds                                                    //
+;// P9: lxvdsx, xvcvdpsxds                                                    //
+;vector long long spltMemValConvdtoll(double *ptr) {                          //
+;  return (vector long long)*ptr;                                             //
+;}                                                                            //
+;/*=============================== long long =================================*/
+;/*========================== unsigned long long =============================*/
+;// P8: xxlxor                                                                //
+;// P9: xxlxor                                                                //
+;vector unsigned long long allZeroull() {                                     //
+;  return (vector unsigned long long)0;                                       //
+;}                                                                            //
+;// P8: vspltisb -1                                                           //
+;// P9: xxspltib 255                                                          //
+;vector unsigned long long allOneull() {                                      //
+;  return (vector unsigned long long)-1;                                      //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;vector unsigned long long spltConst1ull() {                                  //
+;  return (vector unsigned long long)1;                                       //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;vector unsigned long long spltConst16kull() {                                //
+;  return (vector unsigned long long)((1<<15) - 1);                           //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw, vsrw))      //
+;vector unsigned long long spltConst32kull() {                                //
+;  return (vector unsigned long long)((1<<16) - 1);                           //
+;}                                                                            //
+;// P8: 2 x mtvsrd, xxmrghd                                                   //
+;// P9: mtvsrdd                                                               //
+;vector unsigned long long fromRegsull(unsigned long long a,                  //
+;                                      unsigned long long b) {                //
+;  return (vector unsigned long long){ a, b };                                //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (or even lxv)                                                    //
+;vector unsigned long long fromDiffConstsull() {                              //
+;  return (vector unsigned long long) { 242, -113 };                          //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx                                                                  //
+;vector unsigned long long fromDiffMemConsAull(unsigned long long *arr) {     //
+;  return (vector unsigned long long) { arr[0], arr[1] };                     //
+;}                                                                            //
+;// P8: lxvd2x                                                                //
+;// P9: lxvx, xxswapd (maybe just use lxvd2x)                                 //
+;vector unsigned long long fromDiffMemConsDull(unsigned long long *arr) {     //
+;  return (vector unsigned long long) { arr[3], arr[2] };                     //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x, xxswapd                                               //
+;// P9: sldi 3, lxvx                                                          //
+;vector unsigned long long fromDiffMemVarAull(unsigned long long *arr,        //
+;                                             int elem) {                     //
+;  return (vector unsigned long long) { arr[elem], arr[elem+1] };             //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x                                                        //
+;// P9: sldi 3, lxvx, xxswapd (maybe just use lxvd2x)                         //
+;vector unsigned long long fromDiffMemVarDull(unsigned long long *arr,        //
+;                                             int elem) {                     //
+;  return (vector unsigned long long) { arr[elem], arr[elem-1] };             //
+;}                                                                            //
+;// P8: 2 x ld, 2 x mtvsrd, xxmrghd                                           //
+;// P9: 2 x ld, mtvsrdd                                                       //
+;vector unsigned long long fromRandMemConsull(unsigned long long *arr) {      //
+;  return (vector unsigned long long) { arr[4], arr[18] };                    //
+;}                                                                            //
+;// P8: sldi 3, add, 2 x ld, 2 x mtvsrd, xxmrghd                              //
+;// P9: sldi 3, add, 2 x ld, mtvsrdd                                          //
+;vector unsigned long long fromRandMemVarull(unsigned long long *arr,         //
+;                                            int elem) {                      //
+;  return (vector unsigned long long) { arr[elem+4], arr[elem+1] };           //
+;}                                                                            //
+;// P8: mtvsrd, xxspltd                                                       //
+;// P9: mtvsrdd                                                               //
+;vector unsigned long long spltRegValull(unsigned long long val) {            //
+;  return (vector unsigned long long) val;                                    //
+;}                                                                            //
+;// P8: lxvdsx                                                                //
+;// P9: lxvdsx                                                                //
+;vector unsigned long long spltMemValull(unsigned long long *ptr) {           //
+;  return (vector unsigned long long)*ptr;                                    //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;vector unsigned long long spltCnstConvftoull() {                             //
+;  return (vector unsigned long long) 4.74f;                                  //
+;}                                                                            //
+;// P8: xxmrghd, xvcvdpuxds                                                   //
+;// P9: xxmrghd, xvcvdpuxds                                                   //
+;vector unsigned long long fromRegsConvftoull(float a, float b) {             //
+;  return (vector unsigned long long) { a, b };                               //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector unsigned long long fromDiffConstsConvftoull() {                       //
+;  return (vector unsigned long long) { 24.46f, 234.f };                      //
+;}                                                                            //
+;// P8: 2 x lxsspx, xxmrghd, xvcvdpuxds                                       //
+;// P9: 2 x lxssp, xxmrghd, xvcvdpuxds                                        //
+;vector unsigned long long fromDiffMemConsAConvftoull(float *ptr) {           //
+;  return (vector unsigned long long) { ptr[0], ptr[1] };                     //
+;}                                                                            //
+;// P8: 2 x lxsspx, xxmrghd, xvcvdpuxds                                       //
+;// P9: 2 x lxssp, xxmrghd, xvcvdpuxds                                        //
+;vector unsigned long long fromDiffMemConsDConvftoull(float *ptr) {           //
+;  return (vector unsigned long long) { ptr[3], ptr[2] };                     //
+;}                                                                            //
+;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpuxds                            //
+;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpuxds                               //
+;vector unsigned long long fromDiffMemVarAConvftoull(float *arr, int elem) {  //
+;  return (vector unsigned long long) { arr[elem], arr[elem+1] };             //
+;}                                                                            //
+;// P8: sldi 2, lfsux, lxsspx, xxmrghd, xvcvdpuxds                            //
+;// P9: sldi 2, lfsux, lfs, xxmrghd, xvcvdpuxds                               //
+;vector unsigned long long fromDiffMemVarDConvftoull(float *arr, int elem) {  //
+;  return (vector unsigned long long) { arr[elem], arr[elem-1] };             //
+;}                                                                            //
+;// P8: xscvdpuxds, xxspltd                                                   //
+;// P9: xscvdpuxds, xxspltd                                                   //
+;vector unsigned long long spltRegValConvftoull(float val) {                  //
+;  return (vector unsigned long long) val;                                    //
+;}                                                                            //
+;// P8: lxsspx, xscvdpuxds, xxspltd                                           //
+;// P9: lfs, xscvdpuxds, xxspltd                                              //
+;vector unsigned long long spltMemValConvftoull(float *ptr) {                 //
+;  return (vector unsigned long long)*ptr;                                    //
+;}                                                                            //
+;// P8: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;// P9: constant pool load (possible: vmrgew (xxlxor), (vspltisw))            //
+;vector unsigned long long spltCnstConvdtoull() {                             //
+;  return (vector unsigned long long) 4.74;                                   //
+;}                                                                            //
+;// P8: xxmrghd, xvcvdpuxds                                                   //
+;// P9: xxmrghd, xvcvdpuxds                                                   //
+;vector unsigned long long fromRegsConvdtoull(double a, double b) {           //
+;  return (vector unsigned long long) { a, b };                               //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd                                                       //
+;// P9: lxvx (even lxv)                                                       //
+;vector unsigned long long fromDiffConstsConvdtoull() {                       //
+;  return (vector unsigned long long) { 24.46, 234. };                        //
+;}                                                                            //
+;// P8: lxvd2x, xxswapd, xvcvdpuxds                                           //
+;// P9: lxvx, xvcvdpuxds                                                      //
+;vector unsigned long long fromDiffMemConsAConvdtoull(double *ptr) {          //
+;  return (vector unsigned long long) { ptr[0], ptr[1] };                     //
+;}                                                                            //
+;// P8: lxvd2x, xvcvdpuxds                                                    //
+;// P9: lxvx, xxswapd, xvcvdpuxds                                             //
+;vector unsigned long long fromDiffMemConsDConvdtoull(double *ptr) {          //
+;  return (vector unsigned long long) { ptr[3], ptr[2] };                     //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x, xxswapd, xvcvdpuxds                                   //
+;// P9: sldi 3, lxvx, xvcvdpuxds                                              //
+;vector unsigned long long fromDiffMemVarAConvdtoull(double *arr, int elem) { //
+;  return (vector unsigned long long) { arr[elem], arr[elem+1] };             //
+;}                                                                            //
+;// P8: sldi 3, lxvd2x, xvcvdpuxds                                            //
+;// P9: sldi 3, lxvx, xxswapd, xvcvdpuxds                                     //
+;vector unsigned long long fromDiffMemVarDConvdtoull(double *arr, int elem) { //
+;  return (vector unsigned long long) { arr[elem], arr[elem-1] };             //
+;}                                                                            //
+;// P8: xscvdpuxds, xxspltd                                                   //
+;// P9: xscvdpuxds, xxspltd                                                   //
+;vector unsigned long long spltRegValConvdtoull(double val) {                 //
+;  return (vector unsigned long long) val;                                    //
+;}                                                                            //
+;// P8: lxvdsx, xvcvdpuxds                                                    //
+;// P9: lxvdsx, xvcvdpuxds                                                    //
+;vector unsigned long long spltMemValConvdtoull(double *ptr) {                //
+;  return (vector unsigned long long)*ptr;                                    //
+;}                                                                            //
+;/*========================== unsigned long long =============================*/
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @allZeroi() {
+entry:
+  ret <4 x i32> zeroinitializer
+; P9BE-LABEL: allZeroi
+; P9LE-LABEL: allZeroi
+; P8BE-LABEL: allZeroi
+; P8LE-LABEL: allZeroi
+; P9BE: xxlxor v2, v2, v2
+; P9BE: blr
+; P9LE: xxlxor v2, v2, v2
+; P9LE: blr
+; P8BE: xxlxor v2, v2, v2
+; P8BE: blr
+; P8LE: xxlxor v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @allOnei() {
+entry:
+  ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+; P9BE-LABEL: allOnei
+; P9LE-LABEL: allOnei
+; P8BE-LABEL: allOnei
+; P8LE-LABEL: allOnei
+; P9BE: xxspltib v2, 255
+; P9BE: blr
+; P9LE: xxspltib v2, 255
+; P9LE: blr
+; P8BE: vspltisb v2, -1
+; P8BE: blr
+; P8LE: vspltisb v2, -1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltConst1i() {
+entry:
+  ret <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; P9BE-LABEL: spltConst1i
+; P9LE-LABEL: spltConst1i
+; P8BE-LABEL: spltConst1i
+; P8LE-LABEL: spltConst1i
+; P9BE: vspltisw v2, 1
+; P9BE: blr
+; P9LE: vspltisw v2, 1
+; P9LE: blr
+; P8BE: vspltisw v2, 1
+; P8BE: blr
+; P8LE: vspltisw v2, 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltConst16ki() {
+entry:
+  ret <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
+; P9BE-LABEL: spltConst16ki
+; P9LE-LABEL: spltConst16ki
+; P8BE-LABEL: spltConst16ki
+; P8LE-LABEL: spltConst16ki
+; P9BE: vspltisw v2, -15
+; P9BE: vsrw v2, v2, v2
+; P9BE: blr
+; P9LE: vspltisw v2, -15
+; P9LE: vsrw v2, v2, v2
+; P9LE: blr
+; P8BE: vspltisw v2, -15
+; P8BE: vsrw v2, v2, v2
+; P8BE: blr
+; P8LE: vspltisw v2, -15
+; P8LE: vsrw v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltConst32ki() {
+entry:
+  ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+; P9BE-LABEL: spltConst32ki
+; P9LE-LABEL: spltConst32ki
+; P8BE-LABEL: spltConst32ki
+; P8LE-LABEL: spltConst32ki
+; P9BE: vspltisw v2, -16
+; P9BE: vsrw v2, v2, v2
+; P9BE: blr
+; P9LE: vspltisw v2, -16
+; P9LE: vsrw v2, v2, v2
+; P9LE: blr
+; P8BE: vspltisw v2, -16
+; P8BE: vsrw v2, v2, v2
+; P8BE: blr
+; P8LE: vspltisw v2, -16
+; P8LE: vsrw v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromRegsi(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d) {
+entry:
+  %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
+  %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %c, i32 2
+  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %d, i32 3
+  ret <4 x i32> %vecinit3
+; P9BE-LABEL: fromRegsi
+; P9LE-LABEL: fromRegsi
+; P8BE-LABEL: fromRegsi
+; P8LE-LABEL: fromRegsi
+; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5
+; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6
+; P9BE: vmrgow v2, [[REG1]], [[REG2]]
+; P9BE: blr
+; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3
+; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4
+; P9LE: vmrgow v2, [[REG2]], [[REG1]]
+; P9LE: blr
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
+; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]]
+; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]]
+; P8BE: vmrgow v2, [[REG5]], [[REG6]]
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
+; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]]
+; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]]
+; P8LE: vmrgow v2, [[REG6]], [[REG5]]
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromDiffConstsi() {
+entry:
+  ret <4 x i32> <i32 242, i32 -113, i32 889, i32 19>
+; P9BE-LABEL: fromDiffConstsi
+; P9LE-LABEL: fromDiffConstsi
+; P8BE-LABEL: fromDiffConstsi
+; P8LE-LABEL: fromDiffConstsi
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsAi(i32* nocapture readonly %arr) {
+entry:
+  %0 = load i32, i32* %arr, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 1
+  %1 = load i32, i32* %arrayidx1, align 4
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2
+  %2 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 3
+  %3 = load i32, i32* %arrayidx5, align 4
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromDiffMemConsAi
+; P9LE-LABEL: fromDiffMemConsAi
+; P8BE-LABEL: fromDiffMemConsAi
+; P8LE-LABEL: fromDiffMemConsAi
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsDi(i32* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
+  %1 = load i32, i32* %arrayidx1, align 4
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 1
+  %2 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2
+  %3 = load i32, i32* %arr, align 4
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromDiffMemConsDi
+; P9LE-LABEL: fromDiffMemConsDi
+; P8BE-LABEL: fromDiffMemConsDi
+; P8LE-LABEL: fromDiffMemConsDi
+; P9BE: lxvx
+; P9BE: lxvx
+; P9BE: vperm
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: lxvx
+; P9LE: vperm
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: lxvw4x
+; P8BE: vperm
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: xxswapd
+; P8LE: xxswapd
+; P8LE: vperm
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarAi(i32* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %add4 = add nsw i32 %elem, 2
+  %idxprom5 = sext i32 %add4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5
+  %2 = load i32, i32* %arrayidx6, align 4
+  %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2
+  %add8 = add nsw i32 %elem, 3
+  %idxprom9 = sext i32 %add8 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9
+  %3 = load i32, i32* %arrayidx10, align 4
+  %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3
+  ret <4 x i32> %vecinit11
+; P9BE-LABEL: fromDiffMemVarAi
+; P9LE-LABEL: fromDiffMemVarAi
+; P8BE-LABEL: fromDiffMemVarAi
+; P8LE-LABEL: fromDiffMemVarAi
+; P9BE: sldi r4, r4, 2
+; P9BE: lxvx v2, r3, r4
+; P9BE: blr
+; P9LE: sldi r4, r4, 2
+; P9LE: lxvx v2, r3, r4
+; P9LE: blr
+; P8BE: sldi r4, r4, 2
+; P8BE: lxvw4x {{[vs0-9]+}}, r3, r4
+; P8BE: blr
+; P8LE: sldi r4, r4, 2
+; P8LE: lxvd2x {{[vs0-9]+}}, r3, r4
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarDi(i32* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %sub4 = add nsw i32 %elem, -2
+  %idxprom5 = sext i32 %sub4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5
+  %2 = load i32, i32* %arrayidx6, align 4
+  %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2
+  %sub8 = add nsw i32 %elem, -3
+  %idxprom9 = sext i32 %sub8 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9
+  %3 = load i32, i32* %arrayidx10, align 4
+  %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3
+  ret <4 x i32> %vecinit11
+; P9BE-LABEL: fromDiffMemVarDi
+; P9LE-LABEL: fromDiffMemVarDi
+; P8BE-LABEL: fromDiffMemVarDi
+; P8LE-LABEL: fromDiffMemVarDi
+; P9BE: sldi r4, r4, 2
+; P9BE-DAG: lxvx {{[vs0-9]+}}, r3, r4
+; P9BE-DAG: lxvx
+; P9BE: vperm
+; P9BE: blr
+; P9LE: sldi r4, r4, 2
+; P9LE-DAG: lxvx {{[vs0-9]+}}, r3, r4
+; P9LE-DAG: lxvx
+; P9LE: vperm
+; P9LE: blr
+; P8BE: sldi r4, r4, 2
+; P8BE-DAG: lxvw4x {{[vs0-9]+}}, r3, r4
+; P8BE-DAG: lxvw4x
+; P8BE: vperm
+; P8BE: blr
+; P8LE: sldi r4, r4, 2
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: xxswapd
+; P8LE: xxswapd
+; P8LE: vperm
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromRandMemConsi(i32* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 4
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 18
+  %1 = load i32, i32* %arrayidx1, align 4
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2
+  %2 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 88
+  %3 = load i32, i32* %arrayidx5, align 4
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromRandMemConsi
+; P9LE-LABEL: fromRandMemConsi
+; P8BE-LABEL: fromRandMemConsi
+; P8LE-LABEL: fromRandMemConsi
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: mtvsrdd
+; P9BE: mtvsrdd
+; P9BE: vmrgow
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: mtvsrdd
+; P9LE: mtvsrdd
+; P9LE: vmrgow
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: vmrgow
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: vmrgow
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromRandMemVari(i32* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %add = add nsw i32 %elem, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %add1 = add nsw i32 %elem, 1
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 %idxprom2
+  %1 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %add5 = add nsw i32 %elem, 2
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds i32, i32* %arr, i64 %idxprom6
+  %2 = load i32, i32* %arrayidx7, align 4
+  %vecinit8 = insertelement <4 x i32> %vecinit4, i32 %2, i32 2
+  %add9 = add nsw i32 %elem, 8
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %arr, i64 %idxprom10
+  %3 = load i32, i32* %arrayidx11, align 4
+  %vecinit12 = insertelement <4 x i32> %vecinit8, i32 %3, i32 3
+  ret <4 x i32> %vecinit12
+; P9BE-LABEL: fromRandMemVari
+; P9LE-LABEL: fromRandMemVari
+; P8BE-LABEL: fromRandMemVari
+; P8LE-LABEL: fromRandMemVari
+; P9BE: sldi r4, r4, 2
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: mtvsrdd
+; P9BE: mtvsrdd
+; P9BE: vmrgow
+; P9LE: sldi r4, r4, 2
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: mtvsrdd
+; P9LE: mtvsrdd
+; P9LE: vmrgow
+; P8BE: sldi r4, r4, 2
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: vmrgow
+; P8LE: sldi r4, r4, 2
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: vmrgow
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltRegVali(i32 signext %val) {
+entry:
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %val, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltRegVali
+; P9LE-LABEL: spltRegVali
+; P8BE-LABEL: spltRegVali
+; P8LE-LABEL: spltRegVali
+; P9BE: mtvsrws v2, r3
+; P9BE: blr
+; P9LE: mtvsrws v2, r3
+; P9LE: blr
+; P8BE: mtvsrwz {{[vsf0-9]+}}, r3
+; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8BE: blr
+; P8LE: mtvsrwz {{[vsf0-9]+}}, r3
+; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @spltMemVali(i32* nocapture readonly %ptr) {
+entry:
+  %0 = load i32, i32* %ptr, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltMemVali
+; P9LE-LABEL: spltMemVali
+; P8BE-LABEL: spltMemVali
+; P8LE-LABEL: spltMemVali
+; P9BE: lxvwsx v2, 0, r3
+; P9BE: blr
+; P9LE: lxvwsx v2, 0, r3
+; P9LE: blr
+; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3
+; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8BE: blr
+; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3
+; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltCnstConvftoi() {
+entry:
+  ret <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+; P9BE-LABEL: spltCnstConvftoi
+; P9LE-LABEL: spltCnstConvftoi
+; P8BE-LABEL: spltCnstConvftoi
+; P8LE-LABEL: spltCnstConvftoi
+; P9BE: vspltisw v2, 4
+; P9BE: blr
+; P9LE: vspltisw v2, 4
+; P9LE: blr
+; P8BE: vspltisw v2, 4
+; P8BE: blr
+; P8LE: vspltisw v2, 4
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
+entry:
+  %conv = fptosi float %a to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %conv1 = fptosi float %b to i32
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1
+  %conv3 = fptosi float %c to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2
+  %conv5 = fptosi float %d to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromRegsConvftoi
+; P9LE-LABEL: fromRegsConvftoi
+; P8BE-LABEL: fromRegsConvftoi
+; P8LE-LABEL: fromRegsConvftoi
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE: vmrgew v2, [[REG3]], [[REG4]]
+; P9BE: xvcvspsxws v2, v2
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE: vmrgew v2, [[REG4]], [[REG3]]
+; P9LE: xvcvspsxws v2, v2
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE: vmrgew v2, [[REG3]], [[REG4]]
+; P8BE: xvcvspsxws v2, v2
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE: vmrgew v2, [[REG4]], [[REG3]]
+; P8LE: xvcvspsxws v2, v2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromDiffConstsConvftoi() {
+entry:
+  ret <4 x i32> <i32 24, i32 234, i32 988, i32 422>
+; P9BE-LABEL: fromDiffConstsConvftoi
+; P9LE-LABEL: fromDiffConstsConvftoi
+; P8BE-LABEL: fromDiffConstsConvftoi
+; P8LE-LABEL: fromDiffConstsConvftoi
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsAConvftoi(float* nocapture readonly %ptr) {
+entry:
+  %0 = bitcast float* %ptr to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = fptosi <4 x float> %1 to <4 x i32>
+  ret <4 x i32> %2
+; P9BE-LABEL: fromDiffMemConsAConvftoi
+; P9LE-LABEL: fromDiffMemConsAConvftoi
+; P8BE-LABEL: fromDiffMemConsAConvftoi
+; P8LE-LABEL: fromDiffMemConsAConvftoi
+; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: xvcvspsxws v2, [[REG1]]
+; P9BE: blr
+; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: xvcvspsxws v2, [[REG1]]
+; P9LE: blr
+; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3
+; P8BE: xvcvspsxws v2, [[REG1]]
+; P8BE: blr
+; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
+; P8LE: xxswapd v2, [[REG1]]
+; P8LE: xvcvspsxws v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsDConvftoi(float* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 3
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptosi float %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2
+  %1 = load float, float* %arrayidx1, align 4
+  %conv2 = fptosi float %1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 1
+  %2 = load float, float* %arrayidx4, align 4
+  %conv5 = fptosi float %2 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %3 = load float, float* %ptr, align 4
+  %conv8 = fptosi float %3 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+; P9BE-LABEL: fromDiffMemConsDConvftoi
+; P9LE-LABEL: fromDiffMemConsDConvftoi
+; P8BE-LABEL: fromDiffMemConsDConvftoi
+; P8LE-LABEL: fromDiffMemConsDConvftoi
+; P9BE: lxvx
+; P9BE: lxvx
+; P9BE: vperm
+; P9BE: xvcvspsxws
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: lxvx
+; P9LE: vperm
+; P9LE: xvcvspsxws
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: lxvw4x
+; P8BE: vperm
+; P8BE: xvcvspsxws
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: xxswapd
+; P8LE: xxswapd
+; P8LE: vperm
+; P8LE: xvcvspsxws
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarAConvftoi(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptosi float %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptosi float %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %add5 = add nsw i32 %elem, 2
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6
+  %2 = load float, float* %arrayidx7, align 4
+  %conv8 = fptosi float %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %add10 = add nsw i32 %elem, 3
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11
+  %3 = load float, float* %arrayidx12, align 4
+  %conv13 = fptosi float %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarAConvftoi
+; P9LE-LABEL: fromDiffMemVarAConvftoi
+; P8BE-LABEL: fromDiffMemVarAConvftoi
+; P8LE-LABEL: fromDiffMemVarAConvftoi
+; FIXME: implement finding consecutive loads with pre-inc
+; P9BE: lfsux
+; P9LE: lfsux
+; P8BE: lfsux
+; P8LE: lfsux
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarDConvftoi(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptosi float %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptosi float %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %sub5 = add nsw i32 %elem, -2
+  %idxprom6 = sext i32 %sub5 to i64
+  %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6
+  %2 = load float, float* %arrayidx7, align 4
+  %conv8 = fptosi float %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %sub10 = add nsw i32 %elem, -3
+  %idxprom11 = sext i32 %sub10 to i64
+  %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11
+  %3 = load float, float* %arrayidx12, align 4
+  %conv13 = fptosi float %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarDConvftoi
+; P9LE-LABEL: fromDiffMemVarDConvftoi
+; P8BE-LABEL: fromDiffMemVarDConvftoi
+; P8LE-LABEL: fromDiffMemVarDConvftoi
+; FIXME: implement finding consecutive loads with pre-inc
+; P9BE: lfsux
+; P9LE: lfsux
+; P8BE: lfsux
+; P8LE: lfsux
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltRegValConvftoi(float %val) {
+entry:
+  %conv = fptosi float %val to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltRegValConvftoi
+; P9LE-LABEL: spltRegValConvftoi
+; P8BE-LABEL: spltRegValConvftoi
+; P8LE-LABEL: spltRegValConvftoi
+; P9BE: xscvdpsxws f[[REG1:[0-9]+]], f1
+; P9BE: xxspltw v2, vs[[REG1]], 1
+; P9BE: blr
+; P9LE: xscvdpsxws f[[REG1:[0-9]+]], f1
+; P9LE: xxspltw v2, vs[[REG1]], 1
+; P9LE: blr
+; P8BE: xscvdpsxws f[[REG1:[0-9]+]], f1
+; P8BE: xxspltw v2, vs[[REG1]], 1
+; P8BE: blr
+; P8LE: xscvdpsxws f[[REG1:[0-9]+]], f1
+; P8LE: xxspltw v2, vs[[REG1]], 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @spltMemValConvftoi(float* nocapture readonly %ptr) {
+entry:
+  %0 = load float, float* %ptr, align 4
+  %conv = fptosi float %0 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltMemValConvftoi
+; P9LE-LABEL: spltMemValConvftoi
+; P8BE-LABEL: spltMemValConvftoi
+; P8LE-LABEL: spltMemValConvftoi
+; P9BE: lxvwsx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: xvcvspsxws v2, [[REG1]]
+; P9LE: lxvwsx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: xvcvspsxws v2, [[REG1]]
+; P8BE: lxsspx [[REG1:f[0-9]+]], 0, r3
+; P8BE: xscvdpsxws f[[REG2:[0-9]+]], [[REG1]]
+; P8BE: xxspltw v2, vs[[REG2]], 1
+; P8LE: lxsspx [[REG1:f[0-9]+]], 0, r3
+; P8LE: xscvdpsxws f[[REG2:[vs0-9]+]], [[REG1]]
+; P8LE: xxspltw v2, vs[[REG2]], 1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltCnstConvdtoi() {
+entry:
+  ret <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+; P9BE-LABEL: spltCnstConvdtoi
+; P9LE-LABEL: spltCnstConvdtoi
+; P8BE-LABEL: spltCnstConvdtoi
+; P8LE-LABEL: spltCnstConvdtoi
+; P9BE: vspltisw v2, 4
+; P9BE: blr
+; P9LE: vspltisw v2, 4
+; P9LE: blr
+; P8BE: vspltisw v2, 4
+; P8BE: blr
+; P8LE: vspltisw v2, 4
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
+entry:
+  %conv = fptosi double %a to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %conv1 = fptosi double %b to i32
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1
+  %conv3 = fptosi double %c to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2
+  %conv5 = fptosi double %d to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromRegsConvdtoi
+; P9LE-LABEL: fromRegsConvdtoi
+; P8BE-LABEL: fromRegsConvdtoi
+; P8LE-LABEL: fromRegsConvdtoi
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE: vmrgew v2, [[REG3]], [[REG4]]
+; P9BE: xvcvspsxws v2, v2
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE: vmrgew v2, [[REG4]], [[REG3]]
+; P9LE: xvcvspsxws v2, v2
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE: vmrgew v2, [[REG3]], [[REG4]]
+; P8BE: xvcvspsxws v2, v2
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE: vmrgew v2, [[REG4]], [[REG3]]
+; P8LE: xvcvspsxws v2, v2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromDiffConstsConvdtoi() {
+entry:
+  ret <4 x i32> <i32 24, i32 234, i32 988, i32 422>
+; P9BE-LABEL: fromDiffConstsConvdtoi
+; P9LE-LABEL: fromDiffConstsConvdtoi
+; P8BE-LABEL: fromDiffConstsConvdtoi
+; P8LE-LABEL: fromDiffConstsConvdtoi
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsAConvdtoi(double* nocapture readonly %ptr) {
+entry:
+  %0 = bitcast double* %ptr to <2 x double>*
+  %1 = load <2 x double>, <2 x double>* %0, align 8
+  %2 = fptosi <2 x double> %1 to <2 x i32>
+  %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 2
+  %3 = bitcast double* %arrayidx4 to <2 x double>*
+  %4 = load <2 x double>, <2 x double>* %3, align 8
+  %5 = fptosi <2 x double> %4 to <2 x i32>
+  %vecinit9 = shufflevector <2 x i32> %2, <2 x i32> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecinit9
+; P9BE-LABEL: fromDiffMemConsAConvdtoi
+; P9LE-LABEL: fromDiffMemConsAConvdtoi
+; P8BE-LABEL: fromDiffMemConsAConvdtoi
+; P8LE-LABEL: fromDiffMemConsAConvdtoi
+; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
+; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
+; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
+; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9BE: vmrgew v2, [[REG6]], [[REG5]]
+; P9BE: xvcvspsxws v2, v2
+; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]]
+; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]]
+; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
+; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9LE: vmrgew v2, [[REG6]], [[REG5]]
+; P9LE: xvcvspsxws v2, v2
+; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
+; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
+; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
+; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
+; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
+; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P8BE: vmrgew v2, [[REG6]], [[REG5]]
+; P8BE: xvcvspsxws v2, v2
+; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
+; P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
+; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]]
+; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]]
+; P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]]
+; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]]
+; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]]
+; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]]
+; P8LE: vmrgew v2, [[REG8]], [[REG7]]
+; P8LE: xvcvspsxws v2, v2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsDConvdtoi(double* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds double, double* %ptr, i64 3
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2
+  %1 = load double, double* %arrayidx1, align 8
+  %conv2 = fptosi double %1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 1
+  %2 = load double, double* %arrayidx4, align 8
+  %conv5 = fptosi double %2 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %3 = load double, double* %ptr, align 8
+  %conv8 = fptosi double %3 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+; P9BE-LABEL: fromDiffMemConsDConvdtoi
+; P9LE-LABEL: fromDiffMemConsDConvdtoi
+; P8BE-LABEL: fromDiffMemConsDConvdtoi
+; P8LE-LABEL: fromDiffMemConsDConvdtoi
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: xxmrghd
+; P9BE: xxmrghd
+; P9BE: xvcvdpsp
+; P9BE: xvcvdpsp
+; P9BE: vmrgew
+; P9BE: xvcvspsxws v2
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: xxmrghd
+; P9LE: xxmrghd
+; P9LE: xvcvdpsp
+; P9LE: xvcvdpsp
+; P9LE: vmrgew
+; P9LE: xvcvspsxws v2
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: xvcvdpsp
+; P8BE: xvcvdpsp
+; P8BE: vmrgew
+; P8BE: xvcvspsxws v2
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: xvcvdpsp
+; P8LE: xvcvdpsp
+; P8LE: vmrgew
+; P8LE: xvcvspsxws v2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarAConvdtoi(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptosi double %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %add5 = add nsw i32 %elem, 2
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6
+  %2 = load double, double* %arrayidx7, align 8
+  %conv8 = fptosi double %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %add10 = add nsw i32 %elem, 3
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11
+  %3 = load double, double* %arrayidx12, align 8
+  %conv13 = fptosi double %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarAConvdtoi
+; P9LE-LABEL: fromDiffMemVarAConvdtoi
+; P8BE-LABEL: fromDiffMemVarAConvdtoi
+; P8LE-LABEL: fromDiffMemVarAConvdtoi
+; P9BE: lfdux
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: xxmrghd
+; P9BE: xxmrghd
+; P9BE: xvcvdpsp
+; P9BE: xvcvdpsp
+; P9BE: vmrgew
+; P9BE: xvcvspsxws v2
+; P9LE: lfdux
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: xxmrghd
+; P9LE: xxmrghd
+; P9LE: xvcvdpsp
+; P9LE: xvcvdpsp
+; P9LE: vmrgew
+; P9LE: xvcvspsxws v2
+; P8BE: lfdux
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: xvcvdpsp
+; P8BE: xvcvdpsp
+; P8BE: vmrgew
+; P8BE: xvcvspsxws v2
+; P8LE: lfdux
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: xvcvdpsp
+; P8LE: xvcvdpsp
+; P8LE: vmrgew
+; P8LE: xvcvspsxws v2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarDConvdtoi(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptosi double %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %sub5 = add nsw i32 %elem, -2
+  %idxprom6 = sext i32 %sub5 to i64
+  %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6
+  %2 = load double, double* %arrayidx7, align 8
+  %conv8 = fptosi double %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %sub10 = add nsw i32 %elem, -3
+  %idxprom11 = sext i32 %sub10 to i64
+  %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11
+  %3 = load double, double* %arrayidx12, align 8
+  %conv13 = fptosi double %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarDConvdtoi
+; P9LE-LABEL: fromDiffMemVarDConvdtoi
+; P8BE-LABEL: fromDiffMemVarDConvdtoi
+; P8LE-LABEL: fromDiffMemVarDConvdtoi
+; P9BE: lfdux
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: xxmrghd
+; P9BE: xxmrghd
+; P9BE: xvcvdpsp
+; P9BE: xvcvdpsp
+; P9BE: vmrgew
+; P9BE: xvcvspsxws v2
+; P9LE: lfdux
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: xxmrghd
+; P9LE: xxmrghd
+; P9LE: xvcvdpsp
+; P9LE: xvcvdpsp
+; P9LE: vmrgew
+; P9LE: xvcvspsxws v2
+; P8BE: lfdux
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: xvcvdpsp
+; P8BE: xvcvdpsp
+; P8BE: vmrgew
+; P8BE: xvcvspsxws v2
+; P8LE: lfdux
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: xvcvdpsp
+; P8LE: xvcvdpsp
+; P8LE: vmrgew
+; P8LE: xvcvspsxws v2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltRegValConvdtoi(double %val) {
+entry:
+  %conv = fptosi double %val to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltRegValConvdtoi
+; P9LE-LABEL: spltRegValConvdtoi
+; P8BE-LABEL: spltRegValConvdtoi
+; P8LE-LABEL: spltRegValConvdtoi
+; P9BE: xscvdpsxws
+; P9BE: xxspltw
+; P9BE: blr
+; P9LE: xscvdpsxws
+; P9LE: xxspltw
+; P9LE: blr
+; P8BE: xscvdpsxws
+; P8BE: xxspltw
+; P8BE: blr
+; P8LE: xscvdpsxws
+; P8LE: xxspltw
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @spltMemValConvdtoi(double* nocapture readonly %ptr) {
+entry:
+  %0 = load double, double* %ptr, align 8
+  %conv = fptosi double %0 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltMemValConvdtoi
+; P9LE-LABEL: spltMemValConvdtoi
+; P8BE-LABEL: spltMemValConvdtoi
+; P8LE-LABEL: spltMemValConvdtoi
+; P9BE: lfd
+; P9BE: xscvdpsxws
+; P9BE: xxspltw
+; P9BE: blr
+; P9LE: lfd
+; P9LE: xscvdpsxws
+; P9LE: xxspltw
+; P9LE: blr
+; P8BE: lxsdx
+; P8BE: xscvdpsxws
+; P8BE: xxspltw
+; P8BE: blr
+; P8LE: lxsdx
+; P8LE: xscvdpsxws
+; P8LE: xxspltw
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @allZeroui() {
+entry:
+  ret <4 x i32> zeroinitializer
+; P9BE-LABEL: allZeroui
+; P9LE-LABEL: allZeroui
+; P8BE-LABEL: allZeroui
+; P8LE-LABEL: allZeroui
+; P9BE: xxlxor v2, v2, v2
+; P9BE: blr
+; P9LE: xxlxor v2, v2, v2
+; P9LE: blr
+; P8BE: xxlxor v2, v2, v2
+; P8BE: blr
+; P8LE: xxlxor v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @allOneui() {
+entry:
+  ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+; P9BE-LABEL: allOneui
+; P9LE-LABEL: allOneui
+; P8BE-LABEL: allOneui
+; P8LE-LABEL: allOneui
+; P9BE: xxspltib v2, 255
+; P9BE: blr
+; P9LE: xxspltib v2, 255
+; P9LE: blr
+; P8BE: vspltisb v2, -1
+; P8BE: blr
+; P8LE: vspltisb v2, -1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltConst1ui() {
+entry:
+  ret <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; P9BE-LABEL: spltConst1ui
+; P9LE-LABEL: spltConst1ui
+; P8BE-LABEL: spltConst1ui
+; P8LE-LABEL: spltConst1ui
+; P9BE: vspltisw v2, 1
+; P9BE: blr
+; P9LE: vspltisw v2, 1
+; P9LE: blr
+; P8BE: vspltisw v2, 1
+; P8BE: blr
+; P8LE: vspltisw v2, 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltConst16kui() {
+entry:
+  ret <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
+; P9BE-LABEL: spltConst16kui
+; P9LE-LABEL: spltConst16kui
+; P8BE-LABEL: spltConst16kui
+; P8LE-LABEL: spltConst16kui
+; P9BE: vspltisw v2, -15
+; P9BE: vsrw v2, v2, v2
+; P9BE: blr
+; P9LE: vspltisw v2, -15
+; P9LE: vsrw v2, v2, v2
+; P9LE: blr
+; P8BE: vspltisw v2, -15
+; P8BE: vsrw v2, v2, v2
+; P8BE: blr
+; P8LE: vspltisw v2, -15
+; P8LE: vsrw v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltConst32kui() {
+entry:
+  ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+; P9BE-LABEL: spltConst32kui
+; P9LE-LABEL: spltConst32kui
+; P8BE-LABEL: spltConst32kui
+; P8LE-LABEL: spltConst32kui
+; P9BE: vspltisw v2, -16
+; P9BE: vsrw v2, v2, v2
+; P9BE: blr
+; P9LE: vspltisw v2, -16
+; P9LE: vsrw v2, v2, v2
+; P9LE: blr
+; P8BE: vspltisw v2, -16
+; P8BE: vsrw v2, v2, v2
+; P8BE: blr
+; P8LE: vspltisw v2, -16
+; P8LE: vsrw v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromRegsui(i32 zeroext %a, i32 zeroext %b, i32 zeroext %c, i32 zeroext %d) {
+entry:
+  %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
+  %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %c, i32 2
+  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %d, i32 3
+  ret <4 x i32> %vecinit3
+; P9BE-LABEL: fromRegsui
+; P9LE-LABEL: fromRegsui
+; P8BE-LABEL: fromRegsui
+; P8LE-LABEL: fromRegsui
+; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5
+; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6
+; P9BE: vmrgow v2, [[REG1]], [[REG2]]
+; P9BE: blr
+; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3
+; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4
+; P9LE: vmrgow v2, [[REG2]], [[REG1]]
+; P9LE: blr
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
+; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
+; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]]
+; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]]
+; P8BE: vmrgow v2, [[REG5]], [[REG6]]
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
+; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
+; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]]
+; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]]
+; P8LE: vmrgow v2, [[REG6]], [[REG5]]
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromDiffConstsui() {
+entry:
+  ret <4 x i32> <i32 242, i32 -113, i32 889, i32 19>
+; P9BE-LABEL: fromDiffConstsui
+; P9LE-LABEL: fromDiffConstsui
+; P8BE-LABEL: fromDiffConstsui
+; P8LE-LABEL: fromDiffConstsui
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsAui(i32* nocapture readonly %arr) {
+entry:
+  %0 = load i32, i32* %arr, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 1
+  %1 = load i32, i32* %arrayidx1, align 4
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2
+  %2 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 3
+  %3 = load i32, i32* %arrayidx5, align 4
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromDiffMemConsAui
+; P9LE-LABEL: fromDiffMemConsAui
+; P8BE-LABEL: fromDiffMemConsAui
+; P8LE-LABEL: fromDiffMemConsAui
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsDui(i32* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
+  %1 = load i32, i32* %arrayidx1, align 4
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 1
+  %2 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2
+  %3 = load i32, i32* %arr, align 4
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromDiffMemConsDui
+; P9LE-LABEL: fromDiffMemConsDui
+; P8BE-LABEL: fromDiffMemConsDui
+; P8LE-LABEL: fromDiffMemConsDui
+; P9BE: lxvx
+; P9BE: lxvx
+; P9BE: vperm
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: lxvx
+; P9LE: vperm
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: lxvw4x
+; P8BE: vperm
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: xxswapd
+; P8LE: xxswapd
+; P8LE: vperm
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarAui(i32* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %add4 = add nsw i32 %elem, 2
+  %idxprom5 = sext i32 %add4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5
+  %2 = load i32, i32* %arrayidx6, align 4
+  %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2
+  %add8 = add nsw i32 %elem, 3
+  %idxprom9 = sext i32 %add8 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9
+  %3 = load i32, i32* %arrayidx10, align 4
+  %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3
+  ret <4 x i32> %vecinit11
+; P9BE-LABEL: fromDiffMemVarAui
+; P9LE-LABEL: fromDiffMemVarAui
+; P8BE-LABEL: fromDiffMemVarAui
+; P8LE-LABEL: fromDiffMemVarAui
+; P9BE: sldi r4, r4, 2
+; P9BE: lxvx v2, r3, r4
+; P9BE: blr
+; P9LE: sldi r4, r4, 2
+; P9LE: lxvx v2, r3, r4
+; P9LE: blr
+; P8BE: sldi r4, r4, 2
+; P8BE: lxvw4x {{[vs0-9]+}}, r3, r4
+; P8BE: blr
+; P8LE: sldi r4, r4, 2
+; P8LE: lxvd2x {{[vs0-9]+}}, r3, r4
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarDui(i32* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %sub4 = add nsw i32 %elem, -2
+  %idxprom5 = sext i32 %sub4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %arr, i64 %idxprom5
+  %2 = load i32, i32* %arrayidx6, align 4
+  %vecinit7 = insertelement <4 x i32> %vecinit3, i32 %2, i32 2
+  %sub8 = add nsw i32 %elem, -3
+  %idxprom9 = sext i32 %sub8 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %arr, i64 %idxprom9
+  %3 = load i32, i32* %arrayidx10, align 4
+  %vecinit11 = insertelement <4 x i32> %vecinit7, i32 %3, i32 3
+  ret <4 x i32> %vecinit11
+; P9BE-LABEL: fromDiffMemVarDui
+; P9LE-LABEL: fromDiffMemVarDui
+; P8BE-LABEL: fromDiffMemVarDui
+; P8LE-LABEL: fromDiffMemVarDui
+; P9BE: sldi r4, r4, 2
+; P9BE-DAG: lxvx {{[vs0-9]+}}, r3, r4
+; P9BE-DAG: lxvx
+; P9BE: vperm
+; P9BE: blr
+; P9LE: sldi r4, r4, 2
+; P9LE-DAG: lxvx {{[vs0-9]+}}, r3, r4
+; P9LE-DAG: lxvx
+; P9LE: vperm
+; P9LE: blr
+; P8BE: sldi r4, r4, 2
+; P8BE-DAG: lxvw4x {{[vs0-9]+}}, r3, r4
+; P8BE-DAG: lxvw4x
+; P8BE: vperm
+; P8BE: blr
+; P8LE: sldi r4, r4, 2
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: xxswapd
+; P8LE: xxswapd
+; P8LE: vperm
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromRandMemConsui(i32* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 4
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 18
+  %1 = load i32, i32* %arrayidx1, align 4
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 2
+  %2 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %2, i32 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %arr, i64 88
+  %3 = load i32, i32* %arrayidx5, align 4
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %3, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromRandMemConsui
+; P9LE-LABEL: fromRandMemConsui
+; P8BE-LABEL: fromRandMemConsui
+; P8LE-LABEL: fromRandMemConsui
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: mtvsrdd
+; P9BE: mtvsrdd
+; P9BE: vmrgow
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: mtvsrdd
+; P9LE: mtvsrdd
+; P9LE: vmrgow
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: vmrgow
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: vmrgow
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromRandMemVarui(i32* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %add = add nsw i32 %elem, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %vecinit = insertelement <4 x i32> undef, i32 %0, i32 0
+  %add1 = add nsw i32 %elem, 1
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 %idxprom2
+  %1 = load i32, i32* %arrayidx3, align 4
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %1, i32 1
+  %add5 = add nsw i32 %elem, 2
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds i32, i32* %arr, i64 %idxprom6
+  %2 = load i32, i32* %arrayidx7, align 4
+  %vecinit8 = insertelement <4 x i32> %vecinit4, i32 %2, i32 2
+  %add9 = add nsw i32 %elem, 8
+  %idxprom10 = sext i32 %add9 to i64
+  %arrayidx11 = getelementptr inbounds i32, i32* %arr, i64 %idxprom10
+  %3 = load i32, i32* %arrayidx11, align 4
+  %vecinit12 = insertelement <4 x i32> %vecinit8, i32 %3, i32 3
+  ret <4 x i32> %vecinit12
+; P9BE-LABEL: fromRandMemVarui
+; P9LE-LABEL: fromRandMemVarui
+; P8BE-LABEL: fromRandMemVarui
+; P8LE-LABEL: fromRandMemVarui
+; P9BE: sldi r4, r4, 2
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: lwz
+; P9BE: mtvsrdd
+; P9BE: mtvsrdd
+; P9BE: vmrgow
+; P9LE: sldi r4, r4, 2
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: lwz
+; P9LE: mtvsrdd
+; P9LE: mtvsrdd
+; P9LE: vmrgow
+; P8BE: sldi r4, r4, 2
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: lwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: mtvsrwz
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: vmrgow
+; P8LE: sldi r4, r4, 2
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: lwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: mtvsrwz
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: vmrgow
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltRegValui(i32 zeroext %val) {
+entry:
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %val, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltRegValui
+; P9LE-LABEL: spltRegValui
+; P8BE-LABEL: spltRegValui
+; P8LE-LABEL: spltRegValui
+; P9BE: mtvsrws v2, r3
+; P9BE: blr
+; P9LE: mtvsrws v2, r3
+; P9LE: blr
+; P8BE: mtvsrwz {{[vsf0-9]+}}, r3
+; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8BE: blr
+; P8LE: mtvsrwz {{[vsf0-9]+}}, r3
+; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @spltMemValui(i32* nocapture readonly %ptr) {
+entry:
+  %0 = load i32, i32* %ptr, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltMemValui
+; P9LE-LABEL: spltMemValui
+; P8BE-LABEL: spltMemValui
+; P8LE-LABEL: spltMemValui
+; P9BE: lxvwsx v2, 0, r3
+; P9BE: blr
+; P9LE: lxvwsx v2, 0, r3
+; P9LE: blr
+; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3
+; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8BE: blr
+; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3
+; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltCnstConvftoui() {
+entry:
+  ret <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+; P9BE-LABEL: spltCnstConvftoui
+; P9LE-LABEL: spltCnstConvftoui
+; P8BE-LABEL: spltCnstConvftoui
+; P8LE-LABEL: spltCnstConvftoui
+; P9BE: vspltisw v2, 4
+; P9BE: blr
+; P9LE: vspltisw v2, 4
+; P9LE: blr
+; P8BE: vspltisw v2, 4
+; P8BE: blr
+; P8LE: vspltisw v2, 4
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
+entry:
+  %conv = fptoui float %a to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %conv1 = fptoui float %b to i32
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1
+  %conv3 = fptoui float %c to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2
+  %conv5 = fptoui float %d to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromRegsConvftoui
+; P9LE-LABEL: fromRegsConvftoui
+; P8BE-LABEL: fromRegsConvftoui
+; P8LE-LABEL: fromRegsConvftoui
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE: vmrgew v2, [[REG3]], [[REG4]]
+; P9BE: xvcvspuxws v2, v2
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE: vmrgew v2, [[REG4]], [[REG3]]
+; P9LE: xvcvspuxws v2, v2
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE: vmrgew v2, [[REG3]], [[REG4]]
+; P8BE: xvcvspuxws v2, v2
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE: vmrgew v2, [[REG4]], [[REG3]]
+; P8LE: xvcvspuxws v2, v2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromDiffConstsConvftoui() {
+entry:
+  ret <4 x i32> <i32 24, i32 234, i32 988, i32 422>
+; P9BE-LABEL: fromDiffConstsConvftoui
+; P9LE-LABEL: fromDiffConstsConvftoui
+; P8BE-LABEL: fromDiffConstsConvftoui
+; P8LE-LABEL: fromDiffConstsConvftoui
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsAConvftoui(float* nocapture readonly %ptr) {
+entry:
+  %0 = bitcast float* %ptr to <4 x float>*
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = fptoui <4 x float> %1 to <4 x i32>
+  ret <4 x i32> %2
+; P9BE-LABEL: fromDiffMemConsAConvftoui
+; P9LE-LABEL: fromDiffMemConsAConvftoui
+; P8BE-LABEL: fromDiffMemConsAConvftoui
+; P8LE-LABEL: fromDiffMemConsAConvftoui
+; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: xvcvspuxws v2, [[REG1]]
+; P9BE: blr
+; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: xvcvspuxws v2, [[REG1]]
+; P9LE: blr
+; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3
+; P8BE: xvcvspuxws v2, [[REG1]]
+; P8BE: blr
+; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
+; P8LE: xxswapd v2, [[REG1]]
+; P8LE: xvcvspuxws v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsDConvftoui(float* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 3
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptoui float %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2
+  %1 = load float, float* %arrayidx1, align 4
+  %conv2 = fptoui float %1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 1
+  %2 = load float, float* %arrayidx4, align 4
+  %conv5 = fptoui float %2 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %3 = load float, float* %ptr, align 4
+  %conv8 = fptoui float %3 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+; P9BE-LABEL: fromDiffMemConsDConvftoui
+; P9LE-LABEL: fromDiffMemConsDConvftoui
+; P8BE-LABEL: fromDiffMemConsDConvftoui
+; P8LE-LABEL: fromDiffMemConsDConvftoui
+; P9BE: lxvx
+; P9BE: lxvx
+; P9BE: vperm
+; P9BE: xvcvspuxws
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: lxvx
+; P9LE: vperm
+; P9LE: xvcvspuxws
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: lxvw4x
+; P8BE: vperm
+; P8BE: xvcvspuxws
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE-DAG: lxvd2x
+; P8LE-DAG: xxswapd
+; P8LE: xxswapd
+; P8LE: vperm
+; P8LE: xvcvspuxws
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarAConvftoui(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptoui float %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptoui float %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %add5 = add nsw i32 %elem, 2
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6
+  %2 = load float, float* %arrayidx7, align 4
+  %conv8 = fptoui float %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %add10 = add nsw i32 %elem, 3
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11
+  %3 = load float, float* %arrayidx12, align 4
+  %conv13 = fptoui float %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarAConvftoui
+; P9LE-LABEL: fromDiffMemVarAConvftoui
+; P8BE-LABEL: fromDiffMemVarAConvftoui
+; P8LE-LABEL: fromDiffMemVarAConvftoui
+; FIXME: implement finding consecutive loads with pre-inc
+; P9BE: lfsux
+; P9LE: lfsux
+; P8BE: lfsux
+; P8LE: lfsux
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarDConvftoui(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptoui float %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptoui float %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %sub5 = add nsw i32 %elem, -2
+  %idxprom6 = sext i32 %sub5 to i64
+  %arrayidx7 = getelementptr inbounds float, float* %arr, i64 %idxprom6
+  %2 = load float, float* %arrayidx7, align 4
+  %conv8 = fptoui float %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %sub10 = add nsw i32 %elem, -3
+  %idxprom11 = sext i32 %sub10 to i64
+  %arrayidx12 = getelementptr inbounds float, float* %arr, i64 %idxprom11
+  %3 = load float, float* %arrayidx12, align 4
+  %conv13 = fptoui float %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarDConvftoui
+; P9LE-LABEL: fromDiffMemVarDConvftoui
+; P8BE-LABEL: fromDiffMemVarDConvftoui
+; P8LE-LABEL: fromDiffMemVarDConvftoui
+; FIXME: implement finding consecutive loads with pre-inc
+; P9BE: lfsux
+; P9LE: lfsux
+; P8BE: lfsux
+; P8LE: lfsux
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltRegValConvftoui(float %val) {
+entry:
+  %conv = fptoui float %val to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltRegValConvftoui
+; P9LE-LABEL: spltRegValConvftoui
+; P8BE-LABEL: spltRegValConvftoui
+; P8LE-LABEL: spltRegValConvftoui
+; P9BE: xscvdpuxws f[[REG1:[0-9]+]], f1
+; P9BE: xxspltw v2, vs[[REG1]], 1
+; P9BE: blr
+; P9LE: xscvdpuxws f[[REG1:[0-9]+]], f1
+; P9LE: xxspltw v2, vs[[REG1]], 1
+; P9LE: blr
+; P8BE: xscvdpuxws f[[REG1:[0-9]+]], f1
+; P8BE: xxspltw v2, vs[[REG1]], 1
+; P8BE: blr
+; P8LE: xscvdpuxws f[[REG1:[0-9]+]], f1
+; P8LE: xxspltw v2, vs[[REG1]], 1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @spltMemValConvftoui(float* nocapture readonly %ptr) {
+entry:
+  %0 = load float, float* %ptr, align 4
+  %conv = fptoui float %0 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltMemValConvftoui
+; P9LE-LABEL: spltMemValConvftoui
+; P8BE-LABEL: spltMemValConvftoui
+; P8LE-LABEL: spltMemValConvftoui
+; P9BE: lxvwsx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: xvcvspuxws v2, [[REG1]]
+; P9LE: lxvwsx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: xvcvspuxws v2, [[REG1]]
+; P8BE: lxsspx [[REG1:f[0-9]+]], 0, r3
+; P8BE: xscvdpuxws f[[REG2:[0-9]+]], [[REG1]]
+; P8BE: xxspltw v2, vs[[REG2]], 1
+; P8LE: lxsspx [[REG1:f[0-9]+]], 0, r3
+; P8LE: xscvdpuxws f[[REG2:[vs0-9]+]], [[REG1]]
+; P8LE: xxspltw v2, vs[[REG2]], 1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltCnstConvdtoui() {
+entry:
+  ret <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+; P9BE-LABEL: spltCnstConvdtoui
+; P9LE-LABEL: spltCnstConvdtoui
+; P8BE-LABEL: spltCnstConvdtoui
+; P8LE-LABEL: spltCnstConvdtoui
+; P9BE: vspltisw v2, 4
+; P9BE: blr
+; P9LE: vspltisw v2, 4
+; P9LE: blr
+; P8BE: vspltisw v2, 4
+; P8BE: blr
+; P8LE: vspltisw v2, 4
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) {
+entry:
+  %conv = fptoui double %a to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %conv1 = fptoui double %b to i32
+  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1
+  %conv3 = fptoui double %c to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2
+  %conv5 = fptoui double %d to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3
+  ret <4 x i32> %vecinit6
+; P9BE-LABEL: fromRegsConvdtoui
+; P9LE-LABEL: fromRegsConvdtoui
+; P8BE-LABEL: fromRegsConvdtoui
+; P8LE-LABEL: fromRegsConvdtoui
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE: vmrgew v2, [[REG3]], [[REG4]]
+; P9BE: xvcvspuxws v2, v2
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE: vmrgew v2, [[REG4]], [[REG3]]
+; P9LE: xvcvspuxws v2, v2
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
+; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
+; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE: vmrgew v2, [[REG3]], [[REG4]]
+; P8BE: xvcvspuxws v2, v2
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
+; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
+; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE: vmrgew v2, [[REG4]], [[REG3]]
+; P8LE: xvcvspuxws v2, v2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @fromDiffConstsConvdtoui() {
+entry:
+  ret <4 x i32> <i32 24, i32 234, i32 988, i32 422>
+; P9BE-LABEL: fromDiffConstsConvdtoui
+; P9LE-LABEL: fromDiffConstsConvdtoui
+; P8BE-LABEL: fromDiffConstsConvdtoui
+; P8LE-LABEL: fromDiffConstsConvdtoui
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvw4x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsAConvdtoui(double* nocapture readonly %ptr) {
+entry:
+  %0 = bitcast double* %ptr to <2 x double>*
+  %1 = load <2 x double>, <2 x double>* %0, align 8
+  %2 = fptoui <2 x double> %1 to <2 x i32>
+  %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 2
+  %3 = bitcast double* %arrayidx4 to <2 x double>*
+  %4 = load <2 x double>, <2 x double>* %3, align 8
+  %5 = fptoui <2 x double> %4 to <2 x i32>
+  %vecinit9 = shufflevector <2 x i32> %2, <2 x i32> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecinit9
+; P9BE-LABEL: fromDiffMemConsAConvdtoui
+; P9LE-LABEL: fromDiffMemConsAConvdtoui
+; P8BE-LABEL: fromDiffMemConsAConvdtoui
+; P8LE-LABEL: fromDiffMemConsAConvdtoui
+; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
+; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
+; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
+; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9BE: vmrgew v2, [[REG6]], [[REG5]]
+; P9BE: xvcvspuxws v2, v2
+; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]]
+; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]]
+; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
+; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9LE: vmrgew v2, [[REG6]], [[REG5]]
+; P9LE: xvcvspuxws v2, v2
+; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
+; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
+; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
+; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
+; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
+; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P8BE: vmrgew v2, [[REG6]], [[REG5]]
+; P8BE: xvcvspuxws v2, v2
+; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
+; P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
+; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]]
+; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]]
+; P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]]
+; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]]
+; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]]
+; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]]
+; P8LE: vmrgew v2, [[REG8]], [[REG7]]
+; P8LE: xvcvspuxws v2, v2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemConsDConvdtoui(double* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds double, double* %ptr, i64 3
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2
+  %1 = load double, double* %arrayidx1, align 8
+  %conv2 = fptoui double %1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 1
+  %2 = load double, double* %arrayidx4, align 8
+  %conv5 = fptoui double %2 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %3 = load double, double* %ptr, align 8
+  %conv8 = fptoui double %3 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+; P9BE-LABEL: fromDiffMemConsDConvdtoui
+; P9LE-LABEL: fromDiffMemConsDConvdtoui
+; P8BE-LABEL: fromDiffMemConsDConvdtoui
+; P8LE-LABEL: fromDiffMemConsDConvdtoui
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: xxmrghd
+; P9BE: xxmrghd
+; P9BE: xvcvdpsp
+; P9BE: xvcvdpsp
+; P9BE: vmrgew
+; P9BE: xvcvspuxws v2
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: xxmrghd
+; P9LE: xxmrghd
+; P9LE: xvcvdpsp
+; P9LE: xvcvdpsp
+; P9LE: vmrgew
+; P9LE: xvcvspuxws v2
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: xvcvdpsp
+; P8BE: xvcvdpsp
+; P8BE: vmrgew
+; P8BE: xvcvspuxws v2
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: xvcvdpsp
+; P8LE: xvcvdpsp
+; P8LE: vmrgew
+; P8LE: xvcvspuxws v2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarAConvdtoui(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptoui double %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %add5 = add nsw i32 %elem, 2
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6
+  %2 = load double, double* %arrayidx7, align 8
+  %conv8 = fptoui double %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %add10 = add nsw i32 %elem, 3
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11
+  %3 = load double, double* %arrayidx12, align 8
+  %conv13 = fptoui double %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarAConvdtoui
+; P9LE-LABEL: fromDiffMemVarAConvdtoui
+; P8BE-LABEL: fromDiffMemVarAConvdtoui
+; P8LE-LABEL: fromDiffMemVarAConvdtoui
+; P9BE: lfdux
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: xxmrghd
+; P9BE: xxmrghd
+; P9BE: xvcvdpsp
+; P9BE: xvcvdpsp
+; P9BE: vmrgew
+; P9BE: xvcvspuxws v2
+; P9LE: lfdux
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: xxmrghd
+; P9LE: xxmrghd
+; P9LE: xvcvdpsp
+; P9LE: xvcvdpsp
+; P9LE: vmrgew
+; P9LE: xvcvspuxws v2
+; P8BE: lfdux
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: xvcvdpsp
+; P8BE: xvcvdpsp
+; P8BE: vmrgew
+; P8BE: xvcvspuxws v2
+; P8LE: lfdux
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: xvcvdpsp
+; P8LE: xvcvdpsp
+; P8LE: vmrgew
+; P8LE: xvcvspuxws v2
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @fromDiffMemVarDConvdtoui(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %0 to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptoui double %1 to i32
+  %vecinit4 = insertelement <4 x i32> %vecinit, i32 %conv3, i32 1
+  %sub5 = add nsw i32 %elem, -2
+  %idxprom6 = sext i32 %sub5 to i64
+  %arrayidx7 = getelementptr inbounds double, double* %arr, i64 %idxprom6
+  %2 = load double, double* %arrayidx7, align 8
+  %conv8 = fptoui double %2 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit4, i32 %conv8, i32 2
+  %sub10 = add nsw i32 %elem, -3
+  %idxprom11 = sext i32 %sub10 to i64
+  %arrayidx12 = getelementptr inbounds double, double* %arr, i64 %idxprom11
+  %3 = load double, double* %arrayidx12, align 8
+  %conv13 = fptoui double %3 to i32
+  %vecinit14 = insertelement <4 x i32> %vecinit9, i32 %conv13, i32 3
+  ret <4 x i32> %vecinit14
+; P9BE-LABEL: fromDiffMemVarDConvdtoui
+; P9LE-LABEL: fromDiffMemVarDConvdtoui
+; P8BE-LABEL: fromDiffMemVarDConvdtoui
+; P8LE-LABEL: fromDiffMemVarDConvdtoui
+; P9BE: lfdux
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: lfd
+; P9BE: xxmrghd
+; P9BE: xxmrghd
+; P9BE: xvcvdpsp
+; P9BE: xvcvdpsp
+; P9BE: vmrgew
+; P9BE: xvcvspuxws v2
+; P9LE: lfdux
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: lfd
+; P9LE: xxmrghd
+; P9LE: xxmrghd
+; P9LE: xvcvdpsp
+; P9LE: xvcvdpsp
+; P9LE: vmrgew
+; P9LE: xvcvspuxws v2
+; P8BE: lfdux
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: lxsdx
+; P8BE: xxmrghd
+; P8BE: xxmrghd
+; P8BE: xvcvdpsp
+; P8BE: xvcvdpsp
+; P8BE: vmrgew
+; P8BE: xvcvspuxws v2
+; P8LE: lfdux
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: lxsdx
+; P8LE: xxmrghd
+; P8LE: xxmrghd
+; P8LE: xvcvdpsp
+; P8LE: xvcvdpsp
+; P8LE: vmrgew
+; P8LE: xvcvspuxws v2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @spltRegValConvdtoui(double %val) {
+entry:
+  %conv = fptoui double %val to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltRegValConvdtoui
+; P9LE-LABEL: spltRegValConvdtoui
+; P8BE-LABEL: spltRegValConvdtoui
+; P8LE-LABEL: spltRegValConvdtoui
+; P9BE: xscvdpuxws
+; P9BE: xxspltw
+; P9BE: blr
+; P9LE: xscvdpuxws
+; P9LE: xxspltw
+; P9LE: blr
+; P8BE: xscvdpuxws
+; P8BE: xxspltw
+; P8BE: blr
+; P8LE: xscvdpuxws
+; P8LE: xxspltw
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x i32> @spltMemValConvdtoui(double* nocapture readonly %ptr) {
+entry:
+  %0 = load double, double* %ptr, align 8
+  %conv = fptoui double %0 to i32
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+; P9BE-LABEL: spltMemValConvdtoui
+; P9LE-LABEL: spltMemValConvdtoui
+; P8BE-LABEL: spltMemValConvdtoui
+; P8LE-LABEL: spltMemValConvdtoui
+; P9BE: lfd
+; P9BE: xscvdpuxws
+; P9BE: xxspltw
+; P9BE: blr
+; P9LE: lfd
+; P9LE: xscvdpuxws
+; P9LE: xxspltw
+; P9LE: blr
+; P8BE: lxsdx
+; P8BE: xscvdpuxws
+; P8BE: xxspltw
+; P8BE: blr
+; P8LE: lxsdx
+; P8LE: xscvdpuxws
+; P8LE: xxspltw
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @allZeroll() {
+entry:
+  ret <2 x i64> zeroinitializer
+; P9BE-LABEL: allZeroll
+; P9LE-LABEL: allZeroll
+; P8BE-LABEL: allZeroll
+; P8LE-LABEL: allZeroll
+; P9BE: xxlxor v2, v2, v2
+; P9BE: blr
+; P9LE: xxlxor v2, v2, v2
+; P9LE: blr
+; P8BE: xxlxor v2, v2, v2
+; P8BE: blr
+; P8LE: xxlxor v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @allOnell() {
+entry:
+  ret <2 x i64> <i64 -1, i64 -1>
+; P9BE-LABEL: allOnell
+; P9LE-LABEL: allOnell
+; P8BE-LABEL: allOnell
+; P8LE-LABEL: allOnell
+; P9BE: xxspltib v2, 255
+; P9BE: blr
+; P9LE: xxspltib v2, 255
+; P9LE: blr
+; P8BE: vspltisb v2, -1
+; P8BE: blr
+; P8LE: vspltisb v2, -1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltConst1ll() {
+entry:
+  ret <2 x i64> <i64 1, i64 1>
+; P9BE-LABEL: spltConst1ll
+; P9LE-LABEL: spltConst1ll
+; P8BE-LABEL: spltConst1ll
+; P8LE-LABEL: spltConst1ll
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltConst16kll() {
+entry:
+  ret <2 x i64> <i64 32767, i64 32767>
+; P9BE-LABEL: spltConst16kll
+; P9LE-LABEL: spltConst16kll
+; P8BE-LABEL: spltConst16kll
+; P8LE-LABEL: spltConst16kll
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltConst32kll() {
+entry:
+  ret <2 x i64> <i64 65535, i64 65535>
+; P9BE-LABEL: spltConst32kll
+; P9LE-LABEL: spltConst32kll
+; P8BE-LABEL: spltConst32kll
+; P8LE-LABEL: spltConst32kll
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromRegsll(i64 %a, i64 %b) {
+entry:
+  %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+  %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
+  ret <2 x i64> %vecinit1
+; P9BE-LABEL: fromRegsll
+; P9LE-LABEL: fromRegsll
+; P8BE-LABEL: fromRegsll
+; P8LE-LABEL: fromRegsll
+; P9BE: mtvsrdd v2, r3, r4
+; P9BE: blr
+; P9LE: mtvsrdd v2, r4, r3
+; P9LE: blr
+; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r3
+; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r4
+; P8BE: xxmrghd v2
+; P8BE: blr
+; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r3
+; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r4
+; P8LE: xxmrghd v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromDiffConstsll() {
+entry:
+  ret <2 x i64> <i64 242, i64 -113>
+; P9BE-LABEL: fromDiffConstsll
+; P9LE-LABEL: fromDiffConstsll
+; P8BE-LABEL: fromDiffConstsll
+; P8LE-LABEL: fromDiffConstsll
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsAll(i64* nocapture readonly %arr) {
+entry:
+  %0 = load i64, i64* %arr, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 1
+  %1 = load i64, i64* %arrayidx1, align 8
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromDiffMemConsAll
+; P9LE-LABEL: fromDiffMemConsAll
+; P8BE-LABEL: fromDiffMemConsAll
+; P8LE-LABEL: fromDiffMemConsAll
+; P9BE: lxvx v2
+; P9BE: blr
+; P9LE: lxvx v2
+; P9LE: blr
+; P8BE: lxvd2x v2
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsDll(i64* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 3
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 2
+  %1 = load i64, i64* %arrayidx1, align 8
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromDiffMemConsDll
+; P9LE-LABEL: fromDiffMemConsDll
+; P8BE-LABEL: fromDiffMemConsDll
+; P8LE-LABEL: fromDiffMemConsDll
+; P9BE: lxvx v2
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: xxswapd v2
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: xxswapd v2
+; P8BE-NEXT: blr
+; P8LE: lxvd2x v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarAll(i64* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1
+  %1 = load i64, i64* %arrayidx2, align 8
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemVarAll
+; P9LE-LABEL: fromDiffMemVarAll
+; P8BE-LABEL: fromDiffMemVarAll
+; P8LE-LABEL: fromDiffMemVarAll
+; P9BE: sldi
+; P9BE: lxvx v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x
+; P8LE: xxswapd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarDll(i64* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1
+  %1 = load i64, i64* %arrayidx2, align 8
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemVarDll
+; P9LE-LABEL: fromDiffMemVarDll
+; P8BE-LABEL: fromDiffMemVarDll
+; P8LE-LABEL: fromDiffMemVarDll
+; P9BE: sldi
+; P9BE: lxvx
+; P9BE: xxswapd v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx
+; P9LE: xxswapd v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x
+; P8BE: xxswapd v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromRandMemConsll(i64* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 4
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 18
+  %1 = load i64, i64* %arrayidx1, align 8
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromRandMemConsll
+; P9LE-LABEL: fromRandMemConsll
+; P8BE-LABEL: fromRandMemConsll
+; P8LE-LABEL: fromRandMemConsll
+; P9BE: ld
+; P9BE: ld
+; P9BE: mtvsrdd v2
+; P9BE-NEXT: blr
+; P9LE: ld
+; P9LE: ld
+; P9LE: mtvsrdd v2
+; P9LE-NEXT: blr
+; P8BE: ld
+; P8BE: ld
+; P8BE-DAG: mtvsrd
+; P8BE-DAG: mtvsrd
+; P8BE: xxmrghd v2
+; P8BE-NEXT: blr
+; P8LE: ld
+; P8LE: ld
+; P8LE-DAG: mtvsrd
+; P8LE-DAG: mtvsrd
+; P8LE: xxmrghd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromRandMemVarll(i64* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %add = add nsw i32 %elem, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %add1 = add nsw i32 %elem, 1
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds i64, i64* %arr, i64 %idxprom2
+  %1 = load i64, i64* %arrayidx3, align 8
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromRandMemVarll
+; P9LE-LABEL: fromRandMemVarll
+; P8BE-LABEL: fromRandMemVarll
+; P8LE-LABEL: fromRandMemVarll
+; P9BE: sldi
+; P9BE: ld
+; P9BE: ld
+; P9BE: mtvsrdd v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: ld
+; P9LE: ld
+; P9LE: mtvsrdd v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: ld
+; P8BE: ld
+; P8BE: mtvsrd
+; P8BE: mtvsrd
+; P8BE: xxmrghd v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: ld
+; P8LE: ld
+; P8LE: mtvsrd
+; P8LE: mtvsrd
+; P8LE: xxmrghd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltRegValll(i64 %val) {
+entry:
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %val, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltRegValll
+; P9LE-LABEL: spltRegValll
+; P8BE-LABEL: spltRegValll
+; P8LE-LABEL: spltRegValll
+; P9BE: mtvsrdd v2, r3, r3
+; P9BE-NEXT: blr
+; P9LE: mtvsrdd v2, r3, r3
+; P9LE-NEXT: blr
+; P8BE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3
+; P8BE: xxspltd v2, {{[vsf]+}}[[REG1]], 0
+; P8BE-NEXT: blr
+; P8LE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3
+; P8LE: xxspltd v2, {{[vsf]+}}[[REG1]], 0
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @spltMemValll(i64* nocapture readonly %ptr) {
+entry:
+  %0 = load i64, i64* %ptr, align 8
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %0, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltMemValll
+; P9LE-LABEL: spltMemValll
+; P8BE-LABEL: spltMemValll
+; P8LE-LABEL: spltMemValll
+; P9BE: lxvdsx v2
+; P9BE-NEXT: blr
+; P9LE: lxvdsx v2
+; P9LE-NEXT: blr
+; P8BE: lxvdsx v2
+; P8BE-NEXT: blr
+; P8LE: lxvdsx v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltCnstConvftoll() {
+entry:
+  ret <2 x i64> <i64 4, i64 4>
+; P9BE-LABEL: spltCnstConvftoll
+; P9LE-LABEL: spltCnstConvftoll
+; P8BE-LABEL: spltCnstConvftoll
+; P8LE-LABEL: spltCnstConvftoll
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromRegsConvftoll(float %a, float %b) {
+entry:
+  %conv = fptosi float %a to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %conv1 = fptosi float %b to i64
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromRegsConvftoll
+; P9LE-LABEL: fromRegsConvftoll
+; P8BE-LABEL: fromRegsConvftoll
+; P8LE-LABEL: fromRegsConvftoll
+; P9BE: xxmrghd
+; P9BE: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: xxmrghd
+; P9LE: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: xxmrghd
+; P8BE: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: xxmrghd
+; P8LE: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromDiffConstsConvftoll() {
+entry:
+  ret <2 x i64> <i64 24, i64 234>
+; P9BE-LABEL: fromDiffConstsConvftoll
+; P9LE-LABEL: fromDiffConstsConvftoll
+; P8BE-LABEL: fromDiffConstsConvftoll
+; P8LE-LABEL: fromDiffConstsConvftoll
+; P9BE: lxvx v2
+; P9BE: blr
+; P9LE: lxvx v2
+; P9LE: blr
+; P8BE: lxvd2x v2
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsAConvftoll(float* nocapture readonly %ptr) {
+entry:
+  %0 = load float, float* %ptr, align 4
+  %conv = fptosi float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
+  %1 = load float, float* %arrayidx1, align 4
+  %conv2 = fptosi float %1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemConsAConvftoll
+; P9LE-LABEL: fromDiffMemConsAConvftoll
+; P8BE-LABEL: fromDiffMemConsAConvftoll
+; P8LE-LABEL: fromDiffMemConsAConvftoll
+; P9BE: lfs
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: lfs
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: lxsspx
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: lxsspx
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsDConvftoll(float* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 3
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptosi float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2
+  %1 = load float, float* %arrayidx1, align 4
+  %conv2 = fptosi float %1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemConsDConvftoll
+; P9LE-LABEL: fromDiffMemConsDConvftoll
+; P8BE-LABEL: fromDiffMemConsDConvftoll
+; P8LE-LABEL: fromDiffMemConsDConvftoll
+; P9BE: lfs
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: lfs
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: lxsspx
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: lxsspx
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarAConvftoll(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptosi float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptosi float %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarAConvftoll
+; P9LE-LABEL: fromDiffMemVarAConvftoll
+; P8BE-LABEL: fromDiffMemVarAConvftoll
+; P8LE-LABEL: fromDiffMemVarAConvftoll
+; P9BE: sldi
+; P9BE: lfsux
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lfsux
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lfsux
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lfsux
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarDConvftoll(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptosi float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptosi float %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarDConvftoll
+; P9LE-LABEL: fromDiffMemVarDConvftoll
+; P8BE-LABEL: fromDiffMemVarDConvftoll
+; P8LE-LABEL: fromDiffMemVarDConvftoll
+; P9BE: sldi
+; P9BE: lfsux
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lfsux
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lfsux
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lfsux
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltRegValConvftoll(float %val) {
+entry:
+  %conv = fptosi float %val to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltRegValConvftoll
+; P9LE-LABEL: spltRegValConvftoll
+; P8BE-LABEL: spltRegValConvftoll
+; P8LE-LABEL: spltRegValConvftoll
+; P9BE: xscvdpsxds
+; P9BE-NEXT: xxspltd v2
+; P9BE-NEXT: blr
+; P9LE: xscvdpsxds
+; P9LE-NEXT: xxspltd v2
+; P9LE-NEXT: blr
+; P8BE: xscvdpsxds
+; P8BE-NEXT: xxspltd v2
+; P8BE-NEXT: blr
+; P8LE: xscvdpsxds
+; P8LE-NEXT: xxspltd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @spltMemValConvftoll(float* nocapture readonly %ptr) {
+entry:
+  %0 = load float, float* %ptr, align 4
+  %conv = fptosi float %0 to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltMemValConvftoll
+; P9LE-LABEL: spltMemValConvftoll
+; P8BE-LABEL: spltMemValConvftoll
+; P8LE-LABEL: spltMemValConvftoll
+; P9BE: lfs
+; P9BE-NEXT: xscvdpsxds
+; P9BE-NEXT: xxspltd v2
+; P9BE-NEXT: blr
+; P9LE: lfs
+; P9LE-NEXT: xscvdpsxds
+; P9LE-NEXT: xxspltd v2
+; P9LE-NEXT: blr
+; P8BE: lxsspx
+; P8BE-NEXT: xscvdpsxds
+; P8BE-NEXT: xxspltd v2
+; P8BE-NEXT: blr
+; P8LE: lxsspx
+; P8LE-NEXT: xscvdpsxds
+; P8LE-NEXT: xxspltd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltCnstConvdtoll() {
+entry:
+  ret <2 x i64> <i64 4, i64 4>
+; P9BE-LABEL: spltCnstConvdtoll
+; P9LE-LABEL: spltCnstConvdtoll
+; P8BE-LABEL: spltCnstConvdtoll
+; P8LE-LABEL: spltCnstConvdtoll
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromRegsConvdtoll(double %a, double %b) {
+entry:
+  %conv = fptosi double %a to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %conv1 = fptosi double %b to i64
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromRegsConvdtoll
+; P9LE-LABEL: fromRegsConvdtoll
+; P8BE-LABEL: fromRegsConvdtoll
+; P8LE-LABEL: fromRegsConvdtoll
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpsxds
+; P9BE-NEXT: blr
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpsxds
+; P9LE-NEXT: blr
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpsxds
+; P8BE-NEXT: blr
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpsxds
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromDiffConstsConvdtoll() {
+entry:
+  ret <2 x i64> <i64 24, i64 234>
+; P9BE-LABEL: fromDiffConstsConvdtoll
+; P9LE-LABEL: fromDiffConstsConvdtoll
+; P8BE-LABEL: fromDiffConstsConvdtoll
+; P8LE-LABEL: fromDiffConstsConvdtoll
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsAConvdtoll(double* nocapture readonly %ptr) {
+entry:
+  %0 = bitcast double* %ptr to <2 x double>*
+  %1 = load <2 x double>, <2 x double>* %0, align 8
+  %2 = fptosi <2 x double> %1 to <2 x i64>
+  ret <2 x i64> %2
+; P9BE-LABEL: fromDiffMemConsAConvdtoll
+; P9LE-LABEL: fromDiffMemConsAConvdtoll
+; P8BE-LABEL: fromDiffMemConsAConvdtoll
+; P8LE-LABEL: fromDiffMemConsAConvdtoll
+; P9BE: lxvx
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: lxvx
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: lxvd2x
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsDConvdtoll(double* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds double, double* %ptr, i64 3
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2
+  %1 = load double, double* %arrayidx1, align 8
+  %conv2 = fptosi double %1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemConsDConvdtoll
+; P9LE-LABEL: fromDiffMemConsDConvdtoll
+; P8BE-LABEL: fromDiffMemConsDConvdtoll
+; P8LE-LABEL: fromDiffMemConsDConvdtoll
+; P9BE: lxvx
+; P9BE-NEXT: xxswapd
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: lxvx
+; P9LE-NEXT: xxswapd
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: lxvd2x
+; P8BE-NEXT: xxswapd
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: lxvd2x
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarAConvdtoll(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptosi double %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarAConvdtoll
+; P9LE-LABEL: fromDiffMemVarAConvdtoll
+; P8BE-LABEL: fromDiffMemVarAConvdtoll
+; P8LE-LABEL: fromDiffMemVarAConvdtoll
+; P9BE: sldi
+; P9BE: lxvx
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x
+; P8LE-NEXT: xxswapd
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarDConvdtoll(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptosi double %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptosi double %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarDConvdtoll
+; P9LE-LABEL: fromDiffMemVarDConvdtoll
+; P8BE-LABEL: fromDiffMemVarDConvdtoll
+; P8LE-LABEL: fromDiffMemVarDConvdtoll
+; P9BE: sldi
+; P9BE: lxvx
+; P9BE-NEXT: xxswapd
+; P9BE-NEXT: xvcvdpsxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx
+; P9LE-NEXT: xxswapd
+; P9LE-NEXT: xvcvdpsxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x
+; P8BE-NEXT: xxswapd
+; P8BE-NEXT: xvcvdpsxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x
+; P8LE-NEXT: xvcvdpsxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltRegValConvdtoll(double %val) {
+entry:
+  %conv = fptosi double %val to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltRegValConvdtoll
+; P9LE-LABEL: spltRegValConvdtoll
+; P8BE-LABEL: spltRegValConvdtoll
+; P8LE-LABEL: spltRegValConvdtoll
+; P9BE: xscvdpsxds
+; P9BE-NEXT: xxspltd v2
+; P9BE-NEXT: blr
+; P9LE: xscvdpsxds
+; P9LE-NEXT: xxspltd v2
+; P9LE-NEXT: blr
+; P8BE: xscvdpsxds
+; P8BE-NEXT: xxspltd v2
+; P8BE-NEXT: blr
+; P8LE: xscvdpsxds
+; P8LE-NEXT: xxspltd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @spltMemValConvdtoll(double* nocapture readonly %ptr) {
+entry:
+  %0 = load double, double* %ptr, align 8
+  %conv = fptosi double %0 to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltMemValConvdtoll
+; P9LE-LABEL: spltMemValConvdtoll
+; P8BE-LABEL: spltMemValConvdtoll
+; P8LE-LABEL: spltMemValConvdtoll
+; P9BE: lxvdsx
+; P9BE-NEXT: xvcvdpsxds
+; P9BE-NEXT: blr
+; P9LE: lxvdsx
+; P9LE-NEXT: xvcvdpsxds
+; P9LE-NEXT: blr
+; P8BE: lxvdsx
+; P8BE-NEXT: xvcvdpsxds
+; P8BE-NEXT: blr
+; P8LE: lxvdsx
+; P8LE-NEXT: xvcvdpsxds
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @allZeroull() {
+entry:
+  ret <2 x i64> zeroinitializer
+; P9BE-LABEL: allZeroull
+; P9LE-LABEL: allZeroull
+; P8BE-LABEL: allZeroull
+; P8LE-LABEL: allZeroull
+; P9BE: xxlxor v2, v2, v2
+; P9BE: blr
+; P9LE: xxlxor v2, v2, v2
+; P9LE: blr
+; P8BE: xxlxor v2, v2, v2
+; P8BE: blr
+; P8LE: xxlxor v2, v2, v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @allOneull() {
+entry:
+  ret <2 x i64> <i64 -1, i64 -1>
+; P9BE-LABEL: allOneull
+; P9LE-LABEL: allOneull
+; P8BE-LABEL: allOneull
+; P8LE-LABEL: allOneull
+; P9BE: xxspltib v2, 255
+; P9BE: blr
+; P9LE: xxspltib v2, 255
+; P9LE: blr
+; P8BE: vspltisb v2, -1
+; P8BE: blr
+; P8LE: vspltisb v2, -1
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltConst1ull() {
+entry:
+  ret <2 x i64> <i64 1, i64 1>
+; P9BE-LABEL: spltConst1ull
+; P9LE-LABEL: spltConst1ull
+; P8BE-LABEL: spltConst1ull
+; P8LE-LABEL: spltConst1ull
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltConst16kull() {
+entry:
+  ret <2 x i64> <i64 32767, i64 32767>
+; P9BE-LABEL: spltConst16kull
+; P9LE-LABEL: spltConst16kull
+; P8BE-LABEL: spltConst16kull
+; P8LE-LABEL: spltConst16kull
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltConst32kull() {
+entry:
+  ret <2 x i64> <i64 65535, i64 65535>
+; P9BE-LABEL: spltConst32kull
+; P9LE-LABEL: spltConst32kull
+; P8BE-LABEL: spltConst32kull
+; P8LE-LABEL: spltConst32kull
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromRegsull(i64 %a, i64 %b) {
+entry:
+  %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+  %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
+  ret <2 x i64> %vecinit1
+; P9BE-LABEL: fromRegsull
+; P9LE-LABEL: fromRegsull
+; P8BE-LABEL: fromRegsull
+; P8LE-LABEL: fromRegsull
+; P9BE: mtvsrdd v2, r3, r4
+; P9BE: blr
+; P9LE: mtvsrdd v2, r4, r3
+; P9LE: blr
+; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r3
+; P8BE-DAG: mtvsrd {{[vsf0-9]+}}, r4
+; P8BE: xxmrghd v2
+; P8BE: blr
+; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r3
+; P8LE-DAG: mtvsrd {{[vsf0-9]+}}, r4
+; P8LE: xxmrghd v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromDiffConstsull() {
+entry:
+  ret <2 x i64> <i64 242, i64 -113>
+; P9BE-LABEL: fromDiffConstsull
+; P9LE-LABEL: fromDiffConstsull
+; P8BE-LABEL: fromDiffConstsull
+; P8LE-LABEL: fromDiffConstsull
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsAull(i64* nocapture readonly %arr) {
+entry:
+  %0 = load i64, i64* %arr, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 1
+  %1 = load i64, i64* %arrayidx1, align 8
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromDiffMemConsAull
+; P9LE-LABEL: fromDiffMemConsAull
+; P8BE-LABEL: fromDiffMemConsAull
+; P8LE-LABEL: fromDiffMemConsAull
+; P9BE: lxvx v2
+; P9BE: blr
+; P9LE: lxvx v2
+; P9LE: blr
+; P8BE: lxvd2x v2
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsDull(i64* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 3
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 2
+  %1 = load i64, i64* %arrayidx1, align 8
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromDiffMemConsDull
+; P9LE-LABEL: fromDiffMemConsDull
+; P8BE-LABEL: fromDiffMemConsDull
+; P8LE-LABEL: fromDiffMemConsDull
+; P9BE: lxvx v2
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: xxswapd v2
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: xxswapd v2
+; P8BE-NEXT: blr
+; P8LE: lxvd2x v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarAull(i64* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1
+  %1 = load i64, i64* %arrayidx2, align 8
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemVarAull
+; P9LE-LABEL: fromDiffMemVarAull
+; P8BE-LABEL: fromDiffMemVarAull
+; P8LE-LABEL: fromDiffMemVarAull
+; P9BE: sldi
+; P9BE: lxvx v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x
+; P8LE: xxswapd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarDull(i64* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds i64, i64* %arr, i64 %idxprom1
+  %1 = load i64, i64* %arrayidx2, align 8
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemVarDull
+; P9LE-LABEL: fromDiffMemVarDull
+; P8BE-LABEL: fromDiffMemVarDull
+; P8LE-LABEL: fromDiffMemVarDull
+; P9BE: sldi
+; P9BE: lxvx
+; P9BE: xxswapd v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx
+; P9LE: xxswapd v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x
+; P8BE: xxswapd v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromRandMemConsull(i64* nocapture readonly %arr) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 4
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %arrayidx1 = getelementptr inbounds i64, i64* %arr, i64 18
+  %1 = load i64, i64* %arrayidx1, align 8
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromRandMemConsull
+; P9LE-LABEL: fromRandMemConsull
+; P8BE-LABEL: fromRandMemConsull
+; P8LE-LABEL: fromRandMemConsull
+; P9BE: ld
+; P9BE: ld
+; P9BE: mtvsrdd v2
+; P9BE-NEXT: blr
+; P9LE: ld
+; P9LE: ld
+; P9LE: mtvsrdd v2
+; P9LE-NEXT: blr
+; P8BE: ld
+; P8BE: ld
+; P8BE-DAG: mtvsrd
+; P8BE-DAG: mtvsrd
+; P8BE: xxmrghd v2
+; P8BE-NEXT: blr
+; P8LE: ld
+; P8LE: ld
+; P8LE-DAG: mtvsrd
+; P8LE-DAG: mtvsrd
+; P8LE: xxmrghd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromRandMemVarull(i64* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %add = add nsw i32 %elem, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i64, i64* %arr, i64 %idxprom
+  %0 = load i64, i64* %arrayidx, align 8
+  %vecinit = insertelement <2 x i64> undef, i64 %0, i32 0
+  %add1 = add nsw i32 %elem, 1
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds i64, i64* %arr, i64 %idxprom2
+  %1 = load i64, i64* %arrayidx3, align 8
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromRandMemVarull
+; P9LE-LABEL: fromRandMemVarull
+; P8BE-LABEL: fromRandMemVarull
+; P8LE-LABEL: fromRandMemVarull
+; P9BE: sldi
+; P9BE: ld
+; P9BE: ld
+; P9BE: mtvsrdd v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: ld
+; P9LE: ld
+; P9LE: mtvsrdd v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: ld
+; P8BE: ld
+; P8BE: mtvsrd
+; P8BE: mtvsrd
+; P8BE: xxmrghd v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: ld
+; P8LE: ld
+; P8LE: mtvsrd
+; P8LE: mtvsrd
+; P8LE: xxmrghd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltRegValull(i64 %val) {
+entry:
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %val, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltRegValull
+; P9LE-LABEL: spltRegValull
+; P8BE-LABEL: spltRegValull
+; P8LE-LABEL: spltRegValull
+; P9BE: mtvsrdd v2, r3, r3
+; P9BE-NEXT: blr
+; P9LE: mtvsrdd v2, r3, r3
+; P9LE-NEXT: blr
+; P8BE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3
+; P8BE: xxspltd v2, {{[vsf]+}}[[REG1]], 0
+; P8BE-NEXT: blr
+; P8LE: mtvsrd {{[vsf]+}}[[REG1:[0-9]+]], r3
+; P8LE: xxspltd v2, {{[vsf]+}}[[REG1]], 0
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @spltMemValull(i64* nocapture readonly %ptr) {
+entry:
+  %0 = load i64, i64* %ptr, align 8
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %0, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltMemValull
+; P9LE-LABEL: spltMemValull
+; P8BE-LABEL: spltMemValull
+; P8LE-LABEL: spltMemValull
+; P9BE: lxvdsx v2
+; P9BE-NEXT: blr
+; P9LE: lxvdsx v2
+; P9LE-NEXT: blr
+; P8BE: lxvdsx v2
+; P8BE-NEXT: blr
+; P8LE: lxvdsx v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltCnstConvftoull() {
+entry:
+  ret <2 x i64> <i64 4, i64 4>
+; P9BE-LABEL: spltCnstConvftoull
+; P9LE-LABEL: spltCnstConvftoull
+; P8BE-LABEL: spltCnstConvftoull
+; P8LE-LABEL: spltCnstConvftoull
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromRegsConvftoull(float %a, float %b) {
+entry:
+  %conv = fptoui float %a to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %conv1 = fptoui float %b to i64
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromRegsConvftoull
+; P9LE-LABEL: fromRegsConvftoull
+; P8BE-LABEL: fromRegsConvftoull
+; P8LE-LABEL: fromRegsConvftoull
+; P9BE: xxmrghd
+; P9BE: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: xxmrghd
+; P9LE: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: xxmrghd
+; P8BE: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: xxmrghd
+; P8LE: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromDiffConstsConvftoull() {
+entry:
+  ret <2 x i64> <i64 24, i64 234>
+; P9BE-LABEL: fromDiffConstsConvftoull
+; P9LE-LABEL: fromDiffConstsConvftoull
+; P8BE-LABEL: fromDiffConstsConvftoull
+; P8LE-LABEL: fromDiffConstsConvftoull
+; P9BE: lxvx v2
+; P9BE: blr
+; P9LE: lxvx v2
+; P9LE: blr
+; P8BE: lxvd2x v2
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd v2
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsAConvftoull(float* nocapture readonly %ptr) {
+entry:
+  %0 = load float, float* %ptr, align 4
+  %conv = fptoui float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
+  %1 = load float, float* %arrayidx1, align 4
+  %conv2 = fptoui float %1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemConsAConvftoull
+; P9LE-LABEL: fromDiffMemConsAConvftoull
+; P8BE-LABEL: fromDiffMemConsAConvftoull
+; P8LE-LABEL: fromDiffMemConsAConvftoull
+; P9BE: lfs
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: lfs
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: lxsspx
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: lxsspx
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsDConvftoull(float* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 3
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptoui float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 2
+  %1 = load float, float* %arrayidx1, align 4
+  %conv2 = fptoui float %1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemConsDConvftoull
+; P9LE-LABEL: fromDiffMemConsDConvftoull
+; P8BE-LABEL: fromDiffMemConsDConvftoull
+; P8LE-LABEL: fromDiffMemConsDConvftoull
+; P9BE: lfs
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: lfs
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: lxsspx
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: lxsspx
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarAConvftoull(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptoui float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptoui float %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarAConvftoull
+; P9LE-LABEL: fromDiffMemVarAConvftoull
+; P8BE-LABEL: fromDiffMemVarAConvftoull
+; P8LE-LABEL: fromDiffMemVarAConvftoull
+; P9BE: sldi
+; P9BE: lfsux
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lfsux
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lfsux
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lfsux
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarDConvftoull(float* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds float, float* %arr, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fptoui float %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds float, float* %arr, i64 %idxprom1
+  %1 = load float, float* %arrayidx2, align 4
+  %conv3 = fptoui float %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarDConvftoull
+; P9LE-LABEL: fromDiffMemVarDConvftoull
+; P8BE-LABEL: fromDiffMemVarDConvftoull
+; P8LE-LABEL: fromDiffMemVarDConvftoull
+; P9BE: sldi
+; P9BE: lfsux
+; P9BE: lfs
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lfsux
+; P9LE: lfs
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lfsux
+; P8BE: lxsspx
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lfsux
+; P8LE: lxsspx
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltRegValConvftoull(float %val) {
+entry:
+  %conv = fptoui float %val to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltRegValConvftoull
+; P9LE-LABEL: spltRegValConvftoull
+; P8BE-LABEL: spltRegValConvftoull
+; P8LE-LABEL: spltRegValConvftoull
+; P9BE: xscvdpuxds
+; P9BE-NEXT: xxspltd v2
+; P9BE-NEXT: blr
+; P9LE: xscvdpuxds
+; P9LE-NEXT: xxspltd v2
+; P9LE-NEXT: blr
+; P8BE: xscvdpuxds
+; P8BE-NEXT: xxspltd v2
+; P8BE-NEXT: blr
+; P8LE: xscvdpuxds
+; P8LE-NEXT: xxspltd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @spltMemValConvftoull(float* nocapture readonly %ptr) {
+entry:
+  %0 = load float, float* %ptr, align 4
+  %conv = fptoui float %0 to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltMemValConvftoull
+; P9LE-LABEL: spltMemValConvftoull
+; P8BE-LABEL: spltMemValConvftoull
+; P8LE-LABEL: spltMemValConvftoull
+; P9BE: lfs
+; P9BE-NEXT: xscvdpuxds
+; P9BE-NEXT: xxspltd v2
+; P9BE-NEXT: blr
+; P9LE: lfs
+; P9LE-NEXT: xscvdpuxds
+; P9LE-NEXT: xxspltd v2
+; P9LE-NEXT: blr
+; P8BE: lxsspx
+; P8BE-NEXT: xscvdpuxds
+; P8BE-NEXT: xxspltd v2
+; P8BE-NEXT: blr
+; P8LE: lxsspx
+; P8LE-NEXT: xscvdpuxds
+; P8LE-NEXT: xxspltd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltCnstConvdtoull() {
+entry:
+  ret <2 x i64> <i64 4, i64 4>
+; P9BE-LABEL: spltCnstConvdtoull
+; P9LE-LABEL: spltCnstConvdtoull
+; P8BE-LABEL: spltCnstConvdtoull
+; P8LE-LABEL: spltCnstConvdtoull
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromRegsConvdtoull(double %a, double %b) {
+entry:
+  %conv = fptoui double %a to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %conv1 = fptoui double %b to i64
+  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %conv1, i32 1
+  ret <2 x i64> %vecinit2
+; P9BE-LABEL: fromRegsConvdtoull
+; P9LE-LABEL: fromRegsConvdtoull
+; P8BE-LABEL: fromRegsConvdtoull
+; P8LE-LABEL: fromRegsConvdtoull
+; P9BE: xxmrghd
+; P9BE-NEXT: xvcvdpuxds
+; P9BE-NEXT: blr
+; P9LE: xxmrghd
+; P9LE-NEXT: xvcvdpuxds
+; P9LE-NEXT: blr
+; P8BE: xxmrghd
+; P8BE-NEXT: xvcvdpuxds
+; P8BE-NEXT: blr
+; P8LE: xxmrghd
+; P8LE-NEXT: xvcvdpuxds
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @fromDiffConstsConvdtoull() {
+entry:
+  ret <2 x i64> <i64 24, i64 234>
+; P9BE-LABEL: fromDiffConstsConvdtoull
+; P9LE-LABEL: fromDiffConstsConvdtoull
+; P8BE-LABEL: fromDiffConstsConvdtoull
+; P8LE-LABEL: fromDiffConstsConvdtoull
+; P9BE: lxvx
+; P9BE: blr
+; P9LE: lxvx
+; P9LE: blr
+; P8BE: lxvd2x
+; P8BE: blr
+; P8LE: lxvd2x
+; P8LE: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsAConvdtoull(double* nocapture readonly %ptr) {
+entry:
+  %0 = bitcast double* %ptr to <2 x double>*
+  %1 = load <2 x double>, <2 x double>* %0, align 8
+  %2 = fptoui <2 x double> %1 to <2 x i64>
+  ret <2 x i64> %2
+; P9BE-LABEL: fromDiffMemConsAConvdtoull
+; P9LE-LABEL: fromDiffMemConsAConvdtoull
+; P8BE-LABEL: fromDiffMemConsAConvdtoull
+; P8LE-LABEL: fromDiffMemConsAConvdtoull
+; P9BE: lxvx
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: lxvx
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: lxvd2x
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: lxvd2x
+; P8LE: xxswapd
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemConsDConvdtoull(double* nocapture readonly %ptr) {
+entry:
+  %arrayidx = getelementptr inbounds double, double* %ptr, i64 3
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 2
+  %1 = load double, double* %arrayidx1, align 8
+  %conv2 = fptoui double %1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+; P9BE-LABEL: fromDiffMemConsDConvdtoull
+; P9LE-LABEL: fromDiffMemConsDConvdtoull
+; P8BE-LABEL: fromDiffMemConsDConvdtoull
+; P8LE-LABEL: fromDiffMemConsDConvdtoull
+; P9BE: lxvx
+; P9BE-NEXT: xxswapd
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: lxvx
+; P9LE-NEXT: xxswapd
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: lxvd2x
+; P8BE-NEXT: xxswapd
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: lxvd2x
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarAConvdtoull(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %add = add nsw i32 %elem, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptoui double %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarAConvdtoull
+; P9LE-LABEL: fromDiffMemVarAConvdtoull
+; P8BE-LABEL: fromDiffMemVarAConvdtoull
+; P8LE-LABEL: fromDiffMemVarAConvdtoull
+; P9BE: sldi
+; P9BE: lxvx
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x
+; P8LE-NEXT: xxswapd
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @fromDiffMemVarDConvdtoull(double* nocapture readonly %arr, i32 signext %elem) {
+entry:
+  %idxprom = sext i32 %elem to i64
+  %arrayidx = getelementptr inbounds double, double* %arr, i64 %idxprom
+  %0 = load double, double* %arrayidx, align 8
+  %conv = fptoui double %0 to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %sub = add nsw i32 %elem, -1
+  %idxprom1 = sext i32 %sub to i64
+  %arrayidx2 = getelementptr inbounds double, double* %arr, i64 %idxprom1
+  %1 = load double, double* %arrayidx2, align 8
+  %conv3 = fptoui double %1 to i64
+  %vecinit4 = insertelement <2 x i64> %vecinit, i64 %conv3, i32 1
+  ret <2 x i64> %vecinit4
+; P9BE-LABEL: fromDiffMemVarDConvdtoull
+; P9LE-LABEL: fromDiffMemVarDConvdtoull
+; P8BE-LABEL: fromDiffMemVarDConvdtoull
+; P8LE-LABEL: fromDiffMemVarDConvdtoull
+; P9BE: sldi
+; P9BE: lxvx
+; P9BE-NEXT: xxswapd
+; P9BE-NEXT: xvcvdpuxds v2
+; P9BE-NEXT: blr
+; P9LE: sldi
+; P9LE: lxvx
+; P9LE-NEXT: xxswapd
+; P9LE-NEXT: xvcvdpuxds v2
+; P9LE-NEXT: blr
+; P8BE: sldi
+; P8BE: lxvd2x
+; P8BE-NEXT: xxswapd
+; P8BE-NEXT: xvcvdpuxds v2
+; P8BE-NEXT: blr
+; P8LE: sldi
+; P8LE: lxvd2x
+; P8LE-NEXT: xvcvdpuxds v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define <2 x i64> @spltRegValConvdtoull(double %val) {
+entry:
+  %conv = fptoui double %val to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltRegValConvdtoull
+; P9LE-LABEL: spltRegValConvdtoull
+; P8BE-LABEL: spltRegValConvdtoull
+; P8LE-LABEL: spltRegValConvdtoull
+; P9BE: xscvdpuxds
+; P9BE-NEXT: xxspltd v2
+; P9BE-NEXT: blr
+; P9LE: xscvdpuxds
+; P9LE-NEXT: xxspltd v2
+; P9LE-NEXT: blr
+; P8BE: xscvdpuxds
+; P8BE-NEXT: xxspltd v2
+; P8BE-NEXT: blr
+; P8LE: xscvdpuxds
+; P8LE-NEXT: xxspltd v2
+; P8LE-NEXT: blr
+}
+
+; Function Attrs: norecurse nounwind readonly
+define <2 x i64> @spltMemValConvdtoull(double* nocapture readonly %ptr) {
+entry:
+  %0 = load double, double* %ptr, align 8
+  %conv = fptoui double %0 to i64
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %splat.splat
+; P9BE-LABEL: spltMemValConvdtoull
+; P9LE-LABEL: spltMemValConvdtoull
+; P8BE-LABEL: spltMemValConvdtoull
+; P8LE-LABEL: spltMemValConvdtoull
+; P9BE: lxvdsx
+; P9BE-NEXT: xvcvdpuxds
+; P9BE-NEXT: blr
+; P9LE: lxvdsx
+; P9LE-NEXT: xvcvdpuxds
+; P9LE-NEXT: blr
+; P8BE: lxvdsx
+; P8BE-NEXT: xvcvdpuxds
+; P8BE-NEXT: blr
+; P8LE: lxvdsx
+; P8LE-NEXT: xvcvdpuxds
+; P8LE-NEXT: blr
+}
diff --git a/test/CodeGen/PowerPC/power9-moves-and-splats.ll b/test/CodeGen/PowerPC/power9-moves-and-splats.ll
index 68995de..54eea6a 100644
--- a/test/CodeGen/PowerPC/power9-moves-and-splats.ll
+++ b/test/CodeGen/PowerPC/power9-moves-and-splats.ll
@@ -156,14 +156,10 @@
 entry:
 ; CHECK-LABEL: test14
 ; CHECK: lwz [[LD:[0-9]+]],
-; FIXME: mtvsrws 34, [[LD]]
-; CHECK: mtvsrws [[SPLT:[0-9]+]], [[LD]]
-; CHECK: xxspltw 34, [[SPLT]], 3
+; CHECK: mtvsrws 34, [[LD]]
 ; CHECK-BE-LABEL: test14
 ; CHECK-BE: lwz [[LD:[0-9]+]],
-; FIXME: mtvsrws 34, [[LD]]
-; CHECK-BE: mtvsrws [[SPLT:[0-9]+]], [[LD]]
-; CHECK-BE: xxspltw 34, [[SPLT]], 0
+; CHECK-BE: mtvsrws 34, [[LD]]
   %0 = load i32, i32* %b, align 4
   %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
   %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
diff --git a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
index 01f87ce..67146e4 100644
--- a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
+++ b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
@@ -57,11 +57,11 @@
   %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat.splat
 ; CHECK-LABEL: veculuc
-; CHECK: lxsibzx 34, 0, 3
-; CHECK-NEXT: xxspltd 34, 34, 0
+; CHECK: lxsibzx 0, 0, 3
+; CHECK-NEXT: xxspltd 34, 0, 0
 ; CHECK-BE-LABEL: veculuc
-; CHECK-BE: lxsibzx 34, 0, 3
-; CHECK-BE-NEXT: xxspltd 34, 34, 0
+; CHECK-BE: lxsibzx 0, 0, 3
+; CHECK-BE-NEXT: xxspltd 34, 0, 0
 }
 
 ; Function Attrs: norecurse nounwind readonly
@@ -120,11 +120,11 @@
   %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat.splat
 ; CHECK-LABEL: vecsluc
-; CHECK: lxsibzx 34, 0, 3
-; CHECK-NEXT: xxspltd 34, 34, 0
+; CHECK: lxsibzx 0, 0, 3
+; CHECK-NEXT: xxspltd 34, 0, 0
 ; CHECK-BE-LABEL: vecsluc
-; CHECK-BE: lxsibzx 34, 0, 3
-; CHECK-BE-NEXT: xxspltd 34, 34, 0
+; CHECK-BE: lxsibzx 0, 0, 3
+; CHECK-BE-NEXT: xxspltd 34, 0, 0
 }
 
 ; Function Attrs: norecurse nounwind readonly
@@ -366,11 +366,11 @@
   %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat.splat
 ; CHECK-LABEL: veculus
-; CHECK: lxsihzx 34, 0, 3
-; CHECK-NEXT: xxspltd 34, 34, 0
+; CHECK: lxsihzx 0, 0, 3
+; CHECK-NEXT: xxspltd 34, 0, 0
 ; CHECK-BE-LABEL: veculus
-; CHECK-BE: lxsihzx 34, 0, 3
-; CHECK-BE-NEXT: xxspltd 34, 34, 0
+; CHECK-BE: lxsihzx 0, 0, 3
+; CHECK-BE-NEXT: xxspltd 34, 0, 0
 }
 
 ; Function Attrs: norecurse nounwind readonly
@@ -430,11 +430,11 @@
   %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat.splat
 ; CHECK-LABEL: vecslus
-; CHECK: lxsihzx 34, 0, 3
-; CHECK-NEXT: xxspltd 34, 34, 0
+; CHECK: lxsihzx 0, 0, 3
+; CHECK-NEXT: xxspltd 34, 0, 0
 ; CHECK-BE-LABEL: vecslus
-; CHECK-BE: lxsihzx 34, 0, 3
-; CHECK-BE-NEXT: xxspltd 34, 34, 0
+; CHECK-BE: lxsihzx 0, 0, 3
+; CHECK-BE-NEXT: xxspltd 34, 0, 0
 }
 
 ; Function Attrs: norecurse nounwind readonly
diff --git a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
index 0944adb..3439ebc 100644
--- a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
+++ b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
@@ -1,13 +1,23 @@
-; RUN: llc -march=x86-64 -mattr=-sse4.2,+sse4.1 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown -mattr=-sse4.2,+sse4.1 < %s | FileCheck %s
+
 ; Make sure we don't load from the location pointed to by %p
 ; twice: it has non-obvious performance implications, and
 ; the relevant transformation doesn't know how to update
 ; the chains correctly.
 ; PR10747
 
-; CHECK-LABEL: test:
-; CHECK: pextrd $2, %xmm
 define <4 x i32> @test(<4 x i32>* %p) {
+; CHECK-LABEL: test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-NEXT:    pextrd $2, %xmm0, %eax
+; CHECK-NEXT:    cmpl $3, %eax
+; CHECK-NEXT:    je .LBB0_2
+; CHECK-NEXT:  # BB#1:
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    retq
   %v = load <4 x i32>, <4 x i32>* %p
   %e = extractelement <4 x i32> %v, i32 2
   %cmp = icmp eq i32 %e, 3
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index 2d1b596..85f4c9e 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -19,7 +19,7 @@
 ; CHECK-LABEL: bad_insert:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovaps %ymm0, (%eax)
+; CHECK-NEXT:    vmovdqa %ymm0, (%eax)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
   %v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
index 0332d3e..e847eda 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
@@ -25,6 +25,8 @@
 ; CHECK-NEXT:   hasMustTailInVarArgFunc: false
 ; CHECK-NEXT: body:
 ; CHECK-NEXT:   bb.0:
+; CHECK-NEXT:     successors: %bb.1(0x80000000)
+; CHECK:        bb.1:
 ; CHECK-NEXT:     RET 0
 entry:
   ret void
diff --git a/test/CodeGen/X86/absolute-bit-mask.ll b/test/CodeGen/X86/absolute-bit-mask.ll
new file mode 100644
index 0000000..6e11949
--- /dev/null
+++ b/test/CodeGen/X86/absolute-bit-mask.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -relocation-model=pic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@bit_mask8 = external hidden global i8, !absolute_symbol !0
+@bit_mask32 = external hidden global i8, !absolute_symbol !1
+@bit_mask64 = external hidden global i8, !absolute_symbol !2
+
+declare void @f()
+
+define void @foo8(i8* %ptr) {
+  %load = load i8, i8* %ptr
+  ; CHECK: testb $bit_mask8, (%rdi)
+  %and = and i8 %load, ptrtoint (i8* @bit_mask8 to i8)
+  %icmp = icmp eq i8 %and, 0
+  br i1 %icmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+define void @foo32(i32* %ptr) {
+  %load = load i32, i32* %ptr
+  ; CHECK: testl $bit_mask32, (%rdi)
+  %and = and i32 %load, ptrtoint (i8* @bit_mask32 to i32)
+  %icmp = icmp eq i32 %and, 0
+  br i1 %icmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+define void @foo64(i64* %ptr) {
+  %load = load i64, i64* %ptr
+  ; CHECK: movabsq $bit_mask64, %rax
+  ; CHECK: testq (%rdi), %rax
+  %and = and i64 %load, ptrtoint (i8* @bit_mask64 to i64)
+  %icmp = icmp eq i64 %and, 0
+  br i1 %icmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+!0 = !{i64 0, i64 256}
+!1 = !{i64 0, i64 4294967296}
+!2 = !{i64 -1, i64 -1}
diff --git a/test/CodeGen/X86/absolute-bt.ll b/test/CodeGen/X86/absolute-bt.ll
new file mode 100644
index 0000000..ffc16bc
--- /dev/null
+++ b/test/CodeGen/X86/absolute-bt.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -relocation-model=pic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@bit_mask8 = external hidden global i8, !absolute_symbol !0
+@bit_mask32 = external hidden global i8, !absolute_symbol !1
+@bit_mask64 = external hidden global i8, !absolute_symbol !2
+
+declare void @f()
+
+define void @foo32(i32* %ptr) {
+  %load = load i32, i32* %ptr
+  %and = and i32 %load, 31
+  %shl = shl i32 1, %and
+  %and2 = and i32 %shl, ptrtoint (i8* @bit_mask32 to i32)
+  ; CHECK: movl $bit_mask32, %eax
+  ; CHECK: btl %ecx, %eax
+  %icmp = icmp eq i32 %and2, 0
+  br i1 %icmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+define void @foo64(i64* %ptr) {
+  %load = load i64, i64* %ptr
+  %and = and i64 %load, 63
+  %shl = shl i64 1, %and
+  %and2 = and i64 %shl, ptrtoint (i8* @bit_mask64 to i64)
+  ; CHECK: movabsq $bit_mask64, %rax
+  ; CHECK: btq %rcx, %rax
+  %icmp = icmp eq i64 %and2, 0
+  br i1 %icmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+!0 = !{i64 0, i64 256}
+!1 = !{i64 0, i64 4294967296}
+!2 = !{i64 -1, i64 -1}
diff --git a/test/CodeGen/X86/absolute-constant.ll b/test/CodeGen/X86/absolute-constant.ll
new file mode 100644
index 0000000..d93fb27
--- /dev/null
+++ b/test/CodeGen/X86/absolute-constant.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -relocation-model=pic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@foo = external global i8, align 1, !absolute_symbol !0
+
+define void @bar(i8* %x) {
+entry:
+  %0 = load i8, i8* %x, align 1
+  %conv = sext i8 %0 to i32
+  ; CHECK: testb $foo, (%rdi)
+  %and = and i32 %conv, sext (i8 ptrtoint (i8* @foo to i8) to i32)
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void (...) @xf()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+declare void @xf(...)
+
+!0 = !{i32 0, i32 256}
diff --git a/test/CodeGen/X86/absolute-rotate.ll b/test/CodeGen/X86/absolute-rotate.ll
new file mode 100644
index 0000000..c0ecb82
--- /dev/null
+++ b/test/CodeGen/X86/absolute-rotate.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -relocation-model=pic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@align = external hidden global i8, !absolute_symbol !0
+
+declare void @f()
+
+define void @foo(i64 %val) {
+  %shr = lshr i64 %val, zext (i8 ptrtoint (i8* @align to i8) to i64)
+  %shl = shl i64 %val, zext (i8 sub (i8 64, i8 ptrtoint (i8* @align to i8)) to i64)
+  ; CHECK: rorq $align, %rdi
+  %ror = or i64 %shr, %shl
+  %cmp = icmp ult i64 %ror, 109
+  br i1 %cmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+!0 = !{i64 0, i64 256}
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index a3b3353..99d3f20 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2132,7 +2132,7 @@
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpaddd %zmm4, %zmm3, %zmm3
 ; AVX512F-NEXT:    vpaddd %zmm4, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpaddd %zmm4, %zmm1, %zmm1
@@ -2405,7 +2405,7 @@
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/avx-arith.ll b/test/CodeGen/X86/avx-arith.ll
index 792a998..66c09e0 100644
--- a/test/CodeGen/X86/avx-arith.ll
+++ b/test/CodeGen/X86/avx-arith.ll
@@ -1,121 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
-; CHECK: vaddpd
 define <4 x double> @addpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: addpd256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <4 x double> %x, %y
   ret <4 x double> %add.i
 }
 
-; CHECK: vaddpd LCP{{.*}}(%rip)
 define <4 x double> @addpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
+; CHECK-LABEL: addpd256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <4 x double> %add.i
 }
 
-; CHECK: vaddps
 define <8 x float> @addps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: addps256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <8 x float> %x, %y
   ret <8 x float> %add.i
 }
 
-; CHECK: vaddps LCP{{.*}}(%rip)
 define <8 x float> @addps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
+; CHECK-LABEL: addps256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <8 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
   ret <8 x float> %add.i
 }
 
-; CHECK: vsubpd
 define <4 x double> @subpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: subpd256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <4 x double> %x, %y
   ret <4 x double> %sub.i
 }
 
-; CHECK: vsubpd (%
 define <4 x double> @subpd256fold(<4 x double> %y, <4 x double>* nocapture %x) nounwind uwtable readonly ssp {
+; CHECK-LABEL: subpd256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubpd (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %tmp2 = load <4 x double>, <4 x double>* %x, align 32
   %sub.i = fsub <4 x double> %y, %tmp2
   ret <4 x double> %sub.i
 }
 
-; CHECK: vsubps
 define <8 x float> @subps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: subps256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <8 x float> %x, %y
   ret <8 x float> %sub.i
 }
 
-; CHECK: vsubps (%
 define <8 x float> @subps256fold(<8 x float> %y, <8 x float>* nocapture %x) nounwind uwtable readonly ssp {
+; CHECK-LABEL: subps256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %tmp2 = load <8 x float>, <8 x float>* %x, align 32
   %sub.i = fsub <8 x float> %y, %tmp2
   ret <8 x float> %sub.i
 }
 
-; CHECK: vmulpd
 define <4 x double> @mulpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: mulpd256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <4 x double> %x, %y
   ret <4 x double> %mul.i
 }
 
-; CHECK: vmulpd LCP{{.*}}(%rip)
 define <4 x double> @mulpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
+; CHECK-LABEL: mulpd256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <4 x double> %mul.i
 }
 
-; CHECK: vmulps
 define <8 x float> @mulps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: mulps256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <8 x float> %x, %y
   ret <8 x float> %mul.i
 }
 
-; CHECK: vmulps LCP{{.*}}(%rip)
 define <8 x float> @mulps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
+; CHECK-LABEL: mulps256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <8 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
   ret <8 x float> %mul.i
 }
 
-; CHECK: vdivpd
 define <4 x double> @divpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: divpd256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <4 x double> %x, %y
   ret <4 x double> %div.i
 }
 
-; CHECK: vdivpd LCP{{.*}}(%rip)
 define <4 x double> @divpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
+; CHECK-LABEL: divpd256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivpd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <4 x double> %div.i
 }
 
-; CHECK: vdivps
 define <8 x float> @divps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
+; CHECK-LABEL: divps256:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <8 x float> %x, %y
   ret <8 x float> %div.i
 }
 
-; CHECK: vdivps LCP{{.*}}(%rip)
 define <8 x float> @divps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
+; CHECK-LABEL: divps256fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivps {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <8 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
   ret <8 x float> %div.i
 }
 
-; CHECK: vsqrtss
 define float @sqrtA(float %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtA:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %conv1 = tail call float @sqrtf(float %a) nounwind readnone
   ret float %conv1
@@ -123,8 +175,11 @@
 
 declare double @sqrt(double) readnone
 
-; CHECK: vsqrtsd
 define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtB:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %call = tail call double @sqrt(double %a) nounwind readnone
   ret double %call
@@ -133,128 +188,161 @@
 declare float @sqrtf(float) readnone
 
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpaddq:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = add <4 x i64> %i, %j
   ret <4 x i64> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpaddd %xmm
-; CHECK-NEXT: vpaddd %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpaddd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = add <8 x i32> %i, %j
   ret <8 x i32> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpaddw %xmm
-; CHECK-NEXT: vpaddw %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; CHECK-LABEL: vpaddw:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = add <16 x i16> %i, %j
   ret <16 x i16> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpaddb %xmm
-; CHECK-NEXT: vpaddb %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+; CHECK-LABEL: vpaddb:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = add <32 x i8> %i, %j
   ret <32 x i8> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpsubq %xmm
-; CHECK-NEXT: vpsubq %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpsubq:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = sub <4 x i64> %i, %j
   ret <4 x i64> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpsubd %xmm
-; CHECK-NEXT: vpsubd %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpsubd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = sub <8 x i32> %i, %j
   ret <8 x i32> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpsubw %xmm
-; CHECK-NEXT: vpsubw %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; CHECK-LABEL: vpsubw:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = sub <16 x i16> %i, %j
   ret <16 x i16> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpsubb %xmm
-; CHECK-NEXT: vpsubb %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+; CHECK-LABEL: vpsubb:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = sub <32 x i8> %i, %j
   ret <32 x i8> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpmulld %xmm
-; CHECK-NEXT: vpmulld %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpmulld:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = mul <8 x i32> %i, %j
   ret <8 x i32> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpmullw %xmm
-; CHECK-NEXT: vpmullw %xmm
-; CHECK-NEXT: vinsertf128 $1
 define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+; CHECK-LABEL: vpmullw:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = mul <16 x i16> %i, %j
   ret <16 x i16> %x
 }
 
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsrlq $32, %xmm
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsllq $32, %xmm
-; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpsrlq $32, %xmm
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsllq $32, %xmm
-; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsrlq $32, %xmm
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsllq $32, %xmm
-; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpsrlq $32, %xmm
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsllq $32, %xmm
-; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; CHECK-LABEL: mul_v4i64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpmuludq %xmm2, %xmm3, %xmm4
+; CHECK-NEXT:    vpsrlq $32, %xmm2, %xmm5
+; CHECK-NEXT:    vpmuludq %xmm5, %xmm3, %xmm5
+; CHECK-NEXT:    vpsllq $32, %xmm5, %xmm5
+; CHECK-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
+; CHECK-NEXT:    vpsrlq $32, %xmm3, %xmm3
+; CHECK-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; CHECK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; CHECK-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
+; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; CHECK-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
+; CHECK-NEXT:    vpsllq $32, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpsllq $32, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %x = mul <4 x i64> %i, %j
   ret <4 x i64> %x
 }
@@ -262,10 +350,26 @@
 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
 
 define <4 x float> @int_sqrt_ss() {
-; CHECK: int_sqrt_ss
-; CHECK: vsqrtss
+; CHECK-LABEL: int_sqrt_ss:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
  %x0 = load float, float addrspace(1)* undef, align 8
  %x1 = insertelement <4 x float> undef, float %x0, i32 0
  %x2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x1) nounwind
  ret <4 x float> %x2
 }
+
+define <2 x double> @vector_sqrt_scalar_load(double* %a0) optsize {
+; CHECK-LABEL: vector_sqrt_scalar_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vsqrtpd %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a1 = load double, double* %a0
+  %a2 = insertelement <2 x double> undef, double %a1, i32 0
+  %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a2) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 54f8098..22d017e 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -793,7 +793,7 @@
 ; X64-AVX2-LABEL: _inreg0:
 ; X64-AVX2:       ## BB#0:
 ; X64-AVX2-NEXT:    vmovd %edi, %xmm0
-; X64-AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; X64-AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X64-AVX512VL-LABEL: _inreg0:
@@ -1469,9 +1469,9 @@
 ; X64-AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-AVX2-NEXT:    movl (%rdi), %eax
 ; X64-AVX2-NEXT:    vmovd %eax, %xmm1
-; X64-AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; X64-AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX2-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX2-NEXT:    retq
 ;
 ; X64-AVX512VL-LABEL: isel_crash_4d:
@@ -1538,9 +1538,9 @@
 ; X64-AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-AVX2-NEXT:    movl (%rdi), %eax
 ; X64-AVX2-NEXT:    vmovd %eax, %xmm1
-; X64-AVX2-NEXT:    vbroadcastss %xmm1, %ymm1
+; X64-AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
 ; X64-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX2-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX2-NEXT:    movq %rbp, %rsp
 ; X64-AVX2-NEXT:    popq %rbp
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1723,9 +1723,9 @@
 ; X64-AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-AVX2-NEXT:    movq (%rdi), %rax
 ; X64-AVX2-NEXT:    vmovq %rax, %xmm1
-; X64-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
+; X64-AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; X64-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX2-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX2-NEXT:    movq %rbp, %rsp
 ; X64-AVX2-NEXT:    popq %rbp
 ; X64-AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index ddd059e..c134fc3 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4707,15 +4707,15 @@
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm5
-; CHECK-NEXT:    vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
-; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
-; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm0, %xmm0
+; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4735,15 +4735,15 @@
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm5
-; CHECK-NEXT:    vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
-; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
-; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm0, %xmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -4762,10 +4762,10 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm1, %xmm3
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, %xmm3
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
@@ -4780,8 +4780,7 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
@@ -4797,13 +4796,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
@@ -4825,13 +4824,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
@@ -4853,13 +4852,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
@@ -4881,13 +4880,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
@@ -4909,13 +4908,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfnmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
@@ -4937,13 +4936,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfnmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
@@ -4988,8 +4987,7 @@
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kxorw %k0, %k0, %k1
-; CHECK-NEXT:    vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load float, float* %ptr_b
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
diff --git a/test/CodeGen/X86/bit-piece-comment.ll b/test/CodeGen/X86/bit-piece-comment.ll
index 9ebe5bc..8eb258a 100644
--- a/test/CodeGen/X86/bit-piece-comment.ll
+++ b/test/CodeGen/X86/bit-piece-comment.ll
@@ -59,5 +59,5 @@
 !18 = !{!"clang version 3.8.0 (trunk 256088) (llvm/trunk 256097)"}
 !19 = !DIExpression()
 !20 = !DILocation(line: 4, column: 5, scope: !4)
-!21 = !DIExpression(DW_OP_bit_piece, 32, 32)
+!21 = !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 !22 = !DILocation(line: 6, column: 1, scope: !4)
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
new file mode 100644
index 0000000..b13965a
--- /dev/null
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -0,0 +1,1205 @@
+; NOTE: Assertions have been simplified MANUALLY after running utils/update_llc_test_checks.py
+;       Assertions for constant pools have been added MANUALLY.
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2 
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512 
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512 
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64
+
+;===-----------------------------------------------------------------------------===
+;    This test checks the ability to recognize a cross-element pattern of
+;    constants and to perform the load by broadcasting a smaller constant
+;    vector.
+;    For example:
+;    <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
+;===-----------------------------------------------------------------------------===
+
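+; As a minimal illustrative sketch (not itself part of the checked output, and
+; with placeholder value names), the rewrite being exercised looks like this
+; for the 32-bit-element case:
+;
+;   %r = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
+;
+; where the repeating <i32 0, i32 1> pair is expected to be materialized with a
+; 64-bit broadcast (e.g. vpbroadcastq of the constant 0x0000000100000000)
+; rather than a full-width constant-pool load.
+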
+; ALL:       LCPI0
+; ALL-NEXT:  .short	256                     # 0x100
+
+define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i16:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastw {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi8_i16:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastw {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = add <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
+  %res2 = and <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
+  ret <16 x i8> %res2
+}
+
+
+; ALL:       .LCPI1
+; ALL-NEXT:  .long	50462976                # 0x3020100
+
+; AVX:       .LCPI1
+; AVX-NEXT:  .long	50462976                # float 3.82047143E-37
+
+define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i32:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi8_i32:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f16xi8_i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastss {{\.LCPI.*}}, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+  %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+  %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+  ret <16 x i8> %res2
+}
+
+
+; ALL64:       .LCPI2
+; ALL64-NEXT:  .quad	506097522914230528      # 0x706050403020100
+
+; AVX:         .LCPI2
+; AVX-NEXT:    .quad	506097522914230528      # double 7.9499288951273625E-275
+
+define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi8_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f16xi8_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+  %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+  %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+  ret <16 x i8> %res2
+}
+
+
+; ALL:       .LCPI3
+; ALL-NEXT:  .short	256                     # 0x100
+
+define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i16:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastw {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f32xi8_i16:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
+  %res2 = and <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
+  ret <32 x i8> %res2
+}
+
+
+; ALL:       .LCPI4
+; ALL-NEXT:  .long	50462976                # 0x3020100
+
+; AVX:       .LCPI4
+; AVX-NEXT:  .long	50462976                # float 3.82047143E-37
+
+define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i32:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f32xi8_i32:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f32xi8_i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vbroadcastss {{\.LCPI.*}}, %xmm2
+; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+  ret <32 x i8> %res2
+}
+
+
+; ALL64:       .LCPI5
+; ALL64-NEXT:  .quad	506097522914230528      # 0x706050403020100
+
+; AVX:         .LCPI5
+; AVX-NEXT:    .quad	506097522914230528      # double 7.9499288951273625E-275
+
+define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastq {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f32xi8_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f32xi8_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+  ret <32 x i8> %res2
+}
+
+
+; ALL:       .LCPI6
+; ALL-NEXT:  .byte	0                       # 0x0
+; ALL-NEXT:  .byte	1                       # 0x1
+; ALL-NEXT:  .byte	2                       # 0x2
+; ALL-NEXT:  .byte	3                       # 0x3
+; ALL-NEXT:  .byte	4                       # 0x4
+; ALL-NEXT:  .byte	5                       # 0x5
+; ALL-NEXT:  .byte	6                       # 0x6
+; ALL-NEXT:  .byte	7                       # 0x7
+; ALL-NEXT:  .byte	8                       # 0x8
+; ALL-NEXT:  .byte	9                       # 0x9
+; ALL-NEXT:  .byte	10                      # 0xa
+; ALL-NEXT:  .byte	11                      # 0xb
+; ALL-NEXT:  .byte	12                      # 0xc
+; ALL-NEXT:  .byte	13                      # 0xd
+; ALL-NEXT:  .byte	14                      # 0xe
+; ALL-NEXT:  .byte	15                      # 0xf
+; ALL-NOT:   .byte
+
+define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
+; ALL-LABEL: f32xi8_i128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
+  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
+  ret <32 x i8> %res2
+}
+
+
+; ALL:       .LCPI7
+; ALL-NEXT:  .short	256                     # 0x100
+
+define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i16:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastw {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastw {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
+  %res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
+  ret <64 x i8> %res2
+}
+
+
+; ALL:       .LCPI8
+; ALL-NEXT:  .long	50462976                # 0x3020100
+
+; AVX:       .LCPI8
+; AVX-NEXT:  .long	50462976                # float 3.82047143E-37
+
+define <64 x i8> @f64i8_i32(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64i8_i32:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64i8_i32:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f64i8_i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT:    vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
+  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+  ret <64 x i8> %res2
+}
+
+
+; ALL64:         .LCPI9
+; ALL64-NEXT:    .quad	506097522914230528      # 0x706050403020100
+
+; ALL32:         .LCPI9
+; ALL32-NEXT:    .quad	506097522914230528      # double 7.9499288951273625E-275
+
+; AVX:           .LCPI9
+; AVX-NEXT:      .quad	506097522914230528      # double 7.9499288951273625E-275
+
+define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i64:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i64:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f64xi8_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
+  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+  ret <64 x i8> %res2
+}
+
+
+; ALL:       .LCPI10
+; ALL-NEXT:  .byte	0                       # 0x0
+; ALL-NEXT:  .byte	1                       # 0x1
+; ALL-NEXT:  .byte	2                       # 0x2
+; ALL-NEXT:  .byte	3                       # 0x3
+; ALL-NEXT:  .byte	4                       # 0x4
+; ALL-NEXT:  .byte	5                       # 0x5
+; ALL-NEXT:  .byte	6                       # 0x6
+; ALL-NEXT:  .byte	7                       # 0x7
+; ALL-NEXT:  .byte	8                       # 0x8
+; ALL-NEXT:  .byte	9                       # 0x9
+; ALL-NEXT:  .byte	10                      # 0xa
+; ALL-NEXT:  .byte	11                      # 0xb
+; ALL-NEXT:  .byte	12                      # 0xc
+; ALL-NEXT:  .byte	13                      # 0xd
+; ALL-NEXT:  .byte	14                      # 0xe
+; ALL-NEXT:  .byte	15                      # 0xf
+; ALL-NOT:   .byte
+
+define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i128:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i128:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
+  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
+  ret <64 x i8> %res2
+}
+
+
+; AVX512BW:       .LCPI11
+; AVX512BW-NEXT:  .byte	0                       # 0x0
+; AVX512BW-NEXT:  .byte	1                       # 0x1
+; AVX512BW-NEXT:  .byte	2                       # 0x2
+; AVX512BW-NEXT:  .byte	3                       # 0x3
+; AVX512BW-NEXT:  .byte	4                       # 0x4
+; AVX512BW-NEXT:  .byte	5                       # 0x5
+; AVX512BW-NEXT:  .byte	6                       # 0x6
+; AVX512BW-NEXT:  .byte	7                       # 0x7
+; AVX512BW-NEXT:  .byte	8                       # 0x8
+; AVX512BW-NEXT:  .byte	9                       # 0x9
+; AVX512BW-NEXT:  .byte	10                      # 0xa
+; AVX512BW-NEXT:  .byte	11                      # 0xb
+; AVX512BW-NEXT:  .byte	12                      # 0xc
+; AVX512BW-NEXT:  .byte	13                      # 0xd
+; AVX512BW-NEXT:  .byte	14                      # 0xe
+; AVX512BW-NEXT:  .byte	15                      # 0xf
+; AVX512BW-NEXT:  .byte	16                      # 0x10
+; AVX512BW-NEXT:  .byte	17                      # 0x11
+; AVX512BW-NEXT:  .byte	18                      # 0x12
+; AVX512BW-NEXT:  .byte	19                      # 0x13
+; AVX512BW-NEXT:  .byte	20                      # 0x14
+; AVX512BW-NEXT:  .byte	21                      # 0x15
+; AVX512BW-NEXT:  .byte	22                      # 0x16
+; AVX512BW-NEXT:  .byte	23                      # 0x17
+; AVX512BW-NEXT:  .byte	24                      # 0x18
+; AVX512BW-NEXT:  .byte	25                      # 0x19
+; AVX512BW-NEXT:  .byte	26                      # 0x1a
+; AVX512BW-NEXT:  .byte	27                      # 0x1b
+; AVX512BW-NEXT:  .byte	28                      # 0x1c
+; AVX512BW-NEXT:  .byte	29                      # 0x1d
+; AVX512BW-NEXT:  .byte	30                      # 0x1e
+; AVX512BW-NEXT:  .byte	31                      # 0x1f
+; AVX512BW-NOT:   .byte
+
+define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
+; AVX512BW-LABEL: f64xi8_i256:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
+  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
+  ret <64 x i8> %res2
+}
+
+
+; ALL:       .LCPI12
+; ALL-NEXT:  .long	65536                   # 0x10000
+
+; AVX:       .LCPI12
+; AVX-NEXT:  .long	65536                   # float 9.18354962E-41
+
+define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
+; ALL32-LABEL: f8xi16_i32:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f8xi16_i32:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f8xi16_i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastss {{\.LCPI.*}}, %xmm1
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+  %res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+  %res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+  ret <8 x i16> %res2
+}
+
+
+; ALL64:       .LCPI13
+; ALL64-NEXT:  .quad	844433520132096         # 0x3000200010000
+
+; ALL32:       .LCPI13
+; ALL32-NEXT:  .quad	844433520132096         # double 4.1720559249406128E-309
+
+; AVX:         .LCPI13
+; AVX-NEXT:    .quad	844433520132096         # double 4.1720559249406128E-309
+
+define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
+; ALL32-LABEL: f8xi16_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f8xi16_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f8xi16_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+  %res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+  %res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+  ret <8 x i16> %res2
+}
+
+
+; ALL:       .LCPI14
+; ALL-NEXT:  .long	65536                   # 0x10000
+
+; AVX:       .LCPI14
+; AVX-NEXT:  .long	65536                   # float 9.18354962E-41
+
+define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
+; ALL-LABEL: f16xi16_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+;
+; AVX-LABEL: f16xi16_i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vbroadcastss {{\.LCPI.*}}, %xmm2
+; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+  %res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+  %res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+  ret <16 x i16> %res2
+}
+
+
+; ALL64:       .LCPI15
+; ALL64-NEXT:  .quad	844433520132096         # 0x3000200010000
+
+; ALL32:       .LCPI15
+; ALL32-NEXT:  .quad	844433520132096         # double 4.1720559249406128E-309
+
+; AVX:         .LCPI15
+; AVX-NEXT:    .quad	844433520132096         # double 4.1720559249406128E-309
+
+define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
+; ALL-LABEL: f16xi16_i64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq {{.*}}, %ymm1
+; ALL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+;
+; AVX-LABEL: f16xi16_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+  %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+  %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+  ret <16 x i16> %res2
+}
+
+
+; ALL:       .LCPI16
+; ALL-NEXT:  .short	0                       # 0x0
+; ALL-NEXT:  .short	1                       # 0x1
+; ALL-NEXT:  .short	2                       # 0x2
+; ALL-NEXT:  .short	3                       # 0x3
+; ALL-NEXT:  .short	4                       # 0x4
+; ALL-NEXT:  .short	5                       # 0x5
+; ALL-NEXT:  .short	6                       # 0x6
+; ALL-NEXT:  .short	7                       # 0x7
+; ALL-NOT:   .short
+
+define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
+; ALL-LABEL: f16xi16_i128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+  %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
+  %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
+  ret <16 x i16> %res2
+}
+
+
+; ALL:       .LCPI17
+; ALL-NEXT:  .long	65536                   # 0x10000
+
+; AVX:       .LCPI17
+; AVX-NEXT:  .long	65536                   # float 9.18354962E-41
+
+define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i32:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i32:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f32xi16_i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT:    vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
+  %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+  %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+  ret <32 x i16> %res2
+}
+
+
+; ALL64:         .LCPI18
+; ALL64-NEXT:    .quad	844433520132096         # 0x3000200010000
+
+; ALL32:         .LCPI18
+; ALL32-NEXT:    .quad	844433520132096         # double 4.1720559249406128E-309
+
+; AVX:           .LCPI18
+; AVX-NEXT:      .quad	844433520132096         # double 4.1720559249406128E-309
+
+define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i64:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i64:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f32xi16_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
+  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+  ret <32 x i16> %res2
+}
+
+
+; ALL:       .LCPI19
+; ALL-NEXT:  .short	0                       # 0x0
+; ALL-NEXT:  .short	1                       # 0x1
+; ALL-NEXT:  .short	2                       # 0x2
+; ALL-NEXT:  .short	3                       # 0x3
+; ALL-NEXT:  .short	4                       # 0x4
+; ALL-NEXT:  .short	5                       # 0x5
+; ALL-NEXT:  .short	6                       # 0x6
+; ALL-NEXT:  .short	7                       # 0x7
+; ALL-NOT:   .short
+
+define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i128:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i128:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
+  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
+  ret <32 x i16> %res2
+}
+
+
+; AVX512BW:       .LCPI20
+; AVX512BW-NEXT:  .short	0                       # 0x0
+; AVX512BW-NEXT:  .short	1                       # 0x1
+; AVX512BW-NEXT:  .short	2                       # 0x2
+; AVX512BW-NEXT:  .short	3                       # 0x3
+; AVX512BW-NEXT:  .short	4                       # 0x4
+; AVX512BW-NEXT:  .short	5                       # 0x5
+; AVX512BW-NEXT:  .short	6                       # 0x6
+; AVX512BW-NEXT:  .short	7                       # 0x7
+; AVX512BW-NEXT:  .short	8                       # 0x8
+; AVX512BW-NEXT:  .short	9                       # 0x9
+; AVX512BW-NEXT:  .short	10                      # 0xa
+; AVX512BW-NEXT:  .short	11                      # 0xb
+; AVX512BW-NEXT:  .short	12                      # 0xc
+; AVX512BW-NEXT:  .short	13                      # 0xd
+; AVX512BW-NEXT:  .short	14                      # 0xe
+; AVX512BW-NEXT:  .short	15                      # 0xf
+; AVX512BW-NOT:   .short
+
+define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
+; AVX512BW-LABEL: f32xi16_i256:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
+  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
+  ret <32 x i16> %res2
+}
+
+
+; ALL64:       .LCPI21
+; ALL64-NEXT:  .quad	4294967296              # 0x100000000
+
+; ALL32:       .LCPI21
+; ALL32-NEXT:  .quad	4294967296              # double 2.1219957909652723E-314
+
+; AVX:         .LCPI21
+; AVX-NEXT:    .quad	4294967296              # double 2.1219957909652723E-314
+
+define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
+; ALL32-LABEL: f4xi32_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f4xi32_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f4xi32_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+  %res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
+  %res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1
+  ret <4 x i32> %res2
+}
+
+
+; ALL64:       .LCPI22
+; ALL64-NEXT:  .quad	4294967296              # 0x100000000
+
+; ALL32:       .LCPI22
+; ALL32-NEXT:  .quad	4294967296              # double 2.1219957909652723E-314
+
+; AVX:         .LCPI22
+; AVX-NEXT:    .quad	4294967296              # double 2.1219957909652723E-314
+
+define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
+; ALL-LABEL: f8xi32_i64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq {{.*}}, %ymm1
+; ALL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+;
+; AVX-LABEL: f8xi32_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+  %res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
+  %res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
+  ret <8 x i32> %res2
+}
+
+
+; ALL:       .LCPI23
+; ALL-NEXT:  .long	0                       # 0x0
+; ALL-NEXT:  .long	1                       # 0x1
+; ALL-NEXT:  .long	2                       # 0x2
+; ALL-NEXT:  .long	3                       # 0x3
+; ALL-NOT:   .long
+
+define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
+; ALL-LABEL: f8xi32_i128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+  %res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
+  %res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
+  ret <8 x i32> %res2
+}
+
+
+; ALL64:         .LCPI24
+; ALL64-NEXT:    .quad	4294967296              # 0x100000000
+
+; ALL32:         .LCPI24
+; ALL32-NEXT:    .quad	4294967296              # double 2.1219957909652723E-314
+
+; AVX:           .LCPI24
+; AVX-NEXT:      .quad	4294967296              # double 2.1219957909652723E-314
+
+define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
+; AVX2-LABEL: f16xi32_i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq {{.*}}, %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512-LABEL: f16xi32_i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpbroadcastq {{.*}}, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+;
+; AVX-LABEL: f16xi32_i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
+  %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
+  %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
+  ret <16 x i32> %res2
+}
+
+
+; ALL:       .LCPI25
+; ALL-NEXT:  .long	0                       # 0x0
+; ALL-NEXT:  .long	1                       # 0x1
+; ALL-NEXT:  .long	2                       # 0x2
+; ALL-NEXT:  .long	3                       # 0x3
+; ALL-NOT:   .long
+
+define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX2-LABEL: f16xi32_i128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512-LABEL: f16xi32_i128:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+  %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
+  %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
+  ret <16 x i32> %res2
+}
+
+
+; ALL64:       .LCPI26
+; ALL64-NEXT:  .quad	0                       # 0x0
+; ALL64-NEXT:  .quad	1                       # 0x1
+; ALL64-NOT:   .quad
+
+define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
+; ALL64-LABEL: f4xi64_i128:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %a
+  %res2 = and <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %res1
+  ret <4 x i64> %res2
+}
+
+
+; ALL64:       .LCPI27
+; ALL64-NEXT:  .quad	0                       # 0x0
+; ALL64-NEXT:  .quad	1                       # 0x1
+; ALL64-NOT:   .quad
+
+define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
+; AVX2-64-LABEL: f8xi64_i128:
+; AVX2-64:       # BB#0:
+; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    retq
+;
+; AVX512F-64-LABEL: f8xi64_i128:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f8xi64_i128:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a
+  %res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1
+  ret <8 x i64> %res2
+}
+
+
+; ALL64:            .LCPI28
+; ALL64-NEXT:       .quad	0                       # 0x0
+; ALL64-NEXT:       .quad	1                       # 0x1
+; ALL64-NEXT:       .quad	2                       # 0x2
+; ALL64-NEXT:       .quad	3                       # 0x3
+; ALL64-NOT:        .quad
+
+define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
+; AVX512F-64-LABEL: f8xi64_i256:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f8xi64_i256:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a
+  %res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1
+  ret <8 x i64> %res2
+}
+
+
+; ALL:       .LCPI29
+; ALL-NEXT:  .quad	4575657222482165760
+
+; AVX:       .LCPI29
+; AVX-NEXT:  .quad	4575657222482165760     # double 0.0078125018626451492
+
+define <4 x float> @f4xf32_f64(<4 x float> %a) {
+; ALL32-LABEL: f4xf32_f64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f4xf32_f64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; ALL64-NEXT:    retq
+;
+; AVX-LABEL: f4xf32_f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+  %res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a
+  %res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1
+  ret <4 x float> %res2
+}
+
+
+; ALL64:       .LCPI30
+; ALL64-NEXT:  .quad	4575657222482165760     # 0x3f80000040000000
+
+; ALL32:         .LCPI30
+; ALL32-NEXT:    .quad	4575657222482165760     # double 0.0078125018626451492
+
+; AVX:         .LCPI30
+; AVX-NEXT:    .quad	4575657222482165760     # double 0.0078125018626451492
+
+define <8 x float> @f8xf32_f64(<8 x float> %a) {
+; ALL-LABEL: f8xf32_f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd {{.*}}, %ymm1
+; ALL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+;
+; AVX-LABEL: f8xf32_f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastsd {{\.LCPI.*}}, %ymm1
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+  %res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
+  %res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
+  ret <8 x float> %res2
+}
+
+
+; ALL:       .LCPI31
+; ALL-NEXT:  .long	1082130432              # float 4
+; ALL-NEXT:  .long	1065353216              # float 1
+; ALL-NEXT:  .long	1073741824              # float 2
+; ALL-NEXT:  .long	1077936128              # float 3
+; ALL-NOT:   .long
+
+define <8 x float> @f8xf32_f128(<8 x float> %a) {
+; ALL-LABEL: f8xf32_f128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+;
+; AVX-LABEL: f8xf32_f128:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+  %res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
+  %res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
+  ret <8 x float> %res2
+}
+
+
+; ALL64:       .LCPI32
+; ALL64-NEXT:  .quad	4575657222482165760     # 0x3f80000040000000
+
+; ALL32:       .LCPI32
+; ALL32-NEXT:  .quad	4575657222482165760     # double 0.0078125018626451492
+
+; AVX:       .LCPI32
+; AVX-NEXT:  .quad	4575657222482165760     # double 0.0078125018626451492
+
+define <16 x float> @f16xf32_f64(<16 x float> %a) {
+; AVX2-LABEL: f16xf32_f64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd {{.*}}, %ymm2
+; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f16xf32_f64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastsd {{.*}}, %zmm1
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+;
+; AVX-LABEL: f16xf32_f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastsd {{\.LCPI.*}}, %ymm2
+; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+  %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
+  %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
+  ret <16 x float> %res2
+}
+
+
+; ALL:       .LCPI33
+; ALL-NEXT:  .long	1082130432              # float 4
+; ALL-NEXT:  .long	1065353216              # float 1
+; ALL-NEXT:  .long	1073741824              # float 2
+; ALL-NEXT:  .long	1077936128              # float 3
+; ALL-NOT:   .long
+
+define <16 x float> @f16xf32_f128(<16 x float> %a) {
+; AVX2-LABEL: f16xf32_f128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f16xf32_f128:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+;
+; AVX-LABEL: f16xf32_f128:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+  %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
+  %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
+  ret <16 x float> %res2
+}
+
+
+; AVX512:       .LCPI34
+; AVX512-NEXT:  .long	1090519040              # float 8
+; AVX512-NEXT:  .long	1065353216              # float 1
+; AVX512-NEXT:  .long	1073741824              # float 2
+; AVX512-NEXT:  .long	1077936128              # float 3
+; AVX512-NEXT:  .long	1082130432              # float 4
+; AVX512-NEXT:  .long	1084227584              # float 5
+; AVX512-NEXT:  .long	1086324736              # float 6
+; AVX512-NEXT:  .long	1088421888              # float 7
+; AVX512-NOT:   .long
+
+define <16 x float> @f16xf32_f256(<16 x float> %a) {
+; AVX512-LABEL: f16xf32_f256:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+  %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
+  %res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
+  ret <16 x float> %res2
+}
+
+
+; ALL:       .LCPI35
+; ALL-NEXT:  .quad	4611686018427387904     # double 2
+; ALL-NEXT:  .quad	4607182418800017408     # double 1
+; ALL-NOT:   .quad
+
+define <4 x double> @f4xf64_f128(<4 x double> %a) {
+; ALL-LABEL: f4xf64_f128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+;
+; AVX-LABEL: f4xf64_f128:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+  %res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a
+  %res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1
+  ret <4 x double> %res2
+}
+
+
+; ALL:       .LCPI36
+; ALL-NEXT:  .quad	4611686018427387904     # double 2
+; ALL-NEXT:  .quad	4607182418800017408     # double 1
+; ALL-NOT:   .quad
+
+define <8 x double> @f8xf64_f128(<8 x double> %a) {
+; AVX2-LABEL: f8xf64_f128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f8xf64_f128:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
+;
+; AVX-LABEL: f8xf64_f128:
+; AVX:       # BB#0:
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+  %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
+  %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
+  ret <8 x double> %res2
+}
+
+
+; AVX512:       .LCPI37
+; AVX512-NEXT:  .quad	4616189618054758400     # double 4
+; AVX512-NEXT:  .quad	4607182418800017408     # double 1
+; AVX512-NEXT:  .quad	4611686018427387904     # double 2
+; AVX512-NEXT:  .quad	4613937818241073152     # double 3
+; AVX512-NOT:   .quad
+
+define <8 x double> @f8xf64_f256(<8 x double> %a) {
+; AVX512-LABEL: f8xf64_f256:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
+  %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
+  %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
+  ret <8 x double> %res2
+}
+
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 9ae3483..693bf2e 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -303,7 +303,7 @@
 ; SSE:       # BB#0:
 ; SSE-NEXT:    xorl %eax, %eax
 ; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    movaps %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
diff --git a/test/CodeGen/X86/combine-sext-in-reg.ll b/test/CodeGen/X86/combine-sext-in-reg.ll
new file mode 100644
index 0000000..3e60f3b
--- /dev/null
+++ b/test/CodeGen/X86/combine-sext-in-reg.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+
+; fold sextinreg(zext) -> sext
+define <4 x i64> @sextinreg_zext_v16i8_4i64(<16 x i8> %a0) {
+; SSE-LABEL: sextinreg_zext_v16i8_4i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    pmovsxbq %xmm0, %xmm2
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pmovsxbq %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sextinreg_zext_v16i8_4i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovsxbq %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = zext <4 x i8> %1 to <4 x i64>
+  %3 = shl <4 x i64> %2, <i64 56, i64 56, i64 56, i64 56>
+  %4 = ashr <4 x i64> %3, <i64 56, i64 56, i64 56, i64 56>
+  ret <4 x i64> %4
+}
+
+; fold sextinreg(zext(sext)) -> sext
+define <4 x i64> @sextinreg_zext_sext_v16i8_4i64(<16 x i8> %a0) {
+; SSE-LABEL: sextinreg_zext_sext_v16i8_4i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    pmovsxbq %xmm0, %xmm2
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pmovsxbq %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sextinreg_zext_sext_v16i8_4i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovsxbq %xmm0, %ymm0
+; AVX-NEXT:    retq
+  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = zext <4 x i32> %2 to <4 x i64>
+  %4 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32>
+  %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
+  ret <4 x i64> %5
+}
diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll
index 68c4ddf..88a68b5 100644
--- a/test/CodeGen/X86/combine-shl.ll
+++ b/test/CodeGen/X86/combine-shl.ll
@@ -543,6 +543,42 @@
   ret <4 x i32> %2
 }
 
+; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
+; SSE-LABEL: combine_vec_shl_or0:
+; SSE:       # BB#0:
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE-NEXT:    pslld $2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vec_shl_or0:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
+  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
+; SSE-LABEL: combine_vec_shl_or1:
+; SSE:       # BB#0:
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vec_shl_or1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = or  <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
+  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %2
+}
+
 ; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
 define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_shl_mul0:
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index 992b3a3..e67bb0f 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -38,8 +38,8 @@
 ; CHECK-LABEL: ExeDepsFix_broadcastss_inreg:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovd %edi, %xmm2
-; CHECK-NEXT:    vbroadcastss %xmm2, %xmm2
-; CHECK-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastd %xmm2, %xmm2
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %bitcast = bitcast <4 x float> %arg to <4 x i32>
@@ -56,8 +56,8 @@
 ; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovd %edi, %xmm2
-; CHECK-NEXT:    vbroadcastss %xmm2, %ymm2
-; CHECK-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpbroadcastd %xmm2, %ymm2
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %bitcast = bitcast <8 x float> %arg to <8 x i32>
@@ -124,8 +124,8 @@
 ; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovq %rdi, %xmm2
-; CHECK-NEXT:    vbroadcastsd %xmm2, %ymm2
-; CHECK-NEXT:    vandpd %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpbroadcastq %xmm2, %ymm2
+; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %bitcast = bitcast <4 x double> %arg to <4 x i64>
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index fc34c56..13448a1 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -244,12 +244,19 @@
 ; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: extractelement_v8i32_0:
-; AVX:       # BB#0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: extractelement_v8i32_0:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: extractelement_v8i32_0:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %b = extractelement <8 x i32> %a, i256 4
   ret i32 %b
 }
@@ -260,12 +267,19 @@
 ; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: extractelement_v8i32_4:
-; AVX:       # BB#0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: extractelement_v8i32_4:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: extractelement_v8i32_4:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %b = extractelement <8 x i32> %a, i256 4
   ret i32 %b
 }
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index 76d8cb5..5636a5b 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -126,8 +126,8 @@
 define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2
@@ -188,7 +188,7 @@
 ; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vfmadd132sd %xmm1, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index d830340..2554b02 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,9 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA --check-prefix=FMA-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 --check-prefix=FMA4-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 --check-prefix=FMA4-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=ALL --check-prefix=FMA --check-prefix=FMA-NOINFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 --check-prefix=FMA4-NOINFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 --check-prefix=FMA4-NOINFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512-NOINFS
 
 ;
 ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z)
@@ -558,288 +561,468 @@
 ;
 
 define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_add_x_one_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_add_x_one_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_add_x_one_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_add_x_one_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_add_x_one_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_add_x_one_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
   %m = fmul <4 x float> %a, %y
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_add_x_one:
-; FMA:       # BB#0:
-; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_add_x_one:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_add_x_one:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
   %m = fmul <4 x float> %y, %a
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_add_x_negone_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_add_x_negone_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_add_x_negone_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
   %m = fmul <4 x float> %a, %y
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_add_x_negone:
-; FMA:       # BB#0:
-; FMA-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_add_x_negone:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_add_x_negone:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
   %m = fmul <4 x float> %y, %a
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_sub_one_x_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_sub_one_x_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_sub_one_x_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   %m = fmul <4 x float> %s, %y
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_one_x:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_one_x:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_one_x:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_sub_negone_x_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_sub_negone_x_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_sub_negone_x_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
   %m = fmul <4 x float> %s, %y
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_negone_x:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; AVX512-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_sub_x_one_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_sub_x_one_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_sub_x_one_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
   %m = fmul <4 x float> %s, %y
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_x_one:
-; FMA:       # BB#0:
-; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_x_one:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_x_one:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_sub_x_negone_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_sub_x_negone_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_sub_x_negone_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
   %m = fmul <4 x float> %s, %y
   ret <4 x float> %m
 }
 
 define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
-; FMA-LABEL: test_v4f32_mul_y_sub_x_negone:
-; FMA:       # BB#0:
-; FMA-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
-; FMA4-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
   %m = fmul <4 x float> %y, %s
   ret <4 x float> %m
@@ -850,29 +1033,47 @@
 ;
 
 define float @test_f32_interp(float %x, float %y, float %t) {
-; FMA-LABEL: test_f32_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm3
-; FMA-NEXT:    vmulss %xmm3, %xmm1, %xmm1
-; FMA-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_f32_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; FMA-INFS-NEXT:    vsubss %xmm2, %xmm3, %xmm3
+; FMA-INFS-NEXT:    vmulss %xmm3, %xmm1, %xmm1
+; FMA-INFS-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_f32_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; FMA4-NEXT:    vsubss %xmm2, %xmm3, %xmm3
-; FMA4-NEXT:    vmulss %xmm3, %xmm1, %xmm1
-; FMA4-NEXT:    vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_f32_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; FMA4-INFS-NEXT:    vsubss %xmm2, %xmm3, %xmm3
+; FMA4-INFS-NEXT:    vmulss %xmm3, %xmm1, %xmm1
+; FMA4-INFS-NEXT:    vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_f32_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX512-NEXT:    vsubss %xmm2, %xmm3, %xmm3
-; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_f32_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX512-INFS-NEXT:    vsubss %xmm2, %xmm3, %xmm3
+; AVX512-INFS-NEXT:    vmulss %xmm3, %xmm1, %xmm1
+; AVX512-INFS-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_f32_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213ss %xmm1, %xmm2, %xmm1
+; FMA-NOINFS-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_f32_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
+; FMA4-NOINFS-NEXT:    vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_f32_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ss %xmm1, %xmm2, %xmm1
+; AVX512-NOINFS-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub float 1.0, %t
   %tx = fmul float %x, %t
   %ty = fmul float %y, %t1
@@ -881,29 +1082,47 @@
 }
 
 define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
-; FMA-LABEL: test_v4f32_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm3
-; FMA-NEXT:    vmulps %xmm3, %xmm1, %xmm1
-; FMA-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f32_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
+; FMA-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
+; FMA-INFS-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f32_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubps %xmm2, %xmm3, %xmm3
-; FMA4-NEXT:    vmulps %xmm3, %xmm1, %xmm1
-; FMA4-NEXT:    vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f32_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
+; FMA4-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
+; FMA4-INFS-NEXT:    vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3
-; AVX512-NEXT:    vsubps %xmm2, %xmm3, %xmm3
-; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f32_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3
+; AVX512-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
+; AVX512-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
+; AVX512-INFS-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps %xmm1, %xmm2, %xmm1
+; FMA-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm2, %xmm1
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps %xmm1, %xmm2, %xmm1
+; AVX512-NOINFS-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
   %tx = fmul <4 x float> %x, %t
   %ty = fmul <4 x float> %y, %t1
@@ -912,29 +1131,47 @@
 }
 
 define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
-; FMA-LABEL: test_v8f32_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm3
-; FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f32_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
+; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f32_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubps %ymm2, %ymm3, %ymm3
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f32_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
+; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f32_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
-; AVX512-NEXT:    vsubps %ymm2, %ymm3, %ymm3
-; AVX512-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; AVX512-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f32_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
+; AVX512-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
+; AVX512-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; AVX512-INFS-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f32_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps %ymm1, %ymm2, %ymm1
+; FMA-NOINFS-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f32_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %ymm1, %ymm1, %ymm2, %ymm1
+; FMA4-NOINFS-NEXT:    vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f32_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps %ymm1, %ymm2, %ymm1
+; AVX512-NOINFS-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
   %tx = fmul <8 x float> %x, %t
   %ty = fmul <8 x float> %y, %t1
@@ -943,29 +1180,47 @@
 }
 
 define double @test_f64_interp(double %x, double %y, double %t) {
-; FMA-LABEL: test_f64_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; FMA-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
-; FMA-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
-; FMA-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_f64_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; FMA-INFS-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
+; FMA-INFS-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
+; FMA-INFS-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_f64_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; FMA4-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
-; FMA4-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
-; FMA4-NEXT:    vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_f64_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; FMA4-INFS-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
+; FMA4-INFS-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
+; FMA4-INFS-NEXT:    vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_f64_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
-; AVX512-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_f64_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512-INFS-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
+; AVX512-INFS-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
+; AVX512-INFS-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_f64_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213sd %xmm1, %xmm2, %xmm1
+; FMA-NOINFS-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_f64_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
+; FMA4-NOINFS-NEXT:    vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_f64_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213sd %xmm1, %xmm2, %xmm1
+; AVX512-NOINFS-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub double 1.0, %t
   %tx = fmul double %x, %t
   %ty = fmul double %y, %t1
@@ -974,29 +1229,47 @@
 }
 
 define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
-; FMA-LABEL: test_v2f64_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
-; FMA-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
-; FMA-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v2f64_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
+; FMA-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
+; FMA-INFS-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v2f64_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
-; FMA4-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
-; FMA4-NEXT:    vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v2f64_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
+; FMA4-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
+; FMA4-INFS-NEXT:    vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v2f64_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
-; AVX512-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
-; AVX512-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v2f64_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
+; AVX512-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
+; AVX512-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
+; AVX512-INFS-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v2f64_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213pd %xmm1, %xmm2, %xmm1
+; FMA-NOINFS-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v2f64_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddpd %xmm1, %xmm1, %xmm2, %xmm1
+; FMA4-NOINFS-NEXT:    vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v2f64_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213pd %xmm1, %xmm2, %xmm1
+; AVX512-NOINFS-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub <2 x double> <double 1.0, double 1.0>, %t
   %tx = fmul <2 x double> %x, %t
   %ty = fmul <2 x double> %y, %t1
@@ -1005,29 +1278,47 @@
 }
 
 define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
-; FMA-LABEL: test_v4f64_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
-; FMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v4f64_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
+; FMA-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v4f64_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
-; FMA4-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v4f64_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
+; FMA4-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f64_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm3
-; AVX512-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
-; AVX512-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
-; AVX512-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v4f64_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm3
+; AVX512-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
+; AVX512-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
+; AVX512-INFS-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f64_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213pd %ymm1, %ymm2, %ymm1
+; FMA-NOINFS-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f64_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddpd %ymm1, %ymm1, %ymm2, %ymm1
+; FMA4-NOINFS-NEXT:    vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f64_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213pd %ymm1, %ymm2, %ymm1
+; AVX512-NOINFS-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
   %tx = fmul <4 x double> %x, %t
   %ty = fmul <4 x double> %y, %t1
diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll
index 36c2989..ab1bf4d 100644
--- a/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/test/CodeGen/X86/fma_patterns_wide.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS
 
 ;
 ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z)
@@ -253,352 +257,556 @@
 ;
 
 define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) {
-; FMA-LABEL: test_v16f32_mul_add_x_one_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vaddps %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vaddps %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v16f32_mul_add_x_one_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vaddps %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vaddps %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16f32_mul_add_x_one_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v16f32_mul_add_x_one_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213ps %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmadd213ps %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddps %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmaddps %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <16 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
   %m = fmul <16 x float> %a, %y
   ret <16 x float> %m
 }
 
 define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) {
-; FMA-LABEL: test_v8f64_mul_y_add_x_one:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f64_mul_y_add_x_one:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA4-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f64_mul_y_add_x_one:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_one:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213pd %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmadd213pd %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddpd %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmaddpd %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213pd %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <8 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>
   %m = fmul <8 x double> %y, %a
   ret <8 x double> %m
 }
 
 define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) {
-; FMA-LABEL: test_v16f32_mul_add_x_negone_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vaddps %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vaddps %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v16f32_mul_add_x_negone_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vaddps %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vaddps %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16f32_mul_add_x_negone_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213ps %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmsub213ps %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubps %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmsubps %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <16 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>
   %m = fmul <16 x float> %a, %y
   ret <16 x float> %m
 }
 
 define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) {
-; FMA-LABEL: test_v8f64_mul_y_add_x_negone:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f64_mul_y_add_x_negone:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA4-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f64_mul_y_add_x_negone:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213pd %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmsub213pd %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubpd %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmsubpd %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213pd %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %a = fadd <8 x double> %x, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
   %m = fmul <8 x double> %y, %a
   ret <8 x double> %m
 }
 
 define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) {
-; FMA-LABEL: test_v16f32_mul_sub_one_x_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1
-; FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0
-; FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v16f32_mul_sub_one_x_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubps %ymm1, %ymm4, %ymm1
-; FMA4-NEXT:    vsubps %ymm0, %ymm4, %ymm0
-; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16f32_mul_sub_one_x_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %zmm2
-; AVX512-NEXT:    vsubps %zmm0, %zmm2, %zmm0
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %zmm2
+; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
+; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfnmadd213ps %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfnmaddps %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   %m = fmul <16 x float> %s, %y
   ret <16 x float> %m
 }
 
 define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) {
-; FMA-LABEL: test_v8f64_mul_y_sub_one_x:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
-; FMA-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
-; FMA-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
+; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f64_mul_y_sub_one_x:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
-; FMA4-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
-; FMA4-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA4-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
+; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f64_mul_y_sub_one_x:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm2
-; AVX512-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
-; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm2
+; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
+; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213pd %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfnmadd213pd %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddpd %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfnmaddpd %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213pd %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, %x
   %m = fmul <8 x double> %y, %s
   ret <8 x double> %m
 }
 
 define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) {
-; FMA-LABEL: test_v16f32_mul_sub_negone_x_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1
-; FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0
-; FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v16f32_mul_sub_negone_x_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vsubps %ymm1, %ymm4, %ymm1
-; FMA4-NEXT:    vsubps %ymm0, %ymm4, %ymm0
-; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
+; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16f32_mul_sub_negone_x_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %zmm2
-; AVX512-NEXT:    vsubps %zmm0, %zmm2, %zmm0
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %zmm2
+; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
+; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmsub213ps %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfnmsub213ps %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmsubps %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfnmsubps %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmsub213ps %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <16 x float> <float -1.0, float -1.0, float -1.0, float -1.0,float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>, %x
   %m = fmul <16 x float> %s, %y
   ret <16 x float> %m
 }
 
 define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) {
-; FMA-LABEL: test_v8f64_mul_y_sub_negone_x:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
-; FMA-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
-; FMA-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
+; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f64_mul_y_sub_negone_x:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
-; FMA4-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
-; FMA4-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA4-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
+; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f64_mul_y_sub_negone_x:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm2
-; AVX512-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
-; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm2
+; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
+; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmsub213pd %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfnmsub213pd %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmsubpd %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfnmsubpd %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmsub213pd %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <8 x double> <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>, %x
   %m = fmul <8 x double> %y, %s
   ret <8 x double> %m
 }
 
 define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) {
-; FMA-LABEL: test_v16f32_mul_sub_x_one_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubps %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vsubps %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v16f32_mul_sub_x_one_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubps %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vsubps %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16f32_mul_sub_x_one_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213ps %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmsub213ps %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubps %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmsubps %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <16 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
   %m = fmul <16 x float> %s, %y
   ret <16 x float> %m
 }
 
 define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) {
-; FMA-LABEL: test_v8f64_mul_y_sub_x_one:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f64_mul_y_sub_x_one:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA4-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f64_mul_y_sub_x_one:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmsub213pd %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmsub213pd %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmsubpd %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmsubpd %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmsub213pd %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <8 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>
   %m = fmul <8 x double> %y, %s
   ret <8 x double> %m
 }
 
 define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) {
-; FMA-LABEL: test_v16f32_mul_sub_x_negone_y:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vsubps %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vsubps %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v16f32_mul_sub_x_negone_y:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vsubps %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vsubps %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16f32_mul_sub_x_negone_y:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubps {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213ps %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmadd213ps %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddps %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmaddps %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <16 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>
   %m = fmul <16 x float> %s, %y
   ret <16 x float> %m
 }
 
 define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) {
-; FMA-LABEL: test_v8f64_mul_y_sub_x_negone:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
-; FMA-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
-; FMA-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
+; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f64_mul_y_sub_x_negone:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
-; FMA4-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
-; FMA4-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
-; FMA4-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
+; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
+; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f64_mul_y_sub_x_negone:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfmadd213pd %ymm2, %ymm2, %ymm0
+; FMA-NOINFS-NEXT:    vfmadd213pd %ymm3, %ymm3, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfmaddpd %ymm2, %ymm2, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmaddpd %ymm3, %ymm3, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfmadd213pd %zmm1, %zmm1, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %s = fsub <8 x double> %x, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
   %m = fmul <8 x double> %y, %s
   ret <8 x double> %m
@@ -609,35 +817,57 @@
 ;
 
 define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) {
-; FMA-LABEL: test_v16f32_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovaps {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubps %ymm4, %ymm6, %ymm7
-; FMA-NEXT:    vsubps %ymm5, %ymm6, %ymm6
-; FMA-NEXT:    vmulps %ymm6, %ymm3, %ymm3
-; FMA-NEXT:    vmulps %ymm7, %ymm2, %ymm2
-; FMA-NEXT:    vfmadd213ps %ymm2, %ymm4, %ymm0
-; FMA-NEXT:    vfmadd213ps %ymm3, %ymm5, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v16f32_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
+; FMA-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
+; FMA-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
+; FMA-INFS-NEXT:    vmulps %ymm7, %ymm2, %ymm2
+; FMA-INFS-NEXT:    vfmadd213ps %ymm2, %ymm4, %ymm0
+; FMA-INFS-NEXT:    vfmadd213ps %ymm3, %ymm5, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v16f32_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovaps {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubps %ymm4, %ymm6, %ymm7
-; FMA4-NEXT:    vsubps %ymm5, %ymm6, %ymm6
-; FMA4-NEXT:    vmulps %ymm6, %ymm3, %ymm3
-; FMA4-NEXT:    vmulps %ymm7, %ymm2, %ymm2
-; FMA4-NEXT:    vfmaddps %ymm2, %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vfmaddps %ymm3, %ymm5, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v16f32_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
+; FMA4-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
+; FMA4-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
+; FMA4-INFS-NEXT:    vmulps %ymm7, %ymm2, %ymm2
+; FMA4-INFS-NEXT:    vfmaddps %ymm2, %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vfmaddps %ymm3, %ymm5, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16f32_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %zmm3
-; AVX512-NEXT:    vsubps %zmm2, %zmm3, %zmm3
-; AVX512-NEXT:    vmulps %zmm3, %zmm1, %zmm1
-; AVX512-NEXT:    vfmadd213ps %zmm1, %zmm2, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v16f32_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*}}(%rip), %zmm3
+; AVX512-INFS-NEXT:    vsubps %zmm2, %zmm3, %zmm3
+; AVX512-INFS-NEXT:    vmulps %zmm3, %zmm1, %zmm1
+; AVX512-INFS-NEXT:    vfmadd213ps %zmm1, %zmm2, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v16f32_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps %ymm3, %ymm5, %ymm3
+; FMA-NOINFS-NEXT:    vfnmadd213ps %ymm2, %ymm4, %ymm2
+; FMA-NOINFS-NEXT:    vfmadd213ps %ymm2, %ymm4, %ymm0
+; FMA-NOINFS-NEXT:    vfmadd213ps %ymm3, %ymm5, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v16f32_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %ymm3, %ymm3, %ymm5, %ymm3
+; FMA4-NOINFS-NEXT:    vfnmaddps %ymm2, %ymm2, %ymm4, %ymm2
+; FMA4-NOINFS-NEXT:    vfmaddps %ymm2, %ymm4, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmaddps %ymm3, %ymm5, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v16f32_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps %zmm1, %zmm2, %zmm1
+; AVX512-NOINFS-NEXT:    vfmadd213ps %zmm1, %zmm2, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
   %tx = fmul <16 x float> %x, %t
   %ty = fmul <16 x float> %y, %t1
@@ -646,35 +876,57 @@
 }
 
 define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) {
-; FMA-LABEL: test_v8f64_interp:
-; FMA:       # BB#0:
-; FMA-NEXT:    vmovapd {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
-; FMA-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
-; FMA-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
-; FMA-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
-; FMA-NEXT:    vfmadd213pd %ymm2, %ymm4, %ymm0
-; FMA-NEXT:    vfmadd213pd %ymm3, %ymm5, %ymm1
-; FMA-NEXT:    retq
+; FMA-INFS-LABEL: test_v8f64_interp:
+; FMA-INFS:       # BB#0:
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
+; FMA-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
+; FMA-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
+; FMA-INFS-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
+; FMA-INFS-NEXT:    vfmadd213pd %ymm2, %ymm4, %ymm0
+; FMA-INFS-NEXT:    vfmadd213pd %ymm3, %ymm5, %ymm1
+; FMA-INFS-NEXT:    retq
 ;
-; FMA4-LABEL: test_v8f64_interp:
-; FMA4:       # BB#0:
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA4-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
-; FMA4-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
-; FMA4-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
-; FMA4-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
-; FMA4-NEXT:    vfmaddpd %ymm2, %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vfmaddpd %ymm3, %ymm5, %ymm1, %ymm1
-; FMA4-NEXT:    retq
+; FMA4-INFS-LABEL: test_v8f64_interp:
+; FMA4-INFS:       # BB#0:
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
+; FMA4-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
+; FMA4-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
+; FMA4-INFS-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
+; FMA4-INFS-NEXT:    vfmaddpd %ymm2, %ymm4, %ymm0, %ymm0
+; FMA4-INFS-NEXT:    vfmaddpd %ymm3, %ymm5, %ymm1, %ymm1
+; FMA4-INFS-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f64_interp:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm3
-; AVX512-NEXT:    vsubpd %zmm2, %zmm3, %zmm3
-; AVX512-NEXT:    vmulpd %zmm3, %zmm1, %zmm1
-; AVX512-NEXT:    vfmadd213pd %zmm1, %zmm2, %zmm0
-; AVX512-NEXT:    retq
+; AVX512-INFS-LABEL: test_v8f64_interp:
+; AVX512-INFS:       # BB#0:
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm3
+; AVX512-INFS-NEXT:    vsubpd %zmm2, %zmm3, %zmm3
+; AVX512-INFS-NEXT:    vmulpd %zmm3, %zmm1, %zmm1
+; AVX512-INFS-NEXT:    vfmadd213pd %zmm1, %zmm2, %zmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v8f64_interp:
+; FMA-NOINFS:       # BB#0:
+; FMA-NOINFS-NEXT:    vfnmadd213pd %ymm3, %ymm5, %ymm3
+; FMA-NOINFS-NEXT:    vfnmadd213pd %ymm2, %ymm4, %ymm2
+; FMA-NOINFS-NEXT:    vfmadd213pd %ymm2, %ymm4, %ymm0
+; FMA-NOINFS-NEXT:    vfmadd213pd %ymm3, %ymm5, %ymm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v8f64_interp:
+; FMA4-NOINFS:       # BB#0:
+; FMA4-NOINFS-NEXT:    vfnmaddpd %ymm3, %ymm3, %ymm5, %ymm3
+; FMA4-NOINFS-NEXT:    vfnmaddpd %ymm2, %ymm2, %ymm4, %ymm2
+; FMA4-NOINFS-NEXT:    vfmaddpd %ymm2, %ymm4, %ymm0, %ymm0
+; FMA4-NOINFS-NEXT:    vfmaddpd %ymm3, %ymm5, %ymm1, %ymm1
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v8f64_interp:
+; AVX512-NOINFS:       # BB#0:
+; AVX512-NOINFS-NEXT:    vfnmadd213pd %zmm1, %zmm2, %zmm1
+; AVX512-NOINFS-NEXT:    vfmadd213pd %zmm1, %zmm2, %zmm0
+; AVX512-NOINFS-NEXT:    retq
   %t1 = fsub <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, %t
   %tx = fmul <8 x double> %x, %t
   %ty = fmul <8 x double> %y, %t1
diff --git a/test/CodeGen/X86/fp-logic-replace.ll b/test/CodeGen/X86/fp-logic-replace.ll
index 50e2c1b..0a233fd 100644
--- a/test/CodeGen/X86/fp-logic-replace.ll
+++ b/test/CodeGen/X86/fp-logic-replace.ll
@@ -29,16 +29,13 @@
 define double @FsANDNPSrr(double %x, double %y) {
 ; SSE-LABEL: FsANDNPSrr:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT:    xorpd %xmm1, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
+; SSE-NEXT:    andnps %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: FsANDNPSrr:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vxorpd %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vandnps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
   %bc1 = bitcast double %x to i64
diff --git a/test/CodeGen/X86/fp-logic.ll b/test/CodeGen/X86/fp-logic.ll
index 301fa8f..d940101 100644
--- a/test/CodeGen/X86/fp-logic.ll
+++ b/test/CodeGen/X86/fp-logic.ll
@@ -76,7 +76,7 @@
 ; CHECK-LABEL: f5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movd %edi, %xmm1
-; CHECK-NEXT:    andps %xmm1, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %bc1 = bitcast float %x to i32
@@ -91,7 +91,7 @@
 ; CHECK-LABEL: f6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movd %edi, %xmm1
-; CHECK-NEXT:    andps %xmm1, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %bc1 = bitcast float %x to i32
@@ -135,7 +135,7 @@
 define i32 @f9(float %x, float %y) {
 ; CHECK-LABEL: f9:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    andps %xmm1, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
 ; CHECK-NEXT:    movd %xmm0, %eax
 ; CHECK-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
new file mode 100644
index 0000000..8614d1b
--- /dev/null
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
@@ -0,0 +1,41 @@
+; Test ensuring debug intrinsics do not affect generated function prologue.
+;
+; RUN: llc -O1 -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s
+
+
+define i64 @noDebug(i64 %a) {
+  %call = call i64 @fn(i64 %a, i64 0)
+  ret i64 %call
+}
+
+; CHECK-LABEL: noDebug
+; CHECK: popq %rcx
+; CHECK: ret
+
+
+define i64 @withDebug(i64 %a) !dbg !4 {
+  %call = call i64 @fn(i64 %a, i64 0)
+  tail call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !5, metadata !6), !dbg !7
+  ret i64 %call
+}
+
+; CHECK-LABEL: withDebug
+; CHECK: popq %rcx
+; CHECK: ret
+
+
+declare i64 @fn(i64, i64)
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2,!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0")
+!1 = !DIFile(filename: "test.c", directory: "/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "withDebug", unit: !0)
+!5 = !DILocalVariable(name: "w", scope: !4)
+!6 = !DIExpression()
+!7 = !DILocation(line: 210, column: 12, scope: !4)
diff --git a/test/CodeGen/X86/implicit-use-spill.mir b/test/CodeGen/X86/implicit-use-spill.mir
new file mode 100644
index 0000000..827f0f1
--- /dev/null
+++ b/test/CodeGen/X86/implicit-use-spill.mir
@@ -0,0 +1,22 @@
+# RUN: llc -run-pass=greedy -mtriple=x86_64-apple-macosx -o - %s 2>&1 | FileCheck %s
+
+# Make sure we don't assert when we try to reload a value that is just implicitly used.
+---
+# CHECK: name: foo
+# This test forces a spill of %0.
+name: foo
+registers:
+  - { id: 0, class: gr64 }
+body: |
+  bb.0:
+  ; CHECK: NOOP implicit-def [[VAL:%[0-9]+]]
+  ; VAL should be spilled before csr_noregs, i.e., before we clobber all the registers
+  ; CHECK-NEXT: MOV64mr [[SLOT:%stack.[0-9]+]], 1, _, 0, _, [[VAL]]
+  ; CHECK-NEXT: NOOP csr_noregs
+  ; We need to reload before the (implicit) use.
+  ; CHECK-NEXT: [[RELOADED_VAL:%[0-9]+]] = MOV64rm [[SLOT]], 1, _, 0, _
+  ; CHECK-NEXT: NOOP implicit [[RELOADED_VAL]]
+  NOOP implicit-def %0
+  NOOP csr_noregs
+  NOOP implicit %0
+...
diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll
index 4bb8d77..944b370 100644
--- a/test/CodeGen/X86/known-bits-vector.ll
+++ b/test/CodeGen/X86/known-bits-vector.ll
@@ -50,6 +50,43 @@
   ret float %3
 }
 
+define <4 x float> @knownbits_insert_uitofp(<4 x i32> %a0, i16 %a1, i16 %a2) nounwind {
+; X32-LABEL: knownbits_insert_uitofp:
+; X32:       # BB#0:
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_insert_uitofp:
+; X64:       # BB#0:
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    movzwl %si, %ecx
+; X64-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
+; X64-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X64-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT:    retq
+  %1 = zext i16 %a1 to i32
+  %2 = zext i16 %a2 to i32
+  %3 = insertelement <4 x i32> %a0, i32 %1, i32 0
+  %4 = insertelement <4 x i32>  %3, i32 %2, i32 2
+  %5 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %6 = uitofp <4 x i32> %5 to <4 x float>
+  ret <4 x float> %6
+}
+
 define <4 x i32> @knownbits_mask_shuffle_sext(<8 x i16> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_shuffle_sext:
 ; X32:       # BB#0:
@@ -70,6 +107,48 @@
   ret <4 x i32> %3
 }
 
+define <4 x i32> @knownbits_mask_shuffle_shuffle_sext(<8 x i16> %a0) nounwind {
+; X32-LABEL: knownbits_mask_shuffle_shuffle_sext:
+; X32:       # BB#0:
+; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_mask_shuffle_shuffle_sext:
+; X64:       # BB#0:
+; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT:    retq
+  %1 = and <8 x i16> %a0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 15, i16 15, i16 15, i16 15>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = sext <4 x i16> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
+define <4 x i32> @knownbits_mask_shuffle_shuffle_undef_sext(<8 x i16> %a0) nounwind {
+; X32-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
+; X32:       # BB#0:
+; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
+; X64:       # BB#0:
+; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = and <8 x i16> %a0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 15, i16 15, i16 15, i16 15>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = sext <4 x i16> %3 to <4 x i32>
+  ret <4 x i32> %4
+}
+
 define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_shuffle_uitofp:
 ; X32:       # BB#0:
@@ -379,3 +458,92 @@
   %4 = uitofp <4 x i32> %3 to <4 x float>
   ret <4 x float> %4
 }
+
+define <4 x float> @knownbits_smax_smin_shuffle_uitofp(<4 x i32> %a0) {
+; X32-LABEL: knownbits_smax_smin_shuffle_uitofp:
+; X32:       # BB#0:
+; X32-NEXT:    vpminsd {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpmaxsd {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
+; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_smax_smin_shuffle_uitofp:
+; X64:       # BB#0:
+; X64-NEXT:    vpminsd {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpmaxsd {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
+; X64-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT:    retq
+  %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> <i32 0, i32 -65535, i32 -65535, i32 0>)
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> <i32 65535, i32 -1, i32 -1, i32 131071>)
+  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+  %4 = uitofp <4 x i32> %3 to <4 x float>
+  ret <4 x float> %4
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x float> @knownbits_umax_umin_shuffle_uitofp(<4 x i32> %a0) {
+; X32-LABEL: knownbits_umax_umin_shuffle_uitofp:
+; X32:       # BB#0:
+; X32-NEXT:    vpmaxud {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpminud {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
+; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_umax_umin_shuffle_uitofp:
+; X64:       # BB#0:
+; X64-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
+; X64-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT:    retq
+  %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> <i32 255, i32 -1, i32 -1, i32 1023>)
+  %2 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> <i32 65535, i32 -1, i32 -1, i32 262143>)
+  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+  %4 = uitofp <4 x i32> %3 to <4 x float>
+  ret <4 x float> %4
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x float> @knownbits_mask_umax_shuffle_uitofp(<4 x i32> %a0) {
+; X32-LABEL: knownbits_mask_umax_shuffle_uitofp:
+; X32:       # BB#0:
+; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpmaxud {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_mask_umax_shuffle_uitofp:
+; X64:       # BB#0:
+; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
+; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = and <4 x i32> %a0, <i32 65535, i32 -1, i32 -1, i32 262143>
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> <i32 255, i32 -1, i32 -1, i32 1023>)
+  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+  %4 = uitofp <4 x i32> %3 to <4 x float>
+  ret <4 x float> %4
+}
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 98b1e87..7e1837c 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -1019,7 +1019,7 @@
 ; AVX512F-LABEL: one_mask_bit_set3:
 ; AVX512F:       ## BB#0:
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, 16(%rdi)
+; AVX512F-NEXT:    vmovlps %xmm0, 16(%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: one_mask_bit_set3:
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index bb60440..1f2bd4b 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -935,26 +935,14 @@
 ; SSE-LABEL: merge_4i32_i32_combine:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: merge_4i32_i32_combine:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: merge_4i32_i32_combine:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: merge_4i32_i32_combine:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT:    retq
+; AVX-LABEL: merge_4i32_i32_combine:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; X32-SSE1-LABEL: merge_4i32_i32_combine:
 ; X32-SSE1:       # BB#0:
@@ -972,7 +960,7 @@
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT:    movaps %xmm0, (%eax)
+; X32-SSE41-NEXT:    movdqa %xmm0, (%eax)
 ; X32-SSE41-NEXT:    retl
  %1 = getelementptr i32, i32* %src, i32 0
  %2 = load i32, i32* %1
diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll
index 3bee447..dfce6c6 100644
--- a/test/CodeGen/X86/not-and-simplify.ll
+++ b/test/CodeGen/X86/not-and-simplify.ll
@@ -5,20 +5,12 @@
 ; Clear high bits via shift, set them with xor (not), then mask them off.
 
 define i32 @shrink_xor_constant1(i32 %x) {
-; NO_BMI-LABEL: shrink_xor_constant1:
-; NO_BMI:       # BB#0:
-; NO_BMI-NEXT:    shrl $31, %edi
-; NO_BMI-NEXT:    notl %edi
-; NO_BMI-NEXT:    andl $1, %edi
-; NO_BMI-NEXT:    movl %edi, %eax
-; NO_BMI-NEXT:    retq
-;
-; BMI-LABEL: shrink_xor_constant1:
-; BMI:       # BB#0:
-; BMI-NEXT:    shrl $31, %edi
-; BMI-NEXT:    movl $1, %eax
-; BMI-NEXT:    andnl %eax, %edi, %eax
-; BMI-NEXT:    retq
+; ALL-LABEL: shrink_xor_constant1:
+; ALL:       # BB#0:
+; ALL-NEXT:    shrl $31, %edi
+; ALL-NEXT:    xorl $1, %edi
+; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    retq
 ;
   %sh = lshr i32 %x, 31
   %not = xor i32 %sh, -1
@@ -32,8 +24,7 @@
 ; ALL-LABEL: shrink_xor_constant2:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    shlb $5, %dil
-; ALL-NEXT:    notb %dil
-; ALL-NEXT:    andb $-32, %dil
+; ALL-NEXT:    xorb $-32, %dil
 ; ALL-NEXT:    movl %edi, %eax
 ; ALL-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index a128a64..894cc5d 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1153,71 +1153,31 @@
 define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
 ; SSE2-LABEL: mul_v4i64_zero_upper:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    psrlq $32, %xmm5
-; SSE2-NEXT:    pmuludq %xmm0, %xmm5
-; SSE2-NEXT:    psllq $32, %xmm5
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    paddq %xmm5, %xmm0
-; SSE2-NEXT:    paddq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm2
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; SSE2-NEXT:    psllq $32, %xmm2
-; SSE2-NEXT:    paddq %xmm4, %xmm2
-; SSE2-NEXT:    paddq %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pmuludq %xmm3, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: mul_v4i64_zero_upper:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    psrlq $32, %xmm5
-; SSE41-NEXT:    pmuludq %xmm0, %xmm5
-; SSE41-NEXT:    psllq $32, %xmm5
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    paddq %xmm5, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    pmuludq %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm4, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm4, %xmm2
-; SSE41-NEXT:    psllq $32, %xmm2
-; SSE41-NEXT:    paddq %xmm3, %xmm2
-; SSE41-NEXT:    paddq %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT:    pmuludq %xmm0, %xmm1
+; SSE41-NEXT:    pmuludq %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
@@ -1255,67 +1215,47 @@
 define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
 ; SSE2-LABEL: mul_v4i64_zero_upper_left:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    psrlq $32, %xmm5
-; SSE2-NEXT:    pmuludq %xmm0, %xmm5
-; SSE2-NEXT:    psllq $32, %xmm5
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    paddq %xmm5, %xmm0
-; SSE2-NEXT:    paddq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm3, %xmm4
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm3
-; SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; SSE2-NEXT:    psllq $32, %xmm3
-; SSE2-NEXT:    paddq %xmm4, %xmm3
-; SSE2-NEXT:    paddq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    paddq %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    pmuludq %xmm4, %xmm2
+; SSE2-NEXT:    psllq $32, %xmm2
+; SSE2-NEXT:    paddq %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: mul_v4i64_zero_upper_left:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pmuludq %xmm1, %xmm4
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    psrlq $32, %xmm5
-; SSE41-NEXT:    pmuludq %xmm3, %xmm5
-; SSE41-NEXT:    psllq $32, %xmm5
-; SSE41-NEXT:    psrlq $32, %xmm3
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE41-NEXT:    movdqa %xmm4, %xmm3
 ; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    paddq %xmm5, %xmm3
-; SSE41-NEXT:    paddq %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    psrlq $32, %xmm4
-; SSE41-NEXT:    pmuludq %xmm0, %xmm4
-; SSE41-NEXT:    psllq $32, %xmm4
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm2, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    paddq %xmm4, %xmm0
-; SSE41-NEXT:    paddq %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    pmuludq %xmm4, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    paddq %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm2
+; SSE41-NEXT:    pmuludq %xmm0, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    paddq %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX2-LABEL: mul_v4i64_zero_upper_left:
@@ -1357,72 +1297,40 @@
 define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ; SSE2-LABEL: mul_v4i64_zero_lower:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [0,4294967295,0,4294967295]
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    psrlq $32, %xmm5
-; SSE2-NEXT:    pmuludq %xmm0, %xmm5
-; SSE2-NEXT:    psllq $32, %xmm5
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm1, %xmm0
-; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    paddq %xmm5, %xmm0
-; SSE2-NEXT:    paddq %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm3, %xmm4
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm3
-; SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; SSE2-NEXT:    psllq $32, %xmm3
-; SSE2-NEXT:    paddq %xmm4, %xmm3
-; SSE2-NEXT:    paddq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,4294967295,0,4294967295]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    psllq $32, %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    pmuludq %xmm4, %xmm2
+; SSE2-NEXT:    psllq $32, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: mul_v4i64_zero_lower:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pmuludq %xmm2, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    psrlq $32, %xmm5
-; SSE41-NEXT:    pmuludq %xmm0, %xmm5
-; SSE41-NEXT:    psllq $32, %xmm5
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm2, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    paddq %xmm5, %xmm0
-; SSE41-NEXT:    paddq %xmm4, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm2
-; SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psrlq $32, %xmm4
-; SSE41-NEXT:    pmuludq %xmm3, %xmm4
-; SSE41-NEXT:    psllq $32, %xmm4
-; SSE41-NEXT:    psrlq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    paddq %xmm4, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    psrlq $32, %xmm2
+; SSE41-NEXT:    pmuludq %xmm0, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    pmuludq %xmm4, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX2-LABEL: mul_v4i64_zero_lower:
@@ -1430,11 +1338,9 @@
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
@@ -1447,11 +1353,9 @@
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
 ; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm1
 ; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
@@ -1469,131 +1373,51 @@
 define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ; SSE2-LABEL: mul_v8i64_zero_upper:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSE2-NEXT:    movdqa %xmm2, %xmm8
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; SSE2-NEXT:    movdqa %xmm3, %xmm7
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm9
-; SSE2-NEXT:    pmuludq %xmm3, %xmm9
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    psrlq $32, %xmm6
-; SSE2-NEXT:    pmuludq %xmm1, %xmm6
-; SSE2-NEXT:    psllq $32, %xmm6
-; SSE2-NEXT:    psrlq $32, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; SSE2-NEXT:    psllq $32, %xmm1
-; SSE2-NEXT:    paddq %xmm6, %xmm1
-; SSE2-NEXT:    paddq %xmm9, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    pmuludq %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm7, %xmm6
-; SSE2-NEXT:    psrlq $32, %xmm6
-; SSE2-NEXT:    pmuludq %xmm4, %xmm6
-; SSE2-NEXT:    psllq $32, %xmm6
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm7, %xmm4
-; SSE2-NEXT:    psllq $32, %xmm4
-; SSE2-NEXT:    paddq %xmm6, %xmm4
-; SSE2-NEXT:    paddq %xmm3, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    psrlq $32, %xmm6
-; SSE2-NEXT:    pmuludq %xmm0, %xmm6
-; SSE2-NEXT:    psllq $32, %xmm6
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm2, %xmm0
-; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    paddq %xmm6, %xmm0
-; SSE2-NEXT:    paddq %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    pmuludq %xmm8, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm3
-; SSE2-NEXT:    psrlq $32, %xmm3
-; SSE2-NEXT:    pmuludq %xmm5, %xmm3
-; SSE2-NEXT:    psllq $32, %xmm3
-; SSE2-NEXT:    psrlq $32, %xmm5
-; SSE2-NEXT:    pmuludq %xmm8, %xmm5
-; SSE2-NEXT:    psllq $32, %xmm5
-; SSE2-NEXT:    paddq %xmm3, %xmm5
-; SSE2-NEXT:    paddq %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm8
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; SSE2-NEXT:    pmuludq %xmm6, %xmm5
+; SSE2-NEXT:    pmuludq %xmm0, %xmm2
+; SSE2-NEXT:    pmuludq %xmm8, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: mul_v8i64_zero_upper:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE41-NEXT:    movdqa %xmm1, %xmm6
-; SSE41-NEXT:    pmuludq %xmm3, %xmm6
-; SSE41-NEXT:    movdqa %xmm3, %xmm7
-; SSE41-NEXT:    psrlq $32, %xmm7
-; SSE41-NEXT:    pmuludq %xmm1, %xmm7
-; SSE41-NEXT:    psllq $32, %xmm7
-; SSE41-NEXT:    psrlq $32, %xmm1
-; SSE41-NEXT:    pmuludq %xmm3, %xmm1
-; SSE41-NEXT:    psllq $32, %xmm1
-; SSE41-NEXT:    paddq %xmm7, %xmm1
-; SSE41-NEXT:    paddq %xmm6, %xmm1
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm2, %xmm6
-; SSE41-NEXT:    psrlq $32, %xmm6
-; SSE41-NEXT:    pmuludq %xmm0, %xmm6
-; SSE41-NEXT:    psllq $32, %xmm6
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm2, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    paddq %xmm6, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm2
-; SSE41-NEXT:    pmuludq %xmm9, %xmm2
-; SSE41-NEXT:    movdqa %xmm9, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm4, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm4
-; SSE41-NEXT:    pmuludq %xmm9, %xmm4
-; SSE41-NEXT:    psllq $32, %xmm4
-; SSE41-NEXT:    paddq %xmm3, %xmm4
-; SSE41-NEXT:    paddq %xmm2, %xmm4
-; SSE41-NEXT:    movdqa %xmm5, %xmm2
-; SSE41-NEXT:    pmuludq %xmm8, %xmm2
-; SSE41-NEXT:    movdqa %xmm8, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm5, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm5
-; SSE41-NEXT:    pmuludq %xmm8, %xmm5
-; SSE41-NEXT:    psllq $32, %xmm5
-; SSE41-NEXT:    paddq %xmm3, %xmm5
-; SSE41-NEXT:    paddq %xmm2, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,3,2,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE41-NEXT:    pmuludq %xmm1, %xmm3
+; SSE41-NEXT:    pmuludq %xmm0, %xmm2
+; SSE41-NEXT:    pmuludq %xmm6, %xmm5
+; SSE41-NEXT:    pmuludq %xmm8, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
@@ -1605,24 +1429,8 @@
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm5
-; AVX2-NEXT:    vpmuludq %ymm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpsllq $32, %ymm5, %ymm5
-; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm0, %ymm5, %ymm0
-; AVX2-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
-; AVX2-NEXT:    vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
diff --git a/test/CodeGen/X86/pr31143.ll b/test/CodeGen/X86/pr31143.ll
new file mode 100644
index 0000000..1c7d77b
--- /dev/null
+++ b/test/CodeGen/X86/pr31143.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -mattr=+sse4.2 < %s | FileCheck %s
+
+; CHECK-LABEL: testss:
+; CHECK: movss {{.*}}, %[[XMM0:xmm[0-9]+]]
+; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]]
+; CHECK: roundss $9, %[[XMM0]], %[[XMM1]]
+
+define void @testss(float* nocapture %a, <4 x float>* nocapture %b, i32 %k) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %v = load float, float* %arrayidx, align 4
+  %floor = call float @floorf(float %v)
+  %sub = fsub float %floor, %v
+  %v1 = insertelement <4 x float> undef, float %sub, i32 0
+  %br = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  store volatile <4 x float> %br, <4 x float>* %b, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %k
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: testsd:
+; CHECK: movsd {{.*}}, %[[XMM0:xmm[0-9]+]]
+; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]]
+; CHECK: roundsd $9, %[[XMM0]], %[[XMM1]]
+
+define void @testsd(double* nocapture %a, <2 x double>* nocapture %b, i32 %k) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  %v = load double, double* %arrayidx, align 4
+  %floor = call double @floor(double %v)
+  %sub = fsub double %floor, %v
+  %v1 = insertelement <2 x double> undef, double %sub, i32 0
+  %br = shufflevector <2 x double> %v1, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  store volatile <2 x double> %br, <2 x double>* %b, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %k
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @floorf(float) nounwind readnone
+
+declare double @floor(double) nounwind readnone
+
diff --git a/test/CodeGen/X86/pr31242.ll b/test/CodeGen/X86/pr31242.ll
new file mode 100644
index 0000000..273ae76
--- /dev/null
+++ b/test/CodeGen/X86/pr31242.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+declare void @raf(i32, ...)
+
+; CHECK-LABEL: test3
+; CHECK-DAG:   movl    $7, %edi
+; CHECK-DAG:   xorl    %esi, %esi
+; CHECK-DAG:   xorl    %edx, %edx
+; CHECK-DAG:   xorl    %ecx, %ecx
+; CHECK-DAG:   xorl    %r8d, %r8d
+; CHECK-DAG:   xorl    %r9d, %r9d
+; CHECK-DAG:   xorl    %eax, %eax
+; CHECK:       pushq   %rbx
+; CHECK:       pushq   $0
+; CHECK:       callq   raf
+
+
+; Function Attrs: nounwind uwtable
+define void @test3() {
+entry:
+  br label %for.body
+
+for.body:
+  %i.04 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.dbg.value(metadata i32 %i.04, i64 0, metadata !10, metadata !12), !dbg !6
+  tail call void (i32, ...) @raf(i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 %i.04)
+  %inc = add nuw nsw i32 %i.04, 1
+  %exitcond = icmp eq i32 %inc, 21
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 288844)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "pr31242.c", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 4.0.0 (trunk 288844)"}
+!6 = !DILocation(line: 2, column: 16, scope: !7)
+!7 = distinct !DISubprogram(name: "test3", scope: !1, file: !1, line: 5, type: !8, isLocal: false, isDefinition: true, scopeLine: 5, isOptimized: true, unit: !0)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocalVariable(name: "value", arg: 1, scope: !7, file: !1, line: 2, type: !11)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DIExpression()
diff --git a/test/CodeGen/X86/pr31271.ll b/test/CodeGen/X86/pr31271.ll
new file mode 100644
index 0000000..e38e176
--- /dev/null
+++ b/test/CodeGen/X86/pr31271.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=i386-unknown-linux-gnu < %s | FileCheck %s
+
+@c = external global [1 x i32], align 4
+
+; CHECK-LABEL: fn1
+; CHECK: leal c(%eax), %ecx
+define void @fn1(i32 %k) {
+  %g = getelementptr inbounds [1 x i32], [1 x i32]* @c, i32 0, i32 %k
+  %cmp = icmp ne i32* undef, %g
+  %z = zext i1 %cmp to i32
+  store i32 %z, i32* undef, align 4
+  %cmp2 = icmp eq i32* %g, null
+  br i1 %cmp2, label %u, label %r
+
+u:
+  unreachable
+
+r:
+  ret void
+}
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 8364915..d447bf9 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -56,7 +56,7 @@
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    movd %rax, %xmm1
-; CHECK-NEXT:    movaps %xmm1, (%rax)
+; CHECK-NEXT:    movdqa %xmm1, (%rax)
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
 ; CHECK-NEXT:    movdqa %xmm1, (%rax)
 ; CHECK-NEXT:    pshufb %xmm1, %xmm0
diff --git a/test/CodeGen/X86/scalar-int-to-fp.ll b/test/CodeGen/X86/scalar-int-to-fp.ll
index 9ea86b0..d39b206 100644
--- a/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -75,7 +75,7 @@
 
 ; CHECK-LABEL: u64_to_f
 ; AVX512_32: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512_32: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: vmovq %xmm0, {{[0-9]+}}(%esp)
 ; AVX512_32: fildll
 
 ; AVX512_64: vcvtusi2ssq
diff --git a/test/CodeGen/X86/seh-catchpad.ll b/test/CodeGen/X86/seh-catchpad.ll
index 99ed454..b8f1753 100644
--- a/test/CodeGen/X86/seh-catchpad.ll
+++ b/test/CodeGen/X86/seh-catchpad.ll
@@ -147,7 +147,7 @@
 
 ; CHECK: "?dtor$[[finbb]]@?0?main@4HA":
 ; CHECK: .seh_proc "?dtor$[[finbb]]@?0?main@4HA"
-; CHECK:         .seh_handler __C_specific_handler, @unwind, @except
+; CHECK-NOT:         .seh_handler
 ; CHECK: .LBB1_[[finbb]]:                                # %ehcleanup
 ; CHECK:         movq    %rdx, 16(%rsp)
 ; CHECK:         pushq   %rbp
diff --git a/test/CodeGen/X86/slow-pmulld.ll b/test/CodeGen/X86/slow-pmulld.ll
new file mode 100644
index 0000000..ff66820
--- /dev/null
+++ b/test/CodeGen/X86/slow-pmulld.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefix=CHECK32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefix=CHECK64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE4-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE4-64
+
+define <4 x i32> @foo(<4 x i8> %A) {
+; CHECK32-LABEL: foo:
+; CHECK32:       # BB#0:
+; CHECK32-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
+; CHECK32-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; CHECK32-NEXT:    movdqa %xmm0, %xmm2
+; CHECK32-NEXT:    pmullw %xmm1, %xmm0
+; CHECK32-NEXT:    pmulhw %xmm1, %xmm2
+; CHECK32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: foo:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
+; CHECK64-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; CHECK64-NEXT:    movdqa %xmm0, %xmm2
+; CHECK64-NEXT:    pmullw %xmm1, %xmm0
+; CHECK64-NEXT:    pmulhw %xmm1, %xmm2
+; CHECK64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK64-NEXT:    retq
+;
+; SSE4-32-LABEL: foo:
+; SSE4-32:       # BB#0:
+; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT:    retl
+;
+; SSE4-64-LABEL: foo:
+; SSE4-64:       # BB#0:
+; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT:    retq
+  %z = zext <4 x i8> %A to <4 x i32>
+  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
+  ret <4 x i32> %m
+}
+
+define <4 x i32> @foo_os(<4 x i8> %A) minsize {
+; CHECK32-LABEL: foo_os:
+; CHECK32:       # BB#0:
+; CHECK32-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; CHECK32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: foo_os:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK64-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK64-NEXT:    retq
+;
+; SSE4-32-LABEL: foo_os:
+; SSE4-32:       # BB#0:
+; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT:    retl
+;
+; SSE4-64-LABEL: foo_os:
+; SSE4-64:       # BB#0:
+; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT:    retq
+  %z = zext <4 x i8> %A to <4 x i32>
+  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
+  ret <4 x i32> %m
+}
diff --git a/test/CodeGen/X86/sqrt-fastmath-mir.ll b/test/CodeGen/X86/sqrt-fastmath-mir.ll
index aec8513..c613ef8 100644
--- a/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -17,10 +17,10 @@
 ; CHECK:     %10 = VFMADD213SSr %8, %9, %4
 ; CHECK:     %11 = VMULSSrr %9, %6
 ; CHECK:     %12 = VMULSSrr killed %11, killed %10
-; CHECK:     %13 = FsFLD0SS
-; CHECK:     %14 = VCMPSSrr %0, killed %13, 0
-; CHECK:     %15 = VFsANDNPSrr killed %14, killed %12
-; CHECK:     %xmm0 = COPY %15
+; CHECK:     %14 = FsFLD0SS
+; CHECK:     %15 = VCMPSSrr %0, killed %14, 0
+; CHECK:     %17 = VANDNPSrr killed %16, killed %13
+; CHECK:     %xmm0 = COPY %18
 ; CHECK:     RET 0, %xmm0
   %call = tail call float @llvm.sqrt.f32(float %f) #1
   ret float %call
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 1a78729..747bee1 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1500,6 +1500,33 @@
 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
 
 
+define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
+; SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; SSE:       ## BB#0:
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SSE-NEXT:    movaps (%eax), %xmm0 ## encoding: [0x0f,0x28,0x00]
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; AVX2-NEXT:    vmovaps (%eax), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x00]
+; AVX2-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SKX-NEXT:    vmovaps (%eax), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x00]
+; SKX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %a1 = load <2 x double>, <2 x double>* %a0, align 16
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+
+
 define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_x86_sse2_ucomieq_sd:
 ; SSE:       ## BB#0:
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index ddbf7b3..5e939cc 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -893,10 +893,12 @@
   ;CHECK-LABEL: stack_fold_extractps
   ;CHECK:       vextractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
-  %1 = extractelement <4 x float> %a0, i32 1
-  %2 = bitcast float %1 to i32
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i32 %2
+  ; fadd forces execution domain
+  %1 = fadd <4 x float> %a0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %2 = extractelement <4 x float> %1, i32 1
+  %3 = bitcast float %2 to i32
+  %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %3
 }
 
 define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
@@ -1504,15 +1506,7 @@
 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
 
 ; TODO stack_fold_rcpss
-
-define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_rcpss_int
-  ;CHECK:       vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+; TODO stack_fold_rcpss_int
 
 define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
   ;CHECK-LABEL: stack_fold_roundpd
@@ -1609,15 +1603,7 @@
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 
 ; TODO stack_fold_rsqrtss
-
-define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_rsqrtss_int
-  ;CHECK:       vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+; TODO stack_fold_rsqrtss_int
 
 define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_shufpd
@@ -1696,14 +1682,7 @@
 }
 declare double @llvm.sqrt.f64(double) nounwind readnone
 
-define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_sqrtsd_int
-  ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+; TODO stack_fold_sqrtsd_int
 
 define float @stack_fold_sqrtss(float %a0) {
   ;CHECK-LABEL: stack_fold_sqrtss
@@ -1714,14 +1693,7 @@
 }
 declare float @llvm.sqrt.f32(float) nounwind readnone
 
-define <4 x float> @stack_fold_sqrtss_int(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_sqrtss_int
-  ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+; TODO stack_fold_sqrtss_int
 
 define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_subpd
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512.ll b/test/CodeGen/X86/stack-folding-fp-avx512.ll
index 90b0c8d..4d5e8c9 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -516,5 +516,245 @@
   ret <8 x double> %4
 }
 
+define <16 x float> @stack_fold_vpermt2ps(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2ps
+  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2ps
+  ;CHECK:       vpermi2ps {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, <16 x i32>* %x1, <16 x float> %x2, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_vpermi2ps_mask
+  ;CHECK:       vpermi2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %x1b = load <16 x i32>, <16 x i32>* %x1
+  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1b, <16 x float> %x2, i16 %mask)
+  ret <16 x float> %res
+}
+
+define <16 x float> @stack_fold_vpermt2ps_mask(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_vpermt2ps_mask
+  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %x0b = load <16 x i32>, <16 x i32>* %x0
+  %res = call <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32> %x0b, <16 x float> %x1, <16 x float> %x2, i16 %mask)
+  ret <16 x float> %res
+}
+
+define <16 x float> @stack_fold_vpermt2ps_maskz(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_vpermt2ps_maskz
+  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %x0b = load <16 x i32>, <16 x i32>* %x0
+  %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0b, <16 x float> %x1, <16 x float> %x2, i16 %mask)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2pd
+  ;CHECK:       vpermt2pd {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+define <8 x double> @stack_fold_vpermi2pd(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2pd
+  ;CHECK:       vpermi2pd {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x double> @llvm.x86.avx512.mask.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
+
+define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permpd
+  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  ; fadd forces execution domain
+  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <8 x double> %3
+}
+
+define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permpd_mask
+  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x double>, <8 x double>* %passthru
+  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
+  ; fadd forces execution domain
+  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <8 x double> %6
+}
+
+define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permpd_maskz
+  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_permpdvar
+  ;CHECK:   vpermpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a1, <8 x i64> %a0, <8 x double> undef, i8 -1)
+  ; fadd forces execution domain
+  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <8 x double> %3
+}
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) nounwind readonly
+
+define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_permps
+  ;CHECK:       vpermps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0, <16 x float> undef, i16 -1)
+  ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readonly
+
+define <8 x double> @stack_fold_permilpd_zmm(<8 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd_zmm
+  ;CHECK:   vpermilpd $85, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x double> %2
+}
+
+define <8 x double> @stack_fold_permilpd_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpd_zmm_mask
+  ;CHECK:   vpermilpd $85, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x double>, <8 x double>* %passthru
+  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
+  ret <8 x double> %5
+}
+
+define <8 x double> @stack_fold_permilpd_zmm_maskz(<8 x double> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpd_zmm_maskz
+  ;CHECK:   vpermilpd $85, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+define <8 x double> @stack_fold_permilpdvar_zmm(<8 x double> %a0, <8 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_zmm
+  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1, <8 x double> undef, i8 -1)
+  ret <8 x double> %2
+}
+declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) nounwind readnone
+
+define <8 x double> @stack_fold_permilpdvar_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, <8 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_zmm_mask
+  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1, <8 x double> undef, i8 -1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x double>, <8 x double>* %passthru
+  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
+  ret <8 x double> %5
+}
+
+define <8 x double> @stack_fold_permilpdvar_zmm_maskz(<8 x double> %a0, <8 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_zmm_maskz
+  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1, <8 x double> undef, i8 -1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+define <16 x float> @stack_fold_permilps_zmm(<16 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps_zmm
+  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x float> %2
+}
+
+define <16 x float> @stack_fold_permilps_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilps_zmm_mask
+  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x float>, <16 x float>* %passthru
+  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
+  ret <16 x float> %5
+}
+
+define <16 x float> @stack_fold_permilps_zmm_maskz(<16 x float> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilps_zmm_maskz
+  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
+define <16 x float> @stack_fold_permilpsvar_zmm(<16 x float> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_zmm
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x float> undef, i16 -1)
+  ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readnone
+
+define <16 x float> @stack_fold_permilpsvar_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, <16 x i32> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_zmm_mask
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x float> undef, i16 -1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x float>, <16 x float>* %passthru
+  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
+  ret <16 x float> %5
+}
+
+define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i32> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_zmm_maskz
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x float> undef, i16 -1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
 attributes #0 = { "unsafe-fp-math"="false" }
 attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index fbece2b..198a96d 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -518,5 +518,185 @@
   ret <4 x double> %2
 }
 
+define <4 x float> @stack_fold_vpermt2ps(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2ps
+  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)
+
+define <4 x float> @stack_fold_vpermi2ps(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2ps
+  ;CHECK:       vpermi2ps {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x float> @llvm.x86.avx512.mask.vpermt2var.ps.128(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.vpermt2var.ps.128(<4 x i32>, <4 x float>, <4 x float>, i8)
+
+define <2 x double> @stack_fold_vpermt2pd(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2pd
+  ;CHECK:       vpermt2pd {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)
+
+define <2 x double> @stack_fold_vpermi2pd(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2pd
+  ;CHECK:       vpermi2pd {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <2 x double> @llvm.x86.avx512.mask.vpermt2var.pd.128(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask.vpermt2var.pd.128(<2 x i64>, <2 x double>, <2 x double>, i8)
+
+define <8 x float> @stack_fold_vpermt2ps_ymm(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2ps_ymm
+  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)
+
+define <8 x float> @stack_fold_vpermi2ps_ymm(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2ps_ymm
+  ;CHECK:       vpermi2ps {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x float> @llvm.x86.avx512.mask.vpermt2var.ps.256(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.vpermt2var.ps.256(<8 x i32>, <8 x float>, <8 x float>, i8)
+
+define <4 x double> @stack_fold_vpermt2pd_ymm(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2pd_ymm
+  ;CHECK:       vpermt2pd {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
+
+define <4 x double> @stack_fold_vpermi2pd_ymm(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2pd_ymm
+  ;CHECK:       vpermi2pd {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x double> @llvm.x86.avx512.mask.vpermt2var.pd.256(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx512.mask.vpermt2var.pd.256(<4 x i64>, <4 x double>, <4 x double>, i8)
+
+define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permpd
+  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+  ; fadd forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
+}
+
+define <4 x double> @stack_fold_permpdvar(<4 x i64> %a0, <4 x double> %a1) {
+  ;CHECK-LABEL: stack_fold_permpdvar
+  ;CHECK:   vpermpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a1, <4 x i64> %a0, <4 x double> undef, i8 -1)
+  ; fadd forces execution domain
+  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+  ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_permps
+  ;CHECK:       vpermps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
+
+define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd
+  ;CHECK:   vpermilpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x double> %2
+}
+
+define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
+  ;CHECK-LABEL: stack_fold_permilpd_ymm
+  ;CHECK:   vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x double> %2
+}
+
+define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar
+  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
+
+define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpdvar_ymm
+  ;CHECK:       vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
+  ret <4 x double> %2
+}
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
+
+define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps
+  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %2
+}
+
+define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
+  ;CHECK-LABEL: stack_fold_permilps_ymm
+  ;CHECK:   vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %2
+}
+
+define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
+
+define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_ymm
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
+  ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
+
+define <8 x float> @stack_fold_permilpsvar_ymm_maskz(<8 x float> %a0, <8 x i32> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permilpsvar_ymm_maskz
+  ;CHECK:       vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
+  ret <8 x float> %4
+}
+
 attributes #0 = { "unsafe-fp-math"="false" }
 attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index bc346a6..c577827 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -613,10 +613,12 @@
   ;CHECK-LABEL: stack_fold_extractps
   ;CHECK:       extractps $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
-  %1 = extractelement <4 x float> %a0, i32 1
-  %2 = bitcast float %1 to i32
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i32 %2
+  ; fadd forces execution domain
+  %1 = fadd <4 x float> %a0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %2 = extractelement <4 x float> %1, i32 1
+  %3 = bitcast float %2 to i32
+  %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %3
 }
 
 define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
diff --git a/test/CodeGen/X86/stack-folding-int-avx1.ll b/test/CodeGen/X86/stack-folding-int-avx1.ll
index 3e3bbb9..c2f1053 100644
--- a/test/CodeGen/X86/stack-folding-int-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx1.ll
@@ -428,16 +428,18 @@
 
 define i32 @stack_fold_pextrd(<4 x i32> %a0) {
   ;CHECK-LABEL: stack_fold_pextrd
-  ;CHECK:       pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ;CHECK:       vpextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
-  %1 = extractelement <4 x i32> %a0, i32 1
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i32 %1
+  ; add forces execution domain
+  %1 = add <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %2 = extractelement <4 x i32> %1, i32 1
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %2
 }
 
 define i64 @stack_fold_pextrq(<2 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_pextrq
-  ;CHECK:       pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  ;CHECK:       vpextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
   ;CHECK:       movq    {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
   %1 = extractelement <2 x i64> %a0, i32 1
   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 5203f8d..0d17086 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -426,42 +426,6 @@
   ret <2 x i64> %3
 }
 
-define <16 x float> @stack_fold_vpermt2ps(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2ps
-  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
-
-define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2ps
-  ;CHECK:       vpermi2ps {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
-
-define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2pd
-  ;CHECK:       vpermt2pd {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
-
-define <8 x double> @stack_fold_vpermi2pd(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2pd
-  ;CHECK:       vpermi2pd {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x double> @llvm.x86.avx512.mask.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.mask.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
-
 define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
   ;CHECK-LABEL: stack_fold_vpermt2d
   ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
@@ -883,60 +847,6 @@
 }
 declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readonly
 
-define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_permpd
-  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
-  ; fadd forces execution domain
-  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
-  ret <8 x double> %3
-}
-
-define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_permpd_mask
-  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
-  %3 = bitcast i8 %mask to <8 x i1>
-  ; load needed to keep the operation from being scheduled above the asm block
-  %4 = load <8 x double>, <8 x double>* %passthru
-  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
-  ; fadd forces execution domain
-  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
-  ret <8 x double> %6
-}
-
-define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_permpd_maskz
-  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
-  ret <8 x double> %4
-}
-
-define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_permpdvar
-  ;CHECK:   vpermpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a1, <8 x i64> %a0, <8 x double> undef, i8 -1)
-  ; fadd forces execution domain
-  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
-  ret <8 x double> %3
-}
-declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) nounwind readonly
-
-define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_permps
-  ;CHECK:       vpermps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0, <16 x float> undef, i16 -1)
-  ret <16 x float> %2
-}
-declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readonly
-
 define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_permq
   ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index da26919..7d816c7 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -483,42 +483,6 @@
   ret <4 x i64> %3
 }
 
-define <4 x float> @stack_fold_vpermt2ps(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2ps
-  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)
-
-define <4 x float> @stack_fold_vpermi2ps(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2ps
-  ;CHECK:       vpermi2ps {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x float> @llvm.x86.avx512.mask.vpermt2var.ps.128(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.mask.vpermt2var.ps.128(<4 x i32>, <4 x float>, <4 x float>, i8)
-
-define <2 x double> @stack_fold_vpermt2pd(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2pd
-  ;CHECK:       vpermt2pd {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)
-
-define <2 x double> @stack_fold_vpermi2pd(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2pd
-  ;CHECK:       vpermi2pd {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <2 x double> @llvm.x86.avx512.mask.vpermt2var.pd.128(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask.vpermt2var.pd.128(<2 x i64>, <2 x double>, <2 x double>, i8)
-
 define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
   ;CHECK-LABEL: stack_fold_vpermt2d
   ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
@@ -591,42 +555,6 @@
 }
 declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
 
-define <8 x float> @stack_fold_vpermt2ps_ymm(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2ps_ymm
-  ;CHECK:       vpermt2ps {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)
-
-define <8 x float> @stack_fold_vpermi2ps_ymm(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2ps_ymm
-  ;CHECK:       vpermi2ps {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x float> @llvm.x86.avx512.mask.vpermt2var.ps.256(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx512.mask.vpermt2var.ps.256(<8 x i32>, <8 x float>, <8 x float>, i8)
-
-define <4 x double> @stack_fold_vpermt2pd_ymm(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2pd_ymm
-  ;CHECK:       vpermt2pd {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
-
-define <4 x double> @stack_fold_vpermi2pd_ymm(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2pd_ymm
-  ;CHECK:       vpermi2pd {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x double> @llvm.x86.avx512.mask.vpermt2var.pd.256(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx512.mask.vpermt2var.pd.256(<4 x i64>, <4 x double>, <4 x double>, i8)
-
 define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
   ;CHECK-LABEL: stack_fold_vpermt2d_ymm
   ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
@@ -1350,36 +1278,6 @@
 }
 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
 
-define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_permpd
-  ;CHECK:   vpermpd $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
-  ; fadd forces execution domain
-  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
-  ret <4 x double> %3
-}
-
-define <4 x double> @stack_fold_permpdvar(<4 x i64> %a0, <4 x double> %a1) {
-  ;CHECK-LABEL: stack_fold_permpdvar
-  ;CHECK:   vpermpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a1, <4 x i64> %a0, <4 x double> undef, i8 -1)
-  ; fadd forces execution domain
-  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
-  ret <4 x double> %3
-}
-declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) nounwind readonly
-
-define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_permps
-  ;CHECK:       vpermps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
-  ret <8 x float> %2
-}
-declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
-
 define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_permq
   ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
index f732607..a839a31 100644
--- a/test/CodeGen/X86/stack-folding-int-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -457,9 +457,11 @@
   ;CHECK-LABEL: stack_fold_pextrd
   ;CHECK:       pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
   ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
-  %1 = extractelement <4 x i32> %a0, i32 1
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i32 %1
+  ; add forces execution domain
+  %1 = add <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %2 = extractelement <4 x i32> %1, i32 1
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %2
 }
 
 define i64 @stack_fold_pextrq(<2 x i64> %a0) {
diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index d2b78a8..fbfd1bd 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll
@@ -8,7 +8,7 @@
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    orpd %xmm0, %xmm1
+; CHECK-NEXT:    por %xmm0, %xmm1
 ; CHECK-NEXT:    subsd %xmm0, %xmm1
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 4641e4a..4ae95ba 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2343,7 +2343,7 @@
 ; SSE-NEXT:    movq %rcx, %rsi
 ; SSE-NEXT:    callq __fixtfdi
 ; SSE-NEXT:    movd %rax, %xmm0
-; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    movq %rbx, %rdi
 ; SSE-NEXT:    movq %r14, %rsi
 ; SSE-NEXT:    callq __fixtfdi
@@ -2368,7 +2368,7 @@
 ; VEX-NEXT:    movq %rcx, %rsi
 ; VEX-NEXT:    callq __fixtfdi
 ; VEX-NEXT:    vmovq %rax, %xmm0
-; VEX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; VEX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; VEX-NEXT:    movq %rbx, %rdi
 ; VEX-NEXT:    movq %r14, %rsi
 ; VEX-NEXT:    callq __fixtfdi
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 8019e11..8adc0e6 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -90,7 +90,7 @@
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-16, %esp
 ; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    movaps %xmm0, (%esp)
+; X32-NEXT:    movdqa %xmm0, (%esp)
 ; X32-NEXT:    movd %xmm0, (%esp,%eax,4)
 ; X32-NEXT:    movaps (%esp), %xmm0
 ; X32-NEXT:    movl %ebp, %esp
@@ -99,7 +99,7 @@
 ;
 ; X64-LABEL: t3:
 ; X64:       # BB#0:
-; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movslq %edi, %rax
 ; X64-NEXT:    movd %xmm0, -24(%rsp,%rax,4)
 ; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
index c4b7f20..b4a58de 100644
--- a/test/CodeGen/X86/vec_shift6.ll
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -153,14 +153,14 @@
 ;
 ; AVX2-LABEL: test7:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test7:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512-NEXT:    retq
@@ -183,7 +183,7 @@
 ;
 ; AVX2-LABEL: test8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 5b6c2be..ff0d1bb 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -466,7 +466,7 @@
 define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm4
 ; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512F-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
@@ -482,7 +482,7 @@
 ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm3
 ; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
 ; AVX512BW-NEXT:    vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
@@ -498,7 +498,7 @@
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
 ; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512DQ-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm4
 ; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 707cafe..d7073d6 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -341,3 +341,29 @@
   %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
   ret <16 x i8> %1
 }
+
+define <4 x float> @PR31296(i8* %in) {
+; X32-LABEL: PR31296:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vmovaps {{.*#+}} xmm1 = <0,1,u,u>
+; X32-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: PR31296:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    vmovq %rax, %xmm0
+; X64-NEXT:    vmovaps {{.*#+}} xmm1 = <0,1,u,u>
+; X64-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,0,1]
+; X64-NEXT:    retq
+entry:
+  %0 = getelementptr i8, i8* %in, i32 0
+  %1 = bitcast i8* %0 to i32*
+  %2 = load i32, i32* %1
+  %3 = zext i32 %2 to i128
+  %4 = bitcast i128 %3 to <4 x float>
+  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float 1.000000e+00, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 5>
+  ret <4 x float> %5
+}
diff --git a/test/DebugInfo/AArch64/frameindices.ll b/test/DebugInfo/AArch64/frameindices.ll
index 2aaf169..4514f01 100644
--- a/test/DebugInfo/AArch64/frameindices.ll
+++ b/test/DebugInfo/AArch64/frameindices.ll
@@ -235,15 +235,15 @@
 !73 = !DILocation(line: 17, column: 27, scope: !31)
 !74 = !DILocalVariable(name: "p1", line: 17, arg: 1, scope: !31, file: !26, type: !4)
 !75 = distinct !DILocation(line: 22, column: 3, scope: !34)
-!76 = !DIExpression(DW_OP_bit_piece, 8, 120)
+!76 = !DIExpression(DW_OP_LLVM_fragment, 8, 120)
 !77 = !DILocation(line: 17, column: 12, scope: !31, inlinedAt: !75)
-!78 = !DIExpression(DW_OP_bit_piece, 136, 56)
+!78 = !DIExpression(DW_OP_LLVM_fragment, 136, 56)
 !79 = !DIExpression(DW_OP_deref)
 !80 = !DILocation(line: 19, column: 5, scope: !34)
 !81 = !DILocation(line: 20, column: 7, scope: !34)
 !82 = !DILocation(line: 20, column: 5, scope: !34)
-!83 = !DIExpression(DW_OP_bit_piece, 0, 8)
-!84 = !DIExpression(DW_OP_bit_piece, 128, 8)
+!83 = !DIExpression(DW_OP_LLVM_fragment, 0, 8)
+!84 = !DIExpression(DW_OP_LLVM_fragment, 128, 8)
 !85 = !DILocation(line: 13, column: 12, scope: !25, inlinedAt: !86)
 !86 = distinct !DILocation(line: 17, column: 18, scope: !31, inlinedAt: !75)
 !87 = !DILocation(line: 14, column: 37, scope: !25, inlinedAt: !86)
diff --git a/test/DebugInfo/ARM/PR26163.ll b/test/DebugInfo/ARM/PR26163.ll
index cd8c9b1..fd75cee 100644
--- a/test/DebugInfo/ARM/PR26163.ll
+++ b/test/DebugInfo/ARM/PR26163.ll
@@ -100,7 +100,7 @@
 !27 = !DILocation(line: 11, scope: !11, inlinedAt: !28)
 !28 = distinct !DILocation(line: 26, scope: !4)
 !29 = !DILocation(line: 13, scope: !11, inlinedAt: !28)
-!30 = !DIExpression(DW_OP_bit_piece, 0, 64)
-!31 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!30 = !DIExpression(DW_OP_LLVM_fragment, 0, 64)
+!31 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 !32 = !DILocation(line: 18, scope: !11, inlinedAt: !28)
 !33 = !DILocation(line: 27, scope: !4)
diff --git a/test/DebugInfo/ARM/split-complex.ll b/test/DebugInfo/ARM/split-complex.ll
index f681cf4..4c47a29 100644
--- a/test/DebugInfo/ARM/split-complex.ll
+++ b/test/DebugInfo/ARM/split-complex.ll
@@ -51,6 +51,6 @@
 !14 = !DILocalVariable(name: "c", arg: 1, scope: !4, file: !5, line: 1, type: !8)
 !15 = !DIExpression()
 !16 = !DILocation(line: 1, column: 24, scope: !4)
-!17 = !DIExpression(DW_OP_bit_piece, 0, 64)
-!18 = !DIExpression(DW_OP_bit_piece, 64, 64)
+!17 = !DIExpression(DW_OP_LLVM_fragment, 0, 64)
+!18 = !DIExpression(DW_OP_LLVM_fragment, 64, 64)
 !19 = !DILocation(line: 1, column: 36, scope: !4)
diff --git a/test/DebugInfo/ARM/sroa-complex.ll b/test/DebugInfo/ARM/sroa-complex.ll
index d1fb9a3..7c7b254 100644
--- a/test/DebugInfo/ARM/sroa-complex.ll
+++ b/test/DebugInfo/ARM/sroa-complex.ll
@@ -26,8 +26,8 @@
   ; CHECK-SAME:                      metadata ![[C]], metadata ![[IMG:.*]])
   ret void, !dbg !18
 }
-; CHECK: ![[REAL]] = !DIExpression(DW_OP_bit_piece, 0, 64)
-; CHECK: ![[IMG]] = !DIExpression(DW_OP_bit_piece, 64, 64)
+; CHECK: ![[REAL]] = !DIExpression(DW_OP_LLVM_fragment, 0, 64)
+; CHECK: ![[IMG]] = !DIExpression(DW_OP_LLVM_fragment, 64, 64)
 
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
diff --git a/test/DebugInfo/COFF/pieces.ll b/test/DebugInfo/COFF/pieces.ll
index bd2551a..8252b59 100644
--- a/test/DebugInfo/COFF/pieces.ll
+++ b/test/DebugInfo/COFF/pieces.ll
@@ -35,27 +35,27 @@
 ; }
 
 ; ASM-LABEL: loop_csr: # @loop_csr
-; ASM:        #DEBUG_VALUE: loop_csr:o [bit_piece offset=0 size=32] <- 0
-; ASM:        #DEBUG_VALUE: loop_csr:o [bit_piece offset=32 size=32] <- 0
+; ASM:        #DEBUG_VALUE: loop_csr:o [fragment offset=0 size=32] <- 0
+; ASM:        #DEBUG_VALUE: loop_csr:o [fragment offset=32 size=32] <- 0
 ; ASM: # BB#2:                                 # %for.body.preheader
 ; ASM:         xorl    %edi, %edi
 ; ASM:         xorl    %esi, %esi
 ; ASM:         .p2align        4, 0x90
 ; ASM: .LBB0_3:                                # %for.body
 ; ASM: [[ox_start:\.Ltmp[0-9]+]]:
-; ASM:        #DEBUG_VALUE: loop_csr:o [bit_piece offset=0 size=32] <- %EDI
+; ASM:        #DEBUG_VALUE: loop_csr:o [fragment offset=0 size=32] <- %EDI
 ; ASM:        .cv_loc 0 1 13 11               # t.c:13:11
 ; ASM:        movl    %edi, %ecx
 ; ASM:        callq   g
 ; ASM:        movl    %eax, %edi
 ; ASM: [[oy_start:\.Ltmp[0-9]+]]:
-; ASM:         #DEBUG_VALUE: loop_csr:o [bit_piece offset=0 size=32] <- %EDI
-; ASM:         #DEBUG_VALUE: loop_csr:o [bit_piece offset=32 size=32] <- %ESI
+; ASM:         #DEBUG_VALUE: loop_csr:o [fragment offset=0 size=32] <- %EDI
+; ASM:         #DEBUG_VALUE: loop_csr:o [fragment offset=32 size=32] <- %ESI
 ; ASM:         .cv_loc 0 1 14 11               # t.c:14:11
 ; ASM:         movl    %esi, %ecx
 ; ASM:         callq   g
 ; ASM:         movl    %eax, %esi
-; ASM:         #DEBUG_VALUE: loop_csr:o [bit_piece offset=32 size=32] <- %ESI
+; ASM:         #DEBUG_VALUE: loop_csr:o [fragment offset=32 size=32] <- %ESI
 ; ASM:         cmpl    n(%rip), %eax
 ; ASM:         jl      .LBB0_3
 ; ASM: [[oy_end:\.Ltmp[0-9]+]]:
@@ -64,13 +64,13 @@
 
 
 ; ASM-LABEL: pad_right: # @pad_right
-; ASM:         #DEBUG_VALUE: pad_right:o [bit_piece offset=32 size=32] <- %ECX
+; ASM:         #DEBUG_VALUE: pad_right:o [fragment offset=32 size=32] <- %ECX
 ; ASM:         movl    %ecx, %eax
 ; ASM:         retq
 
 
 ; ASM-LABEL: pad_left: # @pad_left
-; ASM:         #DEBUG_VALUE: pad_left:o [bit_piece offset=0 size=32] <- %ECX
+; ASM:         #DEBUG_VALUE: pad_left:o [fragment offset=0 size=32] <- %ECX
 ; ASM:         .cv_loc 2 1 24 3                # t.c:24:3
 ; ASM:         movl    %ecx, %eax
 ; ASM:         retq
@@ -80,16 +80,16 @@
 ; ASM:         #DEBUG_VALUE: nested:o <- [%RCX+0]
 ; ASM:         movl    12(%rcx), %eax
 ; ASM: [[p_start:\.Ltmp[0-9]+]]:
-; ASM:         #DEBUG_VALUE: nested:p [bit_piece offset=32 size=32] <- %EAX
+; ASM:         #DEBUG_VALUE: nested:p [fragment offset=32 size=32] <- %EAX
 ; ASM:         retq
 
 ; ASM-LABEL: bitpiece_spill: # @bitpiece_spill
-; ASM:         #DEBUG_VALUE: bitpiece_spill:o [bit_piece offset=0 size=32] <- 0
+; ASM:         #DEBUG_VALUE: bitpiece_spill:o [fragment offset=0 size=32] <- 0
 ; ASM:         xorl    %ecx, %ecx
 ; ASM:         callq   g
 ; ASM:         movl    %eax, [[offset_o_x:[0-9]+]](%rsp)          # 4-byte Spill
 ; ASM: [[spill_o_x_start:\.Ltmp[0-9]+]]:
-; ASM:         #DEBUG_VALUE: bitpiece_spill:o [bit_piece offset=32 size=32] <- [%RSP+[[offset_o_x]]]
+; ASM:         #DEBUG_VALUE: bitpiece_spill:o [fragment offset=32 size=32] <- [%RSP+[[offset_o_x]]]
 ; ASM:         #APP
 ; ASM:         #NO_APP
 ; ASM:         movl    [[offset_o_x]](%rsp), %eax          # 4-byte Reload
@@ -360,8 +360,8 @@
 !16 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !13, file: !1, line: 1, baseType: !10, size: 32, align: 32, offset: 32)
 !17 = !DIExpression()
 !18 = !DILocation(line: 11, column: 18, scope: !7)
-!19 = !DIExpression(DW_OP_bit_piece, 0, 32)
-!20 = !DIExpression(DW_OP_bit_piece, 32, 32)
+!19 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+!20 = !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 !21 = !DILocation(line: 12, column: 23, scope: !22)
 !22 = distinct !DILexicalBlock(scope: !23, file: !1, line: 12, column: 3)
 !23 = distinct !DILexicalBlock(scope: !7, file: !1, line: 12, column: 3)
diff --git a/test/DebugInfo/Generic/inline-debug-loc.ll b/test/DebugInfo/Generic/inline-debug-loc.ll
new file mode 100644
index 0000000..032dc8e
--- /dev/null
+++ b/test/DebugInfo/Generic/inline-debug-loc.ll
@@ -0,0 +1,47 @@
+; RUN: opt -inline -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function @bar contains instruction %cmp which is not associated to any debug
+; location. This test verifies that the inliner doesn't incorrectly attribute
+; the callsite debug location to %cmp.
+
+define i32 @bar(i32 %a, i32 %b) #0 !dbg !6 {
+entry:
+  %inc = add i32 %a, 1, !dbg !8
+  %cmp = icmp slt i32 %inc, %b
+  %select = select i1 %cmp, i32 %a, i32 %b, !dbg !8
+  ret i32 %select, !dbg !8
+}
+
+
+; CHECK-LABEL: define i32 @baz(
+; CHECK: entry:
+; CHECK:   %[[INC:[a-z0-9.]+]] = add i32 %a, 1, !dbg ![[DL:[0-9]+]]
+; CHECK:   %[[CMP:[a-z0-9.]+]] = icmp slt i32 %[[INC]], %b
+; CHECK-NOT: !dbg
+; CHECK:   %[[SELECT:[a-z0-9.]+]] = select i1 %[[CMP]], i32 %a, i32 %b, !dbg ![[DL]]
+;
+; ![[DL]] = !DILocation(line: 3, scope: !{{.*}}, inlinedAt: {{.*}})
+
+define i32 @baz(i32 %a, i32 %b) !dbg !9 {
+entry:
+  %call = tail call i32 @bar(i32 %a, i32 %b), !dbg !10
+  ret i32 %call, !dbg !10
+}
+
+attributes #0 = { alwaysinline }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 3, scope: !6)
+!9 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 11, type: !7, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!10 = !DILocation(line: 12, scope: !9)
diff --git a/test/DebugInfo/Generic/piece-verifier.ll b/test/DebugInfo/Generic/piece-verifier.ll
index 450380a..028f408 100644
--- a/test/DebugInfo/Generic/piece-verifier.ll
+++ b/test/DebugInfo/Generic/piece-verifier.ll
@@ -46,10 +46,10 @@
 !21 = !DILocation(line: 3, scope: !4)
 !22 = !DILocalVariable(name: "s", line: 3, arg: 1, scope: !4, file: !5, type: !9)
 !23 = !DILocation(line: 4, scope: !4)
-!24 = !DIExpression(DW_OP_deref, DW_OP_bit_piece, 0, 64)
+!24 = !DIExpression(DW_OP_deref, DW_OP_LLVM_fragment, 0, 64)
 !25 = !{}
-; This expression has elements after DW_OP_bit_piece.
+; This expression has elements after DW_OP_LLVM_fragment.
 ; CHECK: invalid expression
 ; CHECK-NEXT: !DIExpression({{[0-9]+}}, 64, 32, {{[0-9]+}})
 ; CHECK-NOT: invalid expression
-!27 = !DIExpression(DW_OP_bit_piece, 64, 32, DW_OP_deref)
+!27 = !DIExpression(DW_OP_LLVM_fragment, 64, 32, DW_OP_deref)
diff --git a/test/DebugInfo/X86/PR26148.ll b/test/DebugInfo/X86/PR26148.ll
index 62c3157..0b2b082 100644
--- a/test/DebugInfo/X86/PR26148.ll
+++ b/test/DebugInfo/X86/PR26148.ll
@@ -95,9 +95,9 @@
 !27 = !DILocation(line: 5, column: 16, scope: !4)
 !28 = !DILocation(line: 6, column: 13, scope: !4)
 !29 = !DILocation(line: 6, column: 16, scope: !4)
-!30 = !DIExpression(DW_OP_bit_piece, 0, 32)
-!31 = !DIExpression(DW_OP_bit_piece, 32, 32)
-!32 = !DIExpression(DW_OP_bit_piece, 32, 16)
+!30 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+!31 = !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+!32 = !DIExpression(DW_OP_LLVM_fragment, 32, 16)
 !33 = !DILocation(line: 8, column: 9, scope: !4)
 !34 = !DILocation(line: 9, column: 1, scope: !4)
 !35 = !DILocation(line: 11, column: 14, scope: !17)
diff --git a/test/DebugInfo/X86/array2.ll b/test/DebugInfo/X86/array2.ll
index 2289e32..0d4e975 100644
--- a/test/DebugInfo/X86/array2.ll
+++ b/test/DebugInfo/X86/array2.ll
@@ -18,7 +18,7 @@
 ; CHECK: define i32 @main
 ; CHECK: call void @llvm.dbg.value(metadata i32 42, i64 0, metadata ![[ARRAY:[0-9]+]], metadata ![[EXPR:[0-9]+]])
 ; CHECK: ![[ARRAY]] = !DILocalVariable(name: "array",{{.*}} line: 6
-; CHECK: ![[EXPR]] = !DIExpression(DW_OP_bit_piece, 0, 32)
+; CHECK: ![[EXPR]] = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
 
diff --git a/test/DebugInfo/X86/deleted-bit-piece.ll b/test/DebugInfo/X86/deleted-bit-piece.ll
index dace078..4346b5b 100644
--- a/test/DebugInfo/X86/deleted-bit-piece.ll
+++ b/test/DebugInfo/X86/deleted-bit-piece.ll
@@ -41,5 +41,5 @@
 !13 = !DISubroutineType(types: !14)
 !14 = !{null}
 !15 = !DILocalVariable(name: "v", scope: !12, type: !6)
-!16 = !DIExpression(DW_OP_bit_piece, 32, 32)
+!16 = !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 !17 = !DILocation(line: 2755, column: 9, scope: !12)
diff --git a/test/DebugInfo/X86/nophysreg.ll b/test/DebugInfo/X86/nophysreg.ll
index 7f0c5fc..9e783df 100644
--- a/test/DebugInfo/X86/nophysreg.ll
+++ b/test/DebugInfo/X86/nophysreg.ll
@@ -168,9 +168,9 @@
 !30 = !{i32 2, !"Debug Info Version", i32 3}
 !31 = !{i32 1, !"PIC Level", i32 2}
 !32 = !{!"clang version 3.7.0 (trunk 227088) (llvm/trunk 227091)"}
-!33 = !DIExpression(DW_OP_bit_piece, 0, 8)
+!33 = !DIExpression(DW_OP_LLVM_fragment, 0, 8)
 !34 = !DILocation(line: 7, column: 42, scope: !11)
-!35 = !DIExpression(DW_OP_bit_piece, 8, 4)
+!35 = !DIExpression(DW_OP_LLVM_fragment, 8, 4)
 !36 = !DIExpression()
 !37 = !DILocation(line: 7, column: 48, scope: !11)
 !38 = !DILocation(line: 7, column: 66, scope: !11)
diff --git a/test/DebugInfo/X86/pieces-1.ll b/test/DebugInfo/X86/pieces-1.ll
index cc6c330..64a0def 100644
--- a/test/DebugInfo/X86/pieces-1.ll
+++ b/test/DebugInfo/X86/pieces-1.ll
@@ -73,6 +73,6 @@
 !21 = !DILocation(line: 3, scope: !4)
 !22 = !DILocalVariable(name: "s", line: 3, arg: 1, scope: !4, file: !5, type: !9)
 !23 = !DILocation(line: 4, scope: !4)
-!24 = !DIExpression(DW_OP_bit_piece, 0, 64)
+!24 = !DIExpression(DW_OP_LLVM_fragment, 0, 64)
 !25 = !{}
-!27 = !DIExpression(DW_OP_bit_piece, 64, 32)
+!27 = !DIExpression(DW_OP_LLVM_fragment, 64, 32)
diff --git a/test/DebugInfo/X86/pieces-2.ll b/test/DebugInfo/X86/pieces-2.ll
index 66eea6b..146344e 100644
--- a/test/DebugInfo/X86/pieces-2.ll
+++ b/test/DebugInfo/X86/pieces-2.ll
@@ -90,6 +90,6 @@
 !26 = !DILocation(line: 10, scope: !4)
 !27 = !DILocation(line: 11, scope: !4)
 !28 = !DILocalVariable(name: "i1", line: 11, scope: !4, file: !5, type: !14)
-!29 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!29 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 !31 = !{i32 3, i32 0, i32 12}
 !32 = !DILocation(line: 12, scope: !4)
diff --git a/test/DebugInfo/X86/pieces-3.ll b/test/DebugInfo/X86/pieces-3.ll
index d6bea3f..fd5a8b8 100644
--- a/test/DebugInfo/X86/pieces-3.ll
+++ b/test/DebugInfo/X86/pieces-3.ll
@@ -99,15 +99,15 @@
 !22 = !{i32 1, !"Debug Info Version", i32 3}
 !23 = !{!"clang version 3.5.0 "}
 !24 = !DILocalVariable(name: "outer", line: 10, arg: 1, scope: !4, file: !5, type: !9)
-!25 = !DIExpression(DW_OP_bit_piece, 0, 64)
+!25 = !DIExpression(DW_OP_LLVM_fragment, 0, 64)
 !26 = !DILocation(line: 10, scope: !4)
 !27 = !DILocalVariable(name: "outer", line: 10, arg: 1, scope: !4, file: !5, type: !9)
-!28 = !DIExpression(DW_OP_bit_piece, 64, 64)
+!28 = !DIExpression(DW_OP_LLVM_fragment, 64, 64)
 !29 = !DILocalVariable(name: "outer", line: 10, arg: 1, scope: !4, file: !5, type: !9)
-!30 = !DIExpression(DW_OP_bit_piece, 96, 32)
+!30 = !DIExpression(DW_OP_LLVM_fragment, 96, 32)
 !31 = !DILocalVariable(name: "outer", line: 10, arg: 1, scope: !4, file: !5, type: !9)
-!32 = !DIExpression(DW_OP_bit_piece, 64, 32)
+!32 = !DIExpression(DW_OP_LLVM_fragment, 64, 32)
 !33 = !DILocation(line: 11, scope: !4)
 !34 = !DILocalVariable(name: "i1", line: 11, scope: !4, file: !5, type: !14)
-!35 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!35 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 !36 = !DILocation(line: 12, scope: !4)
diff --git a/test/DebugInfo/X86/pieces-4.ll b/test/DebugInfo/X86/pieces-4.ll
index 7dfa887..1244636 100644
--- a/test/DebugInfo/X86/pieces-4.ll
+++ b/test/DebugInfo/X86/pieces-4.ll
@@ -15,8 +15,8 @@
 ; CHECK-LABEL: bitpiece_spill:                         # @bitpiece_spill
 ; CHECK:               callq   g
 ; CHECK:               movl    %eax, [[offs:[0-9]+]](%rsp)          # 4-byte Spill
-; CHECK:               #DEBUG_VALUE: bitpiece_spill:o [bit_piece offset=32 size=32] <- 0
-; CHECK:               #DEBUG_VALUE: bitpiece_spill:o [bit_piece offset=0 size=32] <- [%RSP+[[offs]]]
+; CHECK:               #DEBUG_VALUE: bitpiece_spill:o [fragment offset=32 size=32] <- 0
+; CHECK:               #DEBUG_VALUE: bitpiece_spill:o [fragment offset=0 size=32] <- [%RSP+[[offs]]]
 ; CHECK:               #APP
 ; CHECK:               #NO_APP
 ; CHECK:               movl    [[offs]](%rsp), %eax          # 4-byte Reload
@@ -77,8 +77,8 @@
 !17 = !DIExpression()
 !18 = !DILocation(line: 4, column: 18, scope: !7)
 !19 = !DILocation(line: 4, column: 23, scope: !7)
-!20 = !DIExpression(DW_OP_bit_piece, 0, 32)
-!21 = !DIExpression(DW_OP_bit_piece, 32, 32)
+!20 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+!21 = !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 !22 = !DILocation(line: 6, column: 3, scope: !7)
 !23 = !{i32 138}
 !24 = !DILocation(line: 8, column: 3, scope: !7)
diff --git a/test/DebugInfo/X86/sroasplit-1.ll b/test/DebugInfo/X86/sroasplit-1.ll
index 1c73e7f..6a9077a 100644
--- a/test/DebugInfo/X86/sroasplit-1.ll
+++ b/test/DebugInfo/X86/sroasplit-1.ll
@@ -26,8 +26,8 @@
 ; CHECK: ret i32 %[[A]]
 ; Read Var and Piece:
 ; CHECK: ![[VAR]] = !DILocalVariable(name: "i1",{{.*}} line: 11,
-; CHECK: ![[PIECE1]] = !DIExpression(DW_OP_bit_piece, 32, 96)
-; CHECK: ![[PIECE2]] = !DIExpression(DW_OP_bit_piece, 0, 32)
+; CHECK: ![[PIECE1]] = !DIExpression(DW_OP_LLVM_fragment, 32, 96)
+; CHECK: ![[PIECE2]] = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
diff --git a/test/DebugInfo/X86/sroasplit-2.ll b/test/DebugInfo/X86/sroasplit-2.ll
index 80a05d9..9735e1e 100644
--- a/test/DebugInfo/X86/sroasplit-2.ll
+++ b/test/DebugInfo/X86/sroasplit-2.ll
@@ -24,10 +24,10 @@
 ; CHECK:  call void @llvm.dbg.value(metadata i64 %outer.coerce1, i64 0, metadata ![[O]], metadata ![[PIECE2:[0-9]+]]),
 ; CHECK:  call void @llvm.dbg.value({{.*}}, i64 0, metadata ![[I1:[0-9]+]], metadata ![[PIECE3:[0-9]+]]),
 ; CHECK-DAG: ![[O]] = !DILocalVariable(name: "outer",{{.*}} line: 10
-; CHECK-DAG: ![[PIECE1]] = !DIExpression(DW_OP_bit_piece, 0, 64)
-; CHECK-DAG: ![[PIECE2]] = !DIExpression(DW_OP_bit_piece, 64, 64)
+; CHECK-DAG: ![[PIECE1]] = !DIExpression(DW_OP_LLVM_fragment, 0, 64)
+; CHECK-DAG: ![[PIECE2]] = !DIExpression(DW_OP_LLVM_fragment, 64, 64)
 ; CHECK-DAG: ![[I1]] = !DILocalVariable(name: "i1",{{.*}} line: 11
-; CHECK-DAG: ![[PIECE3]] = !DIExpression(DW_OP_bit_piece, 0, 32)
+; CHECK-DAG: ![[PIECE3]] = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 
 ; ModuleID = 'sroasplit-2.c'
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/DebugInfo/X86/sroasplit-3.ll b/test/DebugInfo/X86/sroasplit-3.ll
index 197994f..0b94b0f 100644
--- a/test/DebugInfo/X86/sroasplit-3.ll
+++ b/test/DebugInfo/X86/sroasplit-3.ll
@@ -6,7 +6,7 @@
 ; CHECK: call void @llvm.dbg.value(metadata float %s.coerce, i64 0, metadata ![[VAR:[0-9]+]], metadata ![[EXPR:[0-9]+]])
 ; CHECK: ![[VAR]] = !DILocalVariable(name: "s",{{.*}} line: 3,
 ; CHECK: ![[EXPR]] = !DIExpression(
-; CHECK-NOT:                       DW_OP_bit_piece
+; CHECK-NOT:                       DW_OP_LLVM_fragment
 
 ;
 ; struct S { float f; };
diff --git a/test/DebugInfo/X86/sroasplit-4.ll b/test/DebugInfo/X86/sroasplit-4.ll
index 76cd7da..8653ecb 100644
--- a/test/DebugInfo/X86/sroasplit-4.ll
+++ b/test/DebugInfo/X86/sroasplit-4.ll
@@ -7,10 +7,10 @@
 ; CHECK: call void @llvm.dbg.value(metadata i64 %[[T1]], i64 0, metadata ![[Y]], metadata ![[P2:.*]])
 ; CHECK: call void @llvm.dbg.value(metadata i64 %[[T]], i64 0, metadata ![[R:.*]], metadata ![[P3:.*]])
 ; CHECK: call void @llvm.dbg.value(metadata i64 %[[T1]], i64 0, metadata ![[R]], metadata ![[P4:.*]])
-; CHECK: ![[P1]] = !DIExpression(DW_OP_bit_piece, 0, 64)
-; CHECK: ![[P2]] = !DIExpression(DW_OP_bit_piece, 64, 64)
-; CHECK: ![[P3]] = !DIExpression(DW_OP_bit_piece, 192, 64)
-; CHECK: ![[P4]] = !DIExpression(DW_OP_bit_piece, 256, 64)
+; CHECK: ![[P1]] = !DIExpression(DW_OP_LLVM_fragment, 0, 64)
+; CHECK: ![[P2]] = !DIExpression(DW_OP_LLVM_fragment, 64, 64)
+; CHECK: ![[P3]] = !DIExpression(DW_OP_LLVM_fragment, 192, 64)
+; CHECK: ![[P4]] = !DIExpression(DW_OP_LLVM_fragment, 256, 64)
 ; 
 ; struct p {
 ;   __SIZE_TYPE__ s;
diff --git a/test/DebugInfo/X86/sroasplit-5.ll b/test/DebugInfo/X86/sroasplit-5.ll
index 71f72f2..dbd3b49 100644
--- a/test/DebugInfo/X86/sroasplit-5.ll
+++ b/test/DebugInfo/X86/sroasplit-5.ll
@@ -19,11 +19,11 @@
 ; When SROA is creating new smaller allocas, it may add padding.
 ;
 ; There should be no debug info for the padding.
-; CHECK-NOT: DW_OP_bit_piece, 56
-; CHECK: DIExpression(DW_OP_bit_piece, 32, 24)
-; CHECK-NOT: DW_OP_bit_piece, 56
-; CHECK: DIExpression(DW_OP_bit_piece, 0, 32)
-; CHECK-NOT: DW_OP_bit_piece, 56
+; CHECK-NOT: DW_OP_LLVM_fragment, 56
+; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
+; CHECK-NOT: DW_OP_LLVM_fragment, 56
+; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NOT: DW_OP_LLVM_fragment, 56
 %struct.prog_src_register = type { i32, i24 }
 
 ; Function Attrs: nounwind
diff --git a/test/DebugInfo/X86/stack-value-piece.ll b/test/DebugInfo/X86/stack-value-piece.ll
index 503ccb3..65c1cfb 100644
--- a/test/DebugInfo/X86/stack-value-piece.ll
+++ b/test/DebugInfo/X86/stack-value-piece.ll
@@ -97,8 +97,8 @@
 !19 = !DIExpression()
 !20 = !DILocation(line: 2, column: 9, scope: !7)
 !21 = !DILocation(line: 3, column: 5, scope: !7)
-!22 = !DIExpression(DW_OP_bit_piece, 0, 32)
-!23 = !DIExpression(DW_OP_bit_piece, 32, 32)
+!22 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+!23 = !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 !24 = !DILocation(line: 5, column: 1, scope: !7)
 !25 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 8, type: !26, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
 !26 = !DISubroutineType(types: !27)
diff --git a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
index b5a3eaf..ad3fb33 100644
--- a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
+++ b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s
+; RUN: opt < %s -asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=BOTH
 ; Support ASan instrumentation for constant-mask llvm.masked.{load,store}
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -13,57 +13,57 @@
 declare void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*>, <4 x i32*>*, i32, <4 x i1>) argmemonly nounwind
 
 define void @store.v4f32.1110(<4 x float> %arg) sanitize_address {
-; CHECK-LABEL: @store.v4f32.1110
+; BOTH-LABEL: @store.v4f32.1110
   %p = load <4 x float>*, <4 x float>** @v4f32, align 8
-; CHECK: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
-; CHECK: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
-; CHECK: call void @__asan_store4(i64 [[PGEP0]])
-; CHECK: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
-; CHECK: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
-; CHECK: call void @__asan_store4(i64 [[PGEP1]])
-; CHECK: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
-; CHECK: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
-; CHECK: call void @__asan_store4(i64 [[PGEP2]])
-; CHECK: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; STORE: call void @__asan_store4(i64 [[PGEP0]])
+; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
+; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; STORE: call void @__asan_store4(i64 [[PGEP1]])
+; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; STORE: call void @__asan_store4(i64 [[PGEP2]])
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
   tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
   ret void
 }
 
 define void @store.v8i32.10010110(<8 x i32> %arg) sanitize_address {
-; CHECK-LABEL: @store.v8i32.10010110
+; BOTH-LABEL: @store.v8i32.10010110
   %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
-; CHECK: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
-; CHECK: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
-; CHECK: call void @__asan_store4(i64 [[PGEP0]])
-; CHECK: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 3
-; CHECK: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP3]] to i64
-; CHECK: call void @__asan_store4(i64 [[PGEP3]])
-; CHECK: [[GEP5:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 5
-; CHECK: [[PGEP5:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP5]] to i64
-; CHECK: call void @__asan_store4(i64 [[PGEP5]])
-; CHECK: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 6
-; CHECK: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP6]] to i64
-; CHECK: call void @__asan_store4(i64 [[PGEP6]])
-; CHECK: tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
+; STORE: call void @__asan_store4(i64 [[PGEP0]])
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP3]] to i64
+; STORE: call void @__asan_store4(i64 [[PGEP3]])
+; STORE: [[GEP5:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 5
+; STORE: [[PGEP5:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP5]] to i64
+; STORE: call void @__asan_store4(i64 [[PGEP5]])
+; STORE: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 6
+; STORE: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP6]] to i64
+; STORE: call void @__asan_store4(i64 [[PGEP6]])
+; STORE: tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
   tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
   ret void
 }
 
 define void @store.v4i64.0001(<4 x i32*> %arg) sanitize_address {
-; CHECK-LABEL: @store.v4i64.0001
+; BOTH-LABEL: @store.v4i64.0001
   %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
-; CHECK: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
-; CHECK: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
-; CHECK: call void @__asan_store8(i64 [[PGEP3]])
-; CHECK: tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
+; STORE: call void @__asan_store8(i64 [[PGEP3]])
+; STORE: tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
   tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
   ret void
 }
 
 define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) sanitize_address {
-; CHECK-LABEL: @store.v4f32.variable
+; BOTH-LABEL: @store.v4f32.variable
   %p = load <4 x float>*, <4 x float>** @v4f32, align 8
-; CHECK-NOT: call void @__asan_store
+; BOTH-NOT: call void @__asan_store
   tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask)
   ret void
 }
@@ -74,54 +74,54 @@
 declare <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>*, i32, <4 x i1>, <4 x i32*>) argmemonly nounwind
 
 define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) sanitize_address {
-; CHECK-LABEL: @load.v8i32.11100001
+; BOTH-LABEL: @load.v8i32.11100001
   %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
-; CHECK: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
-; CHECK: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
-; CHECK: call void @__asan_load4(i64 [[PGEP0]])
-; CHECK: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 1
-; CHECK: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP1]] to i64
-; CHECK: call void @__asan_load4(i64 [[PGEP1]])
-; CHECK: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 2
-; CHECK: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP2]] to i64
-; CHECK: call void @__asan_load4(i64 [[PGEP2]])
-; CHECK: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 7
-; CHECK: [[PGEP7:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP7]] to i64
-; CHECK: call void @__asan_load4(i64 [[PGEP7]])
-; CHECK: tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
+; LOAD: call void @__asan_load4(i64 [[PGEP0]])
+; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 1
+; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP1]] to i64
+; LOAD: call void @__asan_load4(i64 [[PGEP1]])
+; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 2
+; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP2]] to i64
+; LOAD: call void @__asan_load4(i64 [[PGEP2]])
+; LOAD: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 7
+; LOAD: [[PGEP7:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP7]] to i64
+; LOAD: call void @__asan_load4(i64 [[PGEP7]])
+; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
   %res = tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
   ret <8 x i32> %res
 }
 
 define <4 x float> @load.v4f32.1001(<4 x float> %arg) sanitize_address {
-; CHECK-LABEL: @load.v4f32.1001
+; BOTH-LABEL: @load.v4f32.1001
   %p = load <4 x float>*, <4 x float>** @v4f32, align 8
-; CHECK: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
-; CHECK: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
-; CHECK: call void @__asan_load4(i64 [[PGEP0]])
-; CHECK: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
-; CHECK: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
-; CHECK: call void @__asan_load4(i64 [[PGEP3]])
-; CHECK: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; LOAD: call void @__asan_load4(i64 [[PGEP0]])
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; LOAD: call void @__asan_load4(i64 [[PGEP3]])
+; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
   %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
   ret <4 x float> %res
 }
 
 define <4 x i32*> @load.v4i64.0001(<4 x i32*> %arg) sanitize_address {
-; CHECK-LABEL: @load.v4i64.0001
+; BOTH-LABEL: @load.v4i64.0001
   %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
-; CHECK: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
-; CHECK: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
-; CHECK: call void @__asan_load8(i64 [[PGEP3]])
-; CHECK: tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
+; LOAD: call void @__asan_load8(i64 [[PGEP3]])
+; LOAD: tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
   %res = tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
   ret <4 x i32*> %res
 }
 
 define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) sanitize_address {
-; CHECK-LABEL: @load.v4f32.variable
+; BOTH-LABEL: @load.v4f32.variable
   %p = load <4 x float>*, <4 x float>** @v4f32, align 8
-; CHECK-NOT: call void @__asan_load
+; BOTH-NOT: call void @__asan_load
   %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg)
   ret <4 x float> %res
 }
diff --git a/test/MC/AMDGPU/exp-err.s b/test/MC/AMDGPU/exp-err.s
new file mode 100644
index 0000000..22d3edf
--- /dev/null
+++ b/test/MC/AMDGPU/exp-err.s
@@ -0,0 +1,107 @@
+// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+exp mrt8 v3, v2, v1, v0
+// GCN: :5: error: invalid exp target
+
+exp pos4 v3, v2, v1, v0
+// GCN: :5: error: invalid exp target
+
+exp param32 v3, v2, v1, v0
+// GCN: :5: error: invalid exp target
+
+exp invalid_target_10 v3, v2, v1, v0
+// GCN: :5: error: invalid exp target
+
+exp invalid_target_10 v3, v2, v1, v0 done
+// GCN: :5: error: invalid exp target
+
+exp invalid_target_11 v3, v2, v1, v0
+// GCN: :5: error: invalid exp target
+
+exp invalid_target_11 v3, v2, v1, v0 done
+// GCN: :5: error: invalid exp target
+
+exp mrt-1 v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp mrtX v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp pos-1 v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp posX v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp param-1 v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp paramX v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp invalid_target_-1 v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp invalid_target_X v3, v2, v1, v0
+// GCN: :5: error: failed parsing operand
+
+exp mrt0 s0, v0, v0, v0
+// GCN: 10: error: invalid operand for instruction
+
+exp mrt0 v0, s0, v0, v0
+// GCN: 14: error: invalid operand for instruction
+
+exp mrt0 v0, v0, s0, v0
+// GCN: 18: error: invalid operand for instruction
+
+exp mrt0 v0, v0, v0, s0
+// GCN: 22: error: invalid operand for instruction
+
+exp mrt0 v[0:1], v0, v0, v0
+// GCN: 10: error: invalid operand for instruction
+
+exp mrt0 v0, v[0:1], v0, v0
+// GCN: 14: error: invalid operand for instruction
+
+exp mrt0 v0, v0, v[0:1], v0
+// GCN: 18: error: invalid operand for instruction
+
+exp mrt0 v0, v0, v0, v[0:1]
+// GCN: 22: error: invalid operand for instruction
+
+exp mrt0 1.0, v0, v0, v0
+// GCN: 10: error: invalid operand for instruction
+
+exp mrt0 v0, 1.0, v0, v0
+// GCN: 14: error: invalid operand for instruction
+
+exp mrt0 v0, v0, 1.0, v0
+// GCN: 18: error: invalid operand for instruction
+
+exp mrt0 v0, v0, v0, 1.0
+// GCN: 22: error: invalid operand for instruction
+
+exp mrt0 7, v0, v0, v0
+// GCN: 10: error: invalid operand for instruction
+
+exp mrt0 v0, 7, v0, v0
+// GCN: 14: error: invalid operand for instruction
+
+exp mrt0 v0, v0, 7, v0
+// GCN: 18: error: invalid operand for instruction
+
+exp mrt0 v0, v0, v0, 7
+// GCN: 22: error: invalid operand for instruction
+
+exp mrt0 0x12345678, v0, v0, v0
+// GCN: 10: error: invalid operand for instruction
+
+exp mrt0 v0, 0x12345678, v0, v0
+// GCN: 14: error: invalid operand for instruction
+
+exp mrt0 v0, v0, 0x12345678, v0
+// GCN: 18: error: invalid operand for instruction
+
+exp mrt0 v0, v0, v0, 0x12345678
+// GCN: 22: error: invalid operand for instruction
diff --git a/test/MC/AMDGPU/exp.s b/test/MC/AMDGPU/exp.s
new file mode 100644
index 0000000..4dc3799
--- /dev/null
+++ b/test/MC/AMDGPU/exp.s
@@ -0,0 +1,86 @@
+// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+exp mrt0 off, off, off, off
+// GCN: exp mrt0 off, off, off, off ; encoding: [0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x00]
+
+exp mrt0 off, off, off, off done
+// GCN: exp mrt0 off, off, off, off done ; encoding: [0x00,0x08,0x00,0xf8,0x00,0x00,0x00,0x00]
+
+exp mrt0 v4, off, off, off done
+// GCN: exp mrt0 v4, off, off, off done ; encoding: [0x01,0x08,0x00,0xf8,0x04,0x00,0x00,0x00]
+
+exp mrt0 off, v3, off, off done
+// GCN: exp mrt0 off, v3, off, off done ; encoding: [0x02,0x08,0x00,0xf8,0x00,0x03,0x00,0x00]
+
+exp mrt0 off, off, v2, off done
+// GCN: exp mrt0 off, off, v2, off done ; encoding: [0x04,0x08,0x00,0xf8,0x00,0x00,0x02,0x00]
+
+exp mrt0 off, off, off, v1 done
+// GCN: exp mrt0 off, off, off, v1 done ; encoding: [0x08,0x08,0x00,0xf8,0x00,0x00,0x00,0x01]
+
+exp mrt0 v4, v3, off, off done
+// GCN: exp mrt0 v4, v3, off, off done ; encoding: [0x03,0x08,0x00,0xf8,0x04,0x03,0x00,0x00]
+
+exp mrt0 v4, off, v2, off done
+// GCN: exp mrt0 v4, off, v2, off done ; encoding: [0x05,0x08,0x00,0xf8,0x04,0x00,0x02,0x00]
+
+exp mrt0 v4, off, off, v1
+// GCN: exp mrt0 v4, off, off, v1 ; encoding: [0x09,0x00,0x00,0xf8,0x04,0x00,0x00,0x01]
+
+exp mrt0 v4, off, off, v1 done
+// GCN: exp mrt0 v4, off, off, v1 done ; encoding: [0x09,0x08,0x00,0xf8,0x04,0x00,0x00,0x01]
+
+exp mrt0 v4, v3, v2, v1
+// GCN: exp mrt0 v4, v3, v2, v1 ; encoding: [0x0f,0x00,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp mrt0 v4, v3, v2, v1 done
+// GCN: exp mrt0 v4, v3, v2, v1 done ; encoding: [0x0f,0x08,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp mrt7 v1, v1, v1, v1
+// GCN: exp mrt7 v1, v1, v1, v1 ; encoding: [0x7f,0x00,0x00,0xf8,0x01,0x01,0x01,0x01]
+
+exp mrt7 v1, v1, v1, v1 done
+// GCN: exp mrt7 v1, v1, v1, v1 done ; encoding: [0x7f,0x08,0x00,0xf8,0x01,0x01,0x01,0x01]
+
+exp mrtz v4, v3, v2, v1
+// GCN: exp mrtz v4, v3, v2, v1 ; encoding: [0x8f,0x00,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp mrtz v4, v3, v2, v1 done
+// GCN: exp mrtz v4, v3, v2, v1 done ; encoding: [0x8f,0x08,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp null v4, v3, v2, v1
+// GCN: exp null v4, v3, v2, v1 ; encoding: [0x9f,0x00,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp null v4, v3, v2, v1 done
+// GCN: exp null v4, v3, v2, v1 done ; encoding: [0x9f,0x08,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp pos0 v4, v3, v2, v1
+// GCN: exp pos0 v4, v3, v2, v1 ; encoding: [0xcf,0x00,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp pos0 v4, v3, v2, v1 done
+// GCN: exp pos0 v4, v3, v2, v1 done ; encoding: [0xcf,0x08,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp pos3 v4, v3, v2, v1
+// GCN: exp pos3 v4, v3, v2, v1 ; encoding: [0xff,0x00,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp pos3 v4, v3, v2, v1 done
+// GCN: exp pos3 v4, v3, v2, v1 done ; encoding: [0xff,0x08,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp param0 v4, v3, v2, v1
+// GCN: exp param0 v4, v3, v2, v1 ; encoding: [0x0f,0x02,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp param0 v4, v3, v2, v1 done
+// GCN: exp param0 v4, v3, v2, v1 done ; encoding: [0x0f,0x0a,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp param31 v4, v3, v2, v1
+// GCN: exp param31 v4, v3, v2, v1 ; encoding: [0xff,0x03,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp param31 v4, v3, v2, v1 done
+// GCN: exp param31 v4, v3, v2, v1 done ; encoding: [0xff,0x0b,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp mrt0 v4, v3, v2, v1 vm
+// GCN: exp mrt0 v4, v3, v2, v1 vm ; encoding: [0x0f,0x10,0x00,0xf8,0x04,0x03,0x02,0x01]
+
+exp mrt0 v4, v3, v2, v1 done vm
+// GCN: exp mrt0 v4, v3, v2, v1 done vm ; encoding: [0x0f,0x18,0x00,0xf8,0x04,0x03,0x02,0x01]
diff --git a/test/MC/ARM/thumb-diagnostics.s b/test/MC/ARM/thumb-diagnostics.s
index 65d8ed6..ab7c92c 100644
--- a/test/MC/ARM/thumb-diagnostics.s
+++ b/test/MC/ARM/thumb-diagnostics.s
@@ -11,7 +11,7 @@
 
 @ ADD instruction w/o 'S' suffix.
         add r1, r2, r3
-@ CHECK-ERRORS: error: invalid instruction
+@ CHECK-ERRORS: error: no flag-preserving variant of this instruction available
 @ CHECK-ERRORS:         add r1, r2, r3
 @ CHECK-ERRORS:         ^
 
diff --git a/test/MC/Disassembler/AMDGPU/missing_op.txt b/test/MC/Disassembler/AMDGPU/missing_op.txt
index 7c4e252..607d28f 100644
--- a/test/MC/Disassembler/AMDGPU/missing_op.txt
+++ b/test/MC/Disassembler/AMDGPU/missing_op.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -arch=amdgcn -mcpu=fiji -disassemble < %s | FileCheck %s -check-prefix=VI
 
 #TODO: this test will fail when we fix v_interp_p2_f32 signature, remove it then
-#VI: v_interp_p2_f32 16, [/*Missing OP1*/], /*Missing OP2*/, /*Missing OP3*/, /*Missing OP4*/
+#VI: v_interp_p2_f32 v7, [v7], 16, /*Missing OP3*/, /*Missing OP4*/
 0xd4 0x41 0x1d 0xd4
diff --git a/test/MC/Mips/cpsetup.s b/test/MC/Mips/cpsetup.s
index 149419f..af6caed 100644
--- a/test/MC/Mips/cpsetup.s
+++ b/test/MC/Mips/cpsetup.s
@@ -1,21 +1,21 @@
 # RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 -filetype=obj -o - %s | \
 # RUN:   llvm-objdump -d -r - | FileCheck -check-prefixes=ALL,O32 %s
 
-# RUN: llvm-mc -triple mips-unknown-unknown -target-abi o32 %s | \
+# RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 %s | \
 # RUN:   FileCheck -check-prefixes=ALL,ASM,ASM-O32 %s
 
 # RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \
 # RUN:   llvm-objdump -d -r - | \
 # RUN:   FileCheck -check-prefixes=ALL,NXX,N32 %s
 
-# RUN: llvm-mc -triple mips64-unknown-unknown -target-abi n32 %s | \
+# RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 %s | \
 # RUN:   FileCheck -check-prefixes=ALL,ASM,ASM-N32 %s
 
 # RUN: llvm-mc -triple mips64-unknown-linux %s -filetype=obj -o - | \
 # RUN:   llvm-objdump -d -r - | \
 # RUN:   FileCheck -check-prefixes=ALL,NXX,N64 %s
 
-# RUN: llvm-mc -triple mips64-unknown-unknown %s | \
+# RUN: llvm-mc -triple mips64-unknown-linux %s | \
 # RUN:   FileCheck -check-prefixes=ALL,ASM,ASM-N64 %s
 
         .text
diff --git a/test/MC/Mips/mips64-register-names-o32.s b/test/MC/Mips/mips64-register-names-o32.s
index f5df527..1735a03 100644
--- a/test/MC/Mips/mips64-register-names-o32.s
+++ b/test/MC/Mips/mips64-register-names-o32.s
@@ -2,40 +2,40 @@
 # RUN:     -target-abi o32 | FileCheck %s
 
 # Check that the register names are mapped to their correct numbers for o32
-# Second byte of daddiu with $zero at rt contains the number of the source
+# Second byte of addiu with $zero at rt contains the number of the source
 # register.
 
 .set noat
-daddiu	$zero, $zero, 0 # CHECK: encoding: [0x64,0x00,0x00,0x00]
-daddiu	$at, $zero, 0   # CHECK: encoding: [0x64,0x01,0x00,0x00]
-daddiu	$v0, $zero, 0   # CHECK: encoding: [0x64,0x02,0x00,0x00]
-daddiu	$v1, $zero, 0   # CHECK: encoding: [0x64,0x03,0x00,0x00]
-daddiu	$a0, $zero, 0   # CHECK: encoding: [0x64,0x04,0x00,0x00]
-daddiu	$a1, $zero, 0   # CHECK: encoding: [0x64,0x05,0x00,0x00]
-daddiu	$a2, $zero, 0   # CHECK: encoding: [0x64,0x06,0x00,0x00]
-daddiu	$a3, $zero, 0   # CHECK: encoding: [0x64,0x07,0x00,0x00]
-daddiu	$t0, $zero, 0   # CHECK: encoding: [0x64,0x08,0x00,0x00]
-daddiu	$t1, $zero, 0   # CHECK: encoding: [0x64,0x09,0x00,0x00]
-daddiu	$t2, $zero, 0   # CHECK: encoding: [0x64,0x0a,0x00,0x00]
-daddiu	$t3, $zero, 0   # CHECK: encoding: [0x64,0x0b,0x00,0x00]
-daddiu	$t4, $zero, 0   # CHECK: encoding: [0x64,0x0c,0x00,0x00]
-daddiu	$t5, $zero, 0   # CHECK: encoding: [0x64,0x0d,0x00,0x00]
-daddiu	$t6, $zero, 0   # CHECK: encoding: [0x64,0x0e,0x00,0x00]
-daddiu	$t7, $zero, 0   # CHECK: encoding: [0x64,0x0f,0x00,0x00]
-daddiu	$s0, $zero, 0   # CHECK: encoding: [0x64,0x10,0x00,0x00]
-daddiu	$s1, $zero, 0   # CHECK: encoding: [0x64,0x11,0x00,0x00]
-daddiu	$s2, $zero, 0   # CHECK: encoding: [0x64,0x12,0x00,0x00]
-daddiu	$s3, $zero, 0   # CHECK: encoding: [0x64,0x13,0x00,0x00]
-daddiu	$s4, $zero, 0   # CHECK: encoding: [0x64,0x14,0x00,0x00]
-daddiu	$s5, $zero, 0   # CHECK: encoding: [0x64,0x15,0x00,0x00]
-daddiu	$s6, $zero, 0   # CHECK: encoding: [0x64,0x16,0x00,0x00]
-daddiu	$s7, $zero, 0   # CHECK: encoding: [0x64,0x17,0x00,0x00]
-daddiu	$t8, $zero, 0   # CHECK: encoding: [0x64,0x18,0x00,0x00]
-daddiu	$t9, $zero, 0   # CHECK: encoding: [0x64,0x19,0x00,0x00]
-daddiu	$k0, $zero, 0   # CHECK: encoding: [0x64,0x1a,0x00,0x00]
-daddiu	$k1, $zero, 0   # CHECK: encoding: [0x64,0x1b,0x00,0x00]
-daddiu	$gp, $zero, 0   # CHECK: encoding: [0x64,0x1c,0x00,0x00]
-daddiu	$sp, $zero, 0   # CHECK: encoding: [0x64,0x1d,0x00,0x00]
-daddiu	$fp, $zero, 0   # CHECK: encoding: [0x64,0x1e,0x00,0x00]
-daddiu	$s8, $zero, 0   # CHECK: encoding: [0x64,0x1e,0x00,0x00]
-daddiu	$ra, $zero, 0   # CHECK: encoding: [0x64,0x1f,0x00,0x00]
+addiu	$zero, $zero, 0 # CHECK: encoding: [0x24,0x00,0x00,0x00]
+addiu	$at, $zero, 0   # CHECK: encoding: [0x24,0x01,0x00,0x00]
+addiu	$v0, $zero, 0   # CHECK: encoding: [0x24,0x02,0x00,0x00]
+addiu	$v1, $zero, 0   # CHECK: encoding: [0x24,0x03,0x00,0x00]
+addiu	$a0, $zero, 0   # CHECK: encoding: [0x24,0x04,0x00,0x00]
+addiu	$a1, $zero, 0   # CHECK: encoding: [0x24,0x05,0x00,0x00]
+addiu	$a2, $zero, 0   # CHECK: encoding: [0x24,0x06,0x00,0x00]
+addiu	$a3, $zero, 0   # CHECK: encoding: [0x24,0x07,0x00,0x00]
+addiu	$t0, $zero, 0   # CHECK: encoding: [0x24,0x08,0x00,0x00]
+addiu	$t1, $zero, 0   # CHECK: encoding: [0x24,0x09,0x00,0x00]
+addiu	$t2, $zero, 0   # CHECK: encoding: [0x24,0x0a,0x00,0x00]
+addiu	$t3, $zero, 0   # CHECK: encoding: [0x24,0x0b,0x00,0x00]
+addiu	$t4, $zero, 0   # CHECK: encoding: [0x24,0x0c,0x00,0x00]
+addiu	$t5, $zero, 0   # CHECK: encoding: [0x24,0x0d,0x00,0x00]
+addiu	$t6, $zero, 0   # CHECK: encoding: [0x24,0x0e,0x00,0x00]
+addiu	$t7, $zero, 0   # CHECK: encoding: [0x24,0x0f,0x00,0x00]
+addiu	$s0, $zero, 0   # CHECK: encoding: [0x24,0x10,0x00,0x00]
+addiu	$s1, $zero, 0   # CHECK: encoding: [0x24,0x11,0x00,0x00]
+addiu	$s2, $zero, 0   # CHECK: encoding: [0x24,0x12,0x00,0x00]
+addiu	$s3, $zero, 0   # CHECK: encoding: [0x24,0x13,0x00,0x00]
+addiu	$s4, $zero, 0   # CHECK: encoding: [0x24,0x14,0x00,0x00]
+addiu	$s5, $zero, 0   # CHECK: encoding: [0x24,0x15,0x00,0x00]
+addiu	$s6, $zero, 0   # CHECK: encoding: [0x24,0x16,0x00,0x00]
+addiu	$s7, $zero, 0   # CHECK: encoding: [0x24,0x17,0x00,0x00]
+addiu	$t8, $zero, 0   # CHECK: encoding: [0x24,0x18,0x00,0x00]
+addiu	$t9, $zero, 0   # CHECK: encoding: [0x24,0x19,0x00,0x00]
+addiu	$k0, $zero, 0   # CHECK: encoding: [0x24,0x1a,0x00,0x00]
+addiu	$k1, $zero, 0   # CHECK: encoding: [0x24,0x1b,0x00,0x00]
+addiu	$gp, $zero, 0   # CHECK: encoding: [0x24,0x1c,0x00,0x00]
+addiu	$sp, $zero, 0   # CHECK: encoding: [0x24,0x1d,0x00,0x00]
+addiu	$fp, $zero, 0   # CHECK: encoding: [0x24,0x1e,0x00,0x00]
+addiu	$s8, $zero, 0   # CHECK: encoding: [0x24,0x1e,0x00,0x00]
+addiu	$ra, $zero, 0   # CHECK: encoding: [0x24,0x1f,0x00,0x00]
diff --git a/test/Object/Inputs/openbsd-phdrs.elf-x86-64 b/test/Object/Inputs/openbsd-phdrs.elf-x86-64
index 757d490..dab75bf 100644
--- a/test/Object/Inputs/openbsd-phdrs.elf-x86-64
+++ b/test/Object/Inputs/openbsd-phdrs.elf-x86-64
Binary files differ
diff --git a/test/Object/archive-thin-create.test b/test/Object/archive-thin-create.test
index 3656597..55abe00 100644
--- a/test/Object/archive-thin-create.test
+++ b/test/Object/archive-thin-create.test
@@ -1,3 +1,4 @@
+RUN: rm -rf %t
 RUN: mkdir -p %t
 RUN: cd %t
 RUN: mkdir -p foo
diff --git a/test/ObjectYAML/MachO/DWARF-debug_abbrev.yaml b/test/ObjectYAML/MachO/DWARF-debug_abbrev.yaml
new file mode 100644
index 0000000..0736514
--- /dev/null
+++ b/test/ObjectYAML/MachO/DWARF-debug_abbrev.yaml
@@ -0,0 +1,433 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !mach-o
+FileHeader:      
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x0000000A
+  ncmds:           5
+  sizeofcmds:      1800
+  flags:           0x00000000
+  reserved:        0x00000000
+LoadCommands:    
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         472
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        5
+    nsects:          5
+    flags:           0
+    Sections:        
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x0000000100000F50
+        size:            52
+        offset:          0x00000000
+        align:           4
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __stubs
+        segname:         __TEXT
+        addr:            0x0000000100000F84
+        size:            6
+        offset:          0x00000000
+        align:           1
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000408
+        reserved1:       0x00000000
+        reserved2:       0x00000006
+        reserved3:       0x00000000
+      - sectname:        __stub_helper
+        segname:         __TEXT
+        addr:            0x0000000100000F8C
+        size:            26
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x0000000100000FA6
+        size:            14
+        offset:          0x00000000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000002
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x0000000100000FB4
+        size:            72
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        3
+    nsects:          2
+    flags:           0
+    Sections:        
+      - sectname:        __nl_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001000
+        size:            16
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000006
+        reserved1:       0x00000001
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __la_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001010
+        size:            8
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000007
+        reserved1:       0x00000003
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294975488
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        60
+    maxprot:         7
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         952
+    segname:         __DWARF
+    vmaddr:          4294979584
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        764
+    maxprot:         7
+    initprot:        3
+    nsects:          11
+    flags:           0
+    Sections:        
+      - sectname:        __debug_line
+        segname:         __DWARF
+        addr:            0x0000000100003000
+        size:            69
+        offset:          0x00002000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubnames
+        segname:         __DWARF
+        addr:            0x0000000100003045
+        size:            27
+        offset:          0x00002045
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubtypes
+        segname:         __DWARF
+        addr:            0x0000000100003060
+        size:            35
+        offset:          0x00002060
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_aranges
+        segname:         __DWARF
+        addr:            0x0000000100003083
+        size:            48
+        offset:          0x00002083
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_info
+        segname:         __DWARF
+        addr:            0x00000001000030B3
+        size:            121
+        offset:          0x000020B3
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_abbrev
+        segname:         __DWARF
+        addr:            0x000000010000312C
+        size:            76
+        offset:          0x0000212C
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_str
+        segname:         __DWARF
+        addr:            0x0000000100003178
+        size:            142
+        offset:          0x00002178
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_names
+        segname:         __DWARF
+        addr:            0x0000000100003206
+        size:            60
+        offset:          0x00002206
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_namespac
+        segname:         __DWARF
+        addr:            0x0000000100003242
+        size:            36
+        offset:          0x00002242
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_types
+        segname:         __DWARF
+        addr:            0x0000000100003266
+        size:            114
+        offset:          0x00002266
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_objc
+        segname:         __DWARF
+        addr:            0x00000001000032D8
+        size:            36
+        offset:          0x000022D8
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+DWARF:           
+  debug_abbrev:     
+    - Code:            0x00000001
+      Tag:             DW_TAG_compile_unit
+      Children:        DW_CHILDREN_yes
+      Attributes:      
+        - Attribute:       DW_AT_producer
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_language
+          Form:            DW_FORM_data2
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_stmt_list
+          Form:            DW_FORM_sec_offset
+        - Attribute:       DW_AT_comp_dir
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_low_pc
+          Form:            DW_FORM_addr
+        - Attribute:       DW_AT_high_pc
+          Form:            DW_FORM_data4
+    - Code:            0x00000002
+      Tag:             DW_TAG_subprogram
+      Children:        DW_CHILDREN_yes
+      Attributes:      
+        - Attribute:       DW_AT_low_pc
+          Form:            DW_FORM_addr
+        - Attribute:       DW_AT_high_pc
+          Form:            DW_FORM_data4
+        - Attribute:       DW_AT_frame_base
+          Form:            DW_FORM_exprloc
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_decl_file
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_decl_line
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_prototyped
+          Form:            DW_FORM_flag_present
+        - Attribute:       DW_AT_type
+          Form:            DW_FORM_ref4
+        - Attribute:       DW_AT_external
+          Form:            DW_FORM_flag_present
+    - Code:            0x00000003
+      Tag:             DW_TAG_formal_parameter
+      Children:        DW_CHILDREN_no
+      Attributes:      
+        - Attribute:       DW_AT_location
+          Form:            DW_FORM_exprloc
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_decl_file
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_decl_line
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_type
+          Form:            DW_FORM_ref4
+    - Code:            0x00000004
+      Tag:             DW_TAG_base_type
+      Children:        DW_CHILDREN_no
+      Attributes:      
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_encoding
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_byte_size
+          Form:            DW_FORM_data1
+    - Code:            0x00000005
+      Tag:             DW_TAG_pointer_type
+      Children:        DW_CHILDREN_no
+      Attributes:      
+        - Attribute:       DW_AT_type
+          Form:            DW_FORM_ref4
+...
+
+#CHECK: DWARF:           
+#CHECK:   debug_abbrev:     
+#CHECK:     - Code:            0x00000001
+#CHECK:       Tag:             DW_TAG_compile_unit
+#CHECK:       Children:        DW_CHILDREN_yes
+#CHECK:       Attributes:      
+#CHECK:         - Attribute:       DW_AT_producer
+#CHECK:           Form:            DW_FORM_strp
+#CHECK:         - Attribute:       DW_AT_language
+#CHECK:           Form:            DW_FORM_data2
+#CHECK:         - Attribute:       DW_AT_name
+#CHECK:           Form:            DW_FORM_strp
+#CHECK:         - Attribute:       DW_AT_stmt_list
+#CHECK:           Form:            DW_FORM_sec_offset
+#CHECK:         - Attribute:       DW_AT_comp_dir
+#CHECK:           Form:            DW_FORM_strp
+#CHECK:         - Attribute:       DW_AT_low_pc
+#CHECK:           Form:            DW_FORM_addr
+#CHECK:         - Attribute:       DW_AT_high_pc
+#CHECK:           Form:            DW_FORM_data4
+#CHECK:     - Code:            0x00000002
+#CHECK:       Tag:             DW_TAG_subprogram
+#CHECK:       Children:        DW_CHILDREN_yes
+#CHECK:       Attributes:      
+#CHECK:         - Attribute:       DW_AT_low_pc
+#CHECK:           Form:            DW_FORM_addr
+#CHECK:         - Attribute:       DW_AT_high_pc
+#CHECK:           Form:            DW_FORM_data4
+#CHECK:         - Attribute:       DW_AT_frame_base
+#CHECK:           Form:            DW_FORM_exprloc
+#CHECK:         - Attribute:       DW_AT_name
+#CHECK:           Form:            DW_FORM_strp
+#CHECK:         - Attribute:       DW_AT_decl_file
+#CHECK:           Form:            DW_FORM_data1
+#CHECK:         - Attribute:       DW_AT_decl_line
+#CHECK:           Form:            DW_FORM_data1
+#CHECK:         - Attribute:       DW_AT_prototyped
+#CHECK:           Form:            DW_FORM_flag_present
+#CHECK:         - Attribute:       DW_AT_type
+#CHECK:           Form:            DW_FORM_ref4
+#CHECK:         - Attribute:       DW_AT_external
+#CHECK:           Form:            DW_FORM_flag_present
+#CHECK:     - Code:            0x00000003
+#CHECK:       Tag:             DW_TAG_formal_parameter
+#CHECK:       Children:        DW_CHILDREN_no
+#CHECK:       Attributes:      
+#CHECK:         - Attribute:       DW_AT_location
+#CHECK:           Form:            DW_FORM_exprloc
+#CHECK:         - Attribute:       DW_AT_name
+#CHECK:           Form:            DW_FORM_strp
+#CHECK:         - Attribute:       DW_AT_decl_file
+#CHECK:           Form:            DW_FORM_data1
+#CHECK:         - Attribute:       DW_AT_decl_line
+#CHECK:           Form:            DW_FORM_data1
+#CHECK:         - Attribute:       DW_AT_type
+#CHECK:           Form:            DW_FORM_ref4
+#CHECK:     - Code:            0x00000004
+#CHECK:       Tag:             DW_TAG_base_type
+#CHECK:       Children:        DW_CHILDREN_no
+#CHECK:       Attributes:      
+#CHECK:         - Attribute:       DW_AT_name
+#CHECK:           Form:            DW_FORM_strp
+#CHECK:         - Attribute:       DW_AT_encoding
+#CHECK:           Form:            DW_FORM_data1
+#CHECK:         - Attribute:       DW_AT_byte_size
+#CHECK:           Form:            DW_FORM_data1
+#CHECK:     - Code:            0x00000005
+#CHECK:       Tag:             DW_TAG_pointer_type
+#CHECK:       Children:        DW_CHILDREN_no
+#CHECK:       Attributes:      
+#CHECK:         - Attribute:       DW_AT_type
+#CHECK:           Form:            DW_FORM_ref4
diff --git a/test/ObjectYAML/MachO/DWARF-debug_aranges.yaml b/test/ObjectYAML/MachO/DWARF-debug_aranges.yaml
new file mode 100644
index 0000000..2822c94
--- /dev/null
+++ b/test/ObjectYAML/MachO/DWARF-debug_aranges.yaml
@@ -0,0 +1,335 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !mach-o
+FileHeader:      
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x0000000A
+  ncmds:           7
+  sizeofcmds:      1848
+  flags:           0x00000000
+  reserved:        0x00000000
+LoadCommands:    
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            B4D48511-37F4-3ED4-AFA7-1683DCE69AC4
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           2
+    stroff:          4128
+    strsize:         28
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         472
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        5
+    nsects:          5
+    flags:           0
+    Sections:        
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x0000000100000F50
+        size:            52
+        offset:          0x00000000
+        align:           4
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __stubs
+        segname:         __TEXT
+        addr:            0x0000000100000F84
+        size:            6
+        offset:          0x00000000
+        align:           1
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000408
+        reserved1:       0x00000000
+        reserved2:       0x00000006
+        reserved3:       0x00000000
+      - sectname:        __stub_helper
+        segname:         __TEXT
+        addr:            0x0000000100000F8C
+        size:            26
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x0000000100000FA6
+        size:            14
+        offset:          0x00000000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000002
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x0000000100000FB4
+        size:            72
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        3
+    nsects:          2
+    flags:           0
+    Sections:        
+      - sectname:        __nl_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001000
+        size:            16
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000006
+        reserved1:       0x00000001
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __la_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001010
+        size:            8
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000007
+        reserved1:       0x00000003
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294975488
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        60
+    maxprot:         7
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         952
+    segname:         __DWARF
+    vmaddr:          4294979584
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        764
+    maxprot:         7
+    initprot:        3
+    nsects:          11
+    flags:           0
+    Sections:        
+      - sectname:        __debug_line
+        segname:         __DWARF
+        addr:            0x0000000100003000
+        size:            69
+        offset:          0x00002000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubnames
+        segname:         __DWARF
+        addr:            0x0000000100003045
+        size:            27
+        offset:          0x00002045
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubtypes
+        segname:         __DWARF
+        addr:            0x0000000100003060
+        size:            35
+        offset:          0x00002060
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_aranges
+        segname:         __DWARF
+        addr:            0x0000000100003083
+        size:            48
+        offset:          0x00002083
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_info
+        segname:         __DWARF
+        addr:            0x00000001000030B3
+        size:            121
+        offset:          0x000020B3
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_abbrev
+        segname:         __DWARF
+        addr:            0x000000010000312C
+        size:            76
+        offset:          0x0000212C
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_str
+        segname:         __DWARF
+        addr:            0x0000000100003178
+        size:            142
+        offset:          0x00002178
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_names
+        segname:         __DWARF
+        addr:            0x0000000100003206
+        size:            60
+        offset:          0x00002206
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_namespac
+        segname:         __DWARF
+        addr:            0x0000000100003242
+        size:            36
+        offset:          0x00002242
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_types
+        segname:         __DWARF
+        addr:            0x0000000100003266
+        size:            114
+        offset:          0x00002266
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_objc
+        segname:         __DWARF
+        addr:            0x00000001000032D8
+        size:            36
+        offset:          0x000022D8
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+LinkEditData:    
+  NameList:        
+    - n_strx:          2
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+    - n_strx:          22
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971216
+  StringTable:     
+    - ''
+    - ''
+    - __mh_execute_header
+    - _main
+DWARF:           
+  debug_aranges:   
+    - Length:          44
+      Version:         2
+      CuOffset:        0
+      AddrSize:        8
+      SegSize:         0
+      Descriptors:     
+        - Address:         0x0000000100000F50
+          Length:          52
+...
+
+#CHECK: DWARF:           
+#CHECK:   debug_aranges:   
+#CHECK:     - Length:          44
+#CHECK:       Version:         2
+#CHECK:       CuOffset:        0
+#CHECK:       AddrSize:        8
+#CHECK:       SegSize:         0
+#CHECK:       Descriptors:     
+#CHECK:         - Address:         0x0000000100000F50
+#CHECK:           Length:          52
diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
new file mode 100644
index 0000000..417a755
--- /dev/null
+++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
@@ -0,0 +1,266 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !mach-o
+FileHeader:      
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x0000000A
+  ncmds:           6
+  sizeofcmds:      1376
+  flags:           0x00000000
+  reserved:        0x00000000
+LoadCommands:    
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            9304404B-E522-3BBA-A861-AF5938908725
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           2
+    stroff:          4128
+    strsize:         28
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        5
+    nsects:          2
+    flags:           0
+    Sections:        
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x0000000100000FA0
+        size:            22
+        offset:          0x00000000
+        align:           4
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x0000000100000FB8
+        size:            72
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        60
+    maxprot:         7
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         952
+    segname:         __DWARF
+    vmaddr:          4294975488
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        765
+    maxprot:         7
+    initprot:        3
+    nsects:          11
+    flags:           0
+    Sections:        
+      - sectname:        __debug_line
+        segname:         __DWARF
+        addr:            0x0000000100002000
+        size:            70
+        offset:          0x00002000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubnames
+        segname:         __DWARF
+        addr:            0x0000000100002046
+        size:            27
+        offset:          0x00002046
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubtypes
+        segname:         __DWARF
+        addr:            0x0000000100002061
+        size:            35
+        offset:          0x00002061
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_aranges
+        segname:         __DWARF
+        addr:            0x0000000100002084
+        size:            48
+        offset:          0x00002084
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_info
+        segname:         __DWARF
+        addr:            0x00000001000020B4
+        size:            121
+        offset:          0x000020B4
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_abbrev
+        segname:         __DWARF
+        addr:            0x000000010000212D
+        size:            76
+        offset:          0x0000212D
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_str
+        segname:         __DWARF
+        addr:            0x0000000100002179
+        size:            142
+        offset:          0x00002179
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_names
+        segname:         __DWARF
+        addr:            0x0000000100002207
+        size:            60
+        offset:          0x00002207
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_namespac
+        segname:         __DWARF
+        addr:            0x0000000100002243
+        size:            36
+        offset:          0x00002243
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_types
+        segname:         __DWARF
+        addr:            0x0000000100002267
+        size:            114
+        offset:          0x00002267
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_objc
+        segname:         __DWARF
+        addr:            0x00000001000022D9
+        size:            36
+        offset:          0x000022D9
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+LinkEditData:    
+  NameList:        
+    - n_strx:          2
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+    - n_strx:          22
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971296
+  StringTable:     
+    - ''
+    - ''
+    - __mh_execute_header
+    - _main
+DWARF:           
+  debug_str:    
+    - ''
+    - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)'
+    - hello_world.c
+    - /Users/cbieneman/dev/open-source/llvm-build-rel
+    - main
+    - argc
+    - argv
+    - int
+    - char
+...
+
+#CHECK: DWARF:           
+#CHECK:   debug_str:    
+#CHECK:     - ''
+#CHECK:     - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)'
+#CHECK:     - hello_world.c
+#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - main
+#CHECK:     - argc
+#CHECK:     - argv
+#CHECK:     - int
+#CHECK:     - char
+
diff --git a/test/ThinLTO/X86/cache-config.ll b/test/ThinLTO/X86/cache-config.ll
new file mode 100644
index 0000000..a947969
--- /dev/null
+++ b/test/ThinLTO/X86/cache-config.ll
@@ -0,0 +1,27 @@
+; RUN: rm -rf %t.cache && mkdir %t.cache
+; RUN: opt -module-hash -module-summary %s -o %t.bc
+
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -mcpu=yonah
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -relax-elf-relocations
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -function-sections
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -data-sections
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -debugger-tune=sce
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -mattr=+sse2
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -relocation-model=static
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -code-model=large
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -cg-opt-level=0
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -O1
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -opt-pipeline=loweratomic
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -aa-pipeline=basic-aa
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -override-triple=x86_64-unknown-linux-gnu
+; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -default-triple=x86_64-unknown-linux-gnu
+; RUN: ls %t.cache | count 15
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @globalfunc() {
+entry:
+  ret void
+}
diff --git a/test/Transforms/BDCE/dbg-multipleuses.ll b/test/Transforms/BDCE/dbg-multipleuses.ll
new file mode 100644
index 0000000..9213b54
--- /dev/null
+++ b/test/Transforms/BDCE/dbg-multipleuses.ll
@@ -0,0 +1,47 @@
+; Test that BDCE doesn't destroy llvm.dbg.value's argument.
+; RUN: opt -bdce %s -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: define void @f()
+; CHECK-NEXT: entry:
+; CHECK-NEXT: tail call void (...) @h()
+; CHECK-NEXT: %[[CALL:.*]] = tail call i32 (...) @g()
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 %[[CALL:.*]]
+
+define void @f() !dbg !6 {
+entry:
+  tail call void (...) @h(), !dbg !9
+  %call = tail call i32 (...) @g(), !dbg !10
+  tail call void @llvm.dbg.value(metadata i32 %call, i64 0, metadata !11, metadata !13), !dbg !14
+  %patatino = xor i32 %call, %call
+  tail call void (...) @h(), !dbg !15
+  ret void, !dbg !16
+}
+
+declare void @h(...)
+declare i32 @g(...)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 288665) (llvm/trunk 288725)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "patatino.c", directory: "/home/davide/work/llvm/build-clang/bin")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 4.0.0 (trunk 288665) (llvm/trunk 288725)"}
+!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !7, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null}
+!9 = !DILocation(line: 4, column: 3, scope: !6)
+!10 = !DILocation(line: 5, column: 11, scope: !6)
+!11 = !DILocalVariable(name: "a", scope: !6, file: !1, line: 5, type: !12)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DIExpression()
+!14 = !DILocation(line: 5, column: 7, scope: !6)
+!15 = !DILocation(line: 6, column: 3, scope: !6)
+!16 = !DILocation(line: 7, column: 1, scope: !6)
diff --git a/test/Transforms/BDCE/pr26587.ll b/test/Transforms/BDCE/pr26587.ll
new file mode 100644
index 0000000..17837ec
--- /dev/null
+++ b/test/Transforms/BDCE/pr26587.ll
@@ -0,0 +1,46 @@
+; Test that BDCE doesn't destroy llvm.dbg.value's argument.
+; RUN: opt -bdce %s -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: define void @f()
+; CHECK-NEXT: entry:
+; CHECK-NEXT: tail call void (...) @h()
+; CHECK-NEXT: %[[CALL:.*]] = tail call i32 (...) @g()
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 %[[CALL:.*]]
+
+define void @f() !dbg !6 {
+entry:
+  tail call void (...) @h(), !dbg !9
+  %call = tail call i32 (...) @g(), !dbg !10
+  tail call void @llvm.dbg.value(metadata i32 %call, i64 0, metadata !11, metadata !13), !dbg !14
+  tail call void (...) @h(), !dbg !15
+  ret void, !dbg !16
+}
+
+declare void @h(...)
+declare i32 @g(...)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 288665) (llvm/trunk 288725)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "patatino.c", directory: "/home/davide/work/llvm/build-clang/bin")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 4.0.0 (trunk 288665) (llvm/trunk 288725)"}
+!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !7, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null}
+!9 = !DILocation(line: 4, column: 3, scope: !6)
+!10 = !DILocation(line: 5, column: 11, scope: !6)
+!11 = !DILocalVariable(name: "a", scope: !6, file: !1, line: 5, type: !12)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DIExpression()
+!14 = !DILocation(line: 5, column: 7, scope: !6)
+!15 = !DILocation(line: 6, column: 3, scope: !6)
+!16 = !DILocation(line: 7, column: 1, scope: !6)
diff --git a/test/Transforms/FunctionAttrs/nonnull-global.ll b/test/Transforms/FunctionAttrs/nonnull-global.ll
new file mode 100644
index 0000000..43353e8
--- /dev/null
+++ b/test/Transforms/FunctionAttrs/nonnull-global.ll
@@ -0,0 +1,10 @@
+; RUN: opt -S -functionattrs %s | FileCheck %s
+
+@a = external global i8, !absolute_symbol !0
+
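+; The !absolute_symbol range on @a is [0, 256), which includes zero, so the
+; address of @a may be null and FunctionAttrs must not infer a nonnull return
+; for @foo.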
+; CHECK-NOT: define nonnull
+define i8* @foo() {
+  ret i8* @a
+}
+
+!0 = !{i64 0, i64 256}
diff --git a/test/Transforms/GVN/dbg-redundant-load.ll b/test/Transforms/GVN/dbg-redundant-load.ll
new file mode 100644
index 0000000..8e5a48b
--- /dev/null
+++ b/test/Transforms/GVN/dbg-redundant-load.ll
@@ -0,0 +1,52 @@
+; RUN: opt -gvn -S < %s | FileCheck %s
+
+; Check that the redundant load from %if.then is removed.
+; Also, check that the debug location associated to load %0 still refers to
+; line 3 and not line 6.
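+; In other words, the surviving load in %entry must keep its own debug
+; location rather than picking up the location of the removed redundant load.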
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: @test_redundant_load(
+; CHECK-LABEL: entry:
+; CHECK-NEXT: load i32, i32* %Y, align 4, !dbg ![[LOC:[0-9]+]]
+; CHECK-LABEL: if.then:
+; CHECK-NOT: load
+; CHECK-LABEL: if.end:
+; CHECK: ![[LOC]] = !DILocation(line: 3, scope: !{{.*}})
+
+define i32 @test_redundant_load(i32 %X, i32* %Y) !dbg !6 {
+entry:
+  %0 = load i32, i32* %Y, align 4, !dbg !8
+  %cmp = icmp sgt i32 %X, -1, !dbg !9
+  br i1 %cmp, label %if.then, label %if.end, !dbg !9
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %Y, align 4, !dbg !10
+  %add = add nsw i32 %0, %1, !dbg !10
+  call void @foo(), !dbg !11
+  br label %if.end, !dbg !12
+
+if.end:                                           ; preds = %if.then, %entry
+  %Result.0 = phi i32 [ %add, %if.then ], [ %0, %entry ]
+  ret i32 %Result.0, !dbg !13
+}
+
+declare void @foo()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "test.cpp", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = distinct !DISubprogram(name: "test_redundant_load", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 3, scope: !6)
+!9 = !DILocation(line: 5, scope: !6)
+!10 = !DILocation(line: 6, scope: !6)
+!11 = !DILocation(line: 7, scope: !6)
+!12 = !DILocation(line: 8, scope: !6)
+!13 = !DILocation(line: 10, scope: !6)
diff --git a/test/Transforms/GVNHoist/hoist.ll b/test/Transforms/GVNHoist/hoist.ll
index 8e18941..d4b4afd 100644
--- a/test/Transforms/GVNHoist/hoist.ll
+++ b/test/Transforms/GVNHoist/hoist.ll
@@ -711,3 +711,36 @@
 ; CHECK: %[[load:.*]] = load i32, i32* %y, align 1
 ; CHECK: %[[phi:.*]] = phi i32 [ %[[load]], %{{.*}} ], [ %[[load]], %{{.*}} ]
 ; CHECK: i32 %[[phi]]
+
+
+declare i8 @pr30991_f() nounwind readonly
+declare void @pr30991_f1(i8)
+define i8 @pr30991(i8* %sp, i8* %word, i1 %b1, i1 %b2) {
+entry:
+  br i1 %b1, label %a, label %b
+
+a:
+  %r0 = load i8, i8* %word, align 1
+  %incdec.ptr = getelementptr i8, i8* %sp, i32 1
+  %rr0 = call i8 @pr30991_f() nounwind readonly
+  call void @pr30991_f1(i8 %r0)
+  ret i8 %rr0
+
+b:
+  br i1 %b2, label %c, label %x
+
+c:
+  %r1 = load i8, i8* %word, align 1
+  %incdec.ptr115 = getelementptr i8, i8* %sp, i32 1
+  %rr1 = call i8 @pr30991_f() nounwind readonly
+  call void @pr30991_f1(i8 %r1)
+  ret i8 %rr1
+
+x:
+  %r2 = load i8, i8* %word, align 1
+  ret i8 %r2
+}
+
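+; Per the CHECK lines below, after GVNHoist a load of %word is expected in the
+; entry block immediately before the first branch, so the identical loads in
+; %a, %c and %x can be satisfied from it.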
+; CHECK-LABEL: define i8 @pr30991
+; CHECK:  %r0 = load i8, i8* %word, align 1
+; CHECK-NEXT:  br i1 %b1, label %a, label %b
diff --git a/test/Transforms/Inline/alloca-dbgdeclare.ll b/test/Transforms/Inline/alloca-dbgdeclare.ll
index 44ca11a..78b888f 100644
--- a/test/Transforms/Inline/alloca-dbgdeclare.ll
+++ b/test/Transforms/Inline/alloca-dbgdeclare.ll
@@ -129,10 +129,10 @@
 !45 = !DILocation(line: 9, scope: !15)
 !46 = !DILocalVariable(name: "p1", line: 6, arg: 1, scope: !15, file: !16, type: !4)
 !47 = distinct !DILocation(line: 11, scope: !21)
-!48 = !DIExpression(DW_OP_bit_piece, 32, 160)
+!48 = !DIExpression(DW_OP_LLVM_fragment, 32, 160)
 !49 = !DILocation(line: 6, scope: !15, inlinedAt: !47)
 !50 = !DILocation(line: 11, scope: !21)
-!51 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!51 = !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 !52 = !DILocation(line: 7, scope: !34, inlinedAt: !47)
 !53 = !DILocation(line: 7, scope: !15, inlinedAt: !47)
 !54 = !DILocation(line: 8, scope: !34, inlinedAt: !47)
diff --git a/test/Transforms/InstCombine/bitcast.ll b/test/Transforms/InstCombine/bitcast.ll
index b6b1ffd..08f4966 100644
--- a/test/Transforms/InstCombine/bitcast.ll
+++ b/test/Transforms/InstCombine/bitcast.ll
@@ -449,3 +449,67 @@
   %res = bitcast <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true> to i8
   ret i8 %res
 }
+
+@g = internal unnamed_addr global i32 undef
+
+; CHECK-LABEL: @constant_fold_vector_to_double(
+; CHECK: store volatile double 1.000000e+00,
+; CHECK: store volatile double 1.000000e+00,
+; CHECK: store volatile double 1.000000e+00,
+; CHECK: store volatile double 1.000000e+00,
+
+; CHECK: store volatile double 0xFFFFFFFFFFFFFFFF,
+; CHECK: store volatile double 0x162E000004D2,
+
+; CHECK: store volatile double bitcast (<2 x i32> <i32 1234, i32 ptrtoint (i32* @g to i32)> to double),
+; CHECK: store volatile double 0x400000003F800000,
+
+; CHECK: store volatile double 0.000000e+00,
+; CHECK: store volatile double 0.000000e+00,
+; CHECK: store volatile double 0.000000e+00,
+; CHECK: store volatile double 0.000000e+00,
+; CHECK: store volatile double 0.000000e+00,
+; CHECK: store volatile double 0.000000e+00,
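+; A worked example of the packing behind the constants checked above (element
+; 0 in the low bits): <i32 1234, i32 5678> has 1234 = 0x4D2 in bits 0-31 and
+; 5678 = 0x162E in bits 32-63, i.e. the i64 pattern 0x0000162E000004D2, which
+; matches the 0x162E000004D2 double above. Likewise <float 1.0, float 2.0>
+; packs 0x3F800000 (1.0f) low and 0x40000000 (2.0f) high: 0x400000003F800000.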
+define void @constant_fold_vector_to_double() {
+  store volatile double bitcast (<1 x i64> <i64 4607182418800017408> to double), double* undef
+  store volatile double bitcast (<2 x i32> <i32 0, i32 1072693248> to double), double* undef
+  store volatile double bitcast (<4 x i16> <i16 0, i16 0, i16 0, i16 16368> to double), double* undef
+  store volatile double bitcast (<8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 240, i8 63> to double), double* undef
+
+  store volatile double bitcast (<2 x i32> <i32 -1, i32 -1> to double), double* undef
+  store volatile double bitcast (<2 x i32> <i32 1234, i32 5678> to double), double* undef
+
+  store volatile double bitcast (<2 x i32> <i32 1234, i32 ptrtoint (i32* @g to i32)> to double), double* undef
+  store volatile double bitcast (<2 x float> <float 1.0, float 2.0> to double), double* undef
+
+  store volatile double bitcast (<2 x i32> zeroinitializer to double), double* undef
+  store volatile double bitcast (<4 x i16> zeroinitializer to double), double* undef
+  store volatile double bitcast (<8 x i8> zeroinitializer to double), double* undef
+  store volatile double bitcast (<16 x i4> zeroinitializer to double), double* undef
+  store volatile double bitcast (<32 x i2> zeroinitializer to double), double* undef
+  store volatile double bitcast (<64 x i1> zeroinitializer to double), double* undef
+  ret void
+}
+
+; CHECK-LABEL: @constant_fold_vector_to_float(
+; CHECK: store volatile float 1.000000e+00,
+; CHECK: store volatile float 1.000000e+00,
+; CHECK: store volatile float 1.000000e+00,
+; CHECK: store volatile float 1.000000e+00,
+define void @constant_fold_vector_to_float() {
+  store volatile float bitcast (<1 x i32> <i32 1065353216> to float), float* undef
+  store volatile float bitcast (<2 x i16> <i16 0, i16 16256> to float), float* undef
+  store volatile float bitcast (<4 x i8> <i8 0, i8 0, i8 128, i8 63> to float), float* undef
+  store volatile float bitcast (<32 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0> to float), float* undef
+
+  ret void
+}
+
+; CHECK-LABEL: @constant_fold_vector_to_half(
+; CHECK: store volatile half 0xH4000,
+; CHECK: store volatile half 0xH4000,
+define void @constant_fold_vector_to_half() {
+  store volatile half bitcast (<2 x i8> <i8 0, i8 64> to half), half* undef
+  store volatile half bitcast (<4 x i4> <i4 0, i4 0, i4 0, i4 4> to half), half* undef
+  ret void
+}
diff --git a/test/Transforms/InstCombine/smax-icmp.ll b/test/Transforms/InstCombine/smax-icmp.ll
new file mode 100644
index 0000000..1462a96
--- /dev/null
+++ b/test/Transforms/InstCombine/smax-icmp.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; If we have an smax feeding a signed or equality icmp that shares an
+; operand with the smax, the compare should always be folded.
+; Test all 4 foldable predicates (eq,ne,sgt,sle) * 4 commutation
+; possibilities for each predicate. Note that folds to true/false
+; (predicate = sge/slt) or folds to an existing instruction should be
+; handled by InstSimplify.
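+;
+; As a concrete sanity check of the eq fold below (smax(X, Y) == X --> X >= Y):
+; with X = 7, Y = 3, smax(7, 3) = 7, so both sides hold; with X = 3, Y = 7,
+; smax = 7, the equality fails and 3 >= 7 is false, so both sides agree either
+; way. The autogenerated checks record current behavior: some cases (e.g.
+; @ne_smax2) already reduce to the first icmp, others still show the unfolded
+; select+icmp sequence.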
+
+; smax(X, Y) == X --> X >= Y
+
+define i1 @eq_smax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_smax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @eq_smax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_smax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @eq_smax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_smax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @eq_smax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_smax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smax(X, Y) <= X --> X >= Y
+
+define i1 @sle_smax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @sle_smax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sle i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @sle_smax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @sle_smax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sle i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @sle_smax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @sle_smax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sge i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @sle_smax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @sle_smax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sge i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smax(X, Y) != X --> X < Y
+
+define i1 @ne_smax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_smax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ne_smax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_smax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @ne_smax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_smax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ne_smax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_smax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smax(X, Y) > X --> X < Y
+
+define i1 @sgt_smax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @sgt_smax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sgt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @sgt_smax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @sgt_smax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sgt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @sgt_smax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @sgt_smax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp slt i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @sgt_smax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @sgt_smax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp sgt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp slt i32 %x, %sel
+  ret i1 %cmp2
+}
+
diff --git a/test/Transforms/InstCombine/smin-icmp.ll b/test/Transforms/InstCombine/smin-icmp.ll
new file mode 100644
index 0000000..083d063
--- /dev/null
+++ b/test/Transforms/InstCombine/smin-icmp.ll
@@ -0,0 +1,357 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; If we have an smin feeding a signed or equality icmp that shares an
+; operand with the smin, the compare should always be folded.
+; Test all 6 foldable predicates (eq,ne,sge,sgt,sle,slt) * 4 commutation
+; possibilities for each predicate. Note that folds to true/false or
+; folds to an existing instruction may be handled by InstSimplify.
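+;
+; For instance, smin(X, Y) <= X can never be false (the minimum never exceeds
+; either operand), which is why the @sle_smin* cases below simplify to
+; "ret i1 true"; by contrast smin(X, Y) != X is equivalent to X > Y (with
+; X = 5, Y = 2: smin = 2 != 5 and 5 > 2; with X = 2, Y = 5: both sides false).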
+
+; smin(X, Y) == X --> X <= Y
+
+define i1 @eq_smin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_smin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @eq_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_smin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @eq_smin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_smin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @eq_smin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_smin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smin(X, Y) >= X --> X <= Y
+
+define i1 @sge_smin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @sge_smin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sge i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @sge_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @sge_smin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sge i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @sge_smin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @sge_smin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sle i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @sge_smin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @sge_smin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sle i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smin(X, Y) != X --> X > Y
+
+define i1 @ne_smin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_smin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @ne_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_smin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @ne_smin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_smin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @ne_smin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_smin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smin(X, Y) < X --> X > Y
+
+define i1 @slt_smin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @slt_smin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp slt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @slt_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @slt_smin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp slt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @slt_smin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @slt_smin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sgt i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @slt_smin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @slt_smin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sgt i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smin(X, Y) <= X --> true
+
+define i1 @sle_smin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @sle_smin1(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sle i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @sle_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @sle_smin2(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sle i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @sle_smin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @sle_smin3(
+; CHECK-NEXT:    ret i1 true
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sge i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @sle_smin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @sle_smin4(
+; CHECK-NEXT:    ret i1 true
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sge i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; smin(X, Y) > X --> false
+
+define i1 @sgt_smin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @sgt_smin1(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp sgt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @sgt_smin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @sgt_smin2(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp sgt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @sgt_smin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @sgt_smin3(
+; CHECK-NEXT:    ret i1 false
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp slt i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @sgt_smin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @sgt_smin4(
+; CHECK-NEXT:    ret i1 false
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp slt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp slt i32 %x, %sel
+  ret i1 %cmp2
+}
+
diff --git a/test/Transforms/InstCombine/umax-icmp.ll b/test/Transforms/InstCombine/umax-icmp.ll
new file mode 100644
index 0000000..270b1ef
--- /dev/null
+++ b/test/Transforms/InstCombine/umax-icmp.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; If we have a umax feeding an unsigned or equality icmp that shares an
+; operand with the umax, the compare should always be folded.
+; Test all 4 foldable predicates (eq,ne,ugt,ule) * 4 commutation
+; possibilities for each predicate. Note that folds to true/false
+; (predicate = uge/ult) or folds to an existing instruction should be
+; handled by InstSimplify.
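+;
+; The unsigned variants matter because the same bit patterns order differently
+; when compared unsigned: with i32 X = -1 (0xFFFFFFFF) and Y = 1,
+; umax(X, Y) = X, so "umax(X, Y) != X" is false and so is "X <u Y"
+; (0xFFFFFFFF <u 1), matching the ne fold below, whereas the signed smax of
+; the same values would be 1.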
+
+; umax(X, Y) == X --> X >= Y
+
+define i1 @eq_umax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_umax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @eq_umax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_umax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @eq_umax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_umax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @eq_umax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_umax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; umax(X, Y) <= X --> X >= Y
+
+define i1 @ule_umax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ule_umax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ule i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ule_umax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ule_umax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ule i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @ule_umax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @ule_umax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp uge i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ule_umax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @ule_umax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp uge i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; umax(X, Y) != X --> X < Y
+
+define i1 @ne_umax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_umax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ne_umax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_umax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @ne_umax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_umax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ne_umax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_umax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; umax(X, Y) > X --> X < Y
+
+define i1 @ugt_umax1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ugt_umax1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ugt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ugt_umax2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ugt_umax2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ugt i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the max op to the RHS.
+
+define i1 @ugt_umax3(i32 %a, i32 %y) {
+; CHECK-LABEL: @ugt_umax3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ult i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute max operands.
+
+define i1 @ugt_umax4(i32 %a, i32 %y) {
+; CHECK-LABEL: @ugt_umax4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ugt i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ult i32 %x, %sel
+  ret i1 %cmp2
+}
+
diff --git a/test/Transforms/InstCombine/umin-icmp.ll b/test/Transforms/InstCombine/umin-icmp.ll
new file mode 100644
index 0000000..f9d814d
--- /dev/null
+++ b/test/Transforms/InstCombine/umin-icmp.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; If we have a umin feeding an unsigned or equality icmp that shares an
+; operand with the umin, the compare should always be folded.
+; Test all 4 foldable predicates (eq,ne,uge,ult) * 4 commutation
+; possibilities for each predicate. Note that folds to true/false
+; (predicate = ule/ugt) or folds to an existing instruction should be
+; handled by InstSimplify.
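+;
+; A quick sanity check of the eq fold below: with X = 3, Y = 10,
+; umin(3, 10) = 3, so umin(X, Y) == X holds and X <=u Y holds; with X = 10,
+; Y = 3, umin = 3 != 10 and 10 <=u 3 is false, so the two sides agree in both
+; directions.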
+
+; umin(X, Y) == X --> X <= Y
+
+define i1 @eq_umin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_umin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @eq_umin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_umin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @eq_umin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_umin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @eq_umin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @eq_umin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp eq i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; umin(X, Y) >= X --> X <= Y
+
+define i1 @uge_umin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @uge_umin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp uge i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @uge_umin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @uge_umin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %y, %x
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 %x
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp uge i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @uge_umin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @uge_umin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ule i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @uge_umin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @uge_umin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %y, i32 [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ule i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; umin(X, Y) != X --> X > Y
+
+define i1 @ne_umin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_umin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @ne_umin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ne_umin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @ne_umin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_umin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @ne_umin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @ne_umin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ne i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; umin(X, Y) < X --> X > Y
+
+define i1 @ult_umin1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ult_umin1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %x, %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 %x, i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[SEL]], %x
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ult i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @ult_umin2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ult_umin2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 %y, %x
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ult i32 %sel, %x
+  ret i1 %cmp2
+}
+
+; Disguise the icmp predicate by commuting the min op to the RHS.
+
+define i1 @ult_umin3(i32 %a, i32 %y) {
+; CHECK-LABEL: @ult_umin3(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[X]], %y
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i32 [[X]], i32 %y
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[X]], [[SEL]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %x, %y
+  %sel = select i1 %cmp1, i32 %x, i32 %y
+  %cmp2 = icmp ugt i32 %x, %sel
+  ret i1 %cmp2
+}
+
+; Commute min operands.
+
+define i1 @ult_umin4(i32 %a, i32 %y) {
+; CHECK-LABEL: @ult_umin4(
+; CHECK-NEXT:    [[X:%.*]] = add i32 %a, 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[X]], %y
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %x = add i32 %a, 3 ; thwart complexity-based canonicalization
+  %cmp1 = icmp ult i32 %y, %x
+  %sel = select i1 %cmp1, i32 %y, i32 %x
+  %cmp2 = icmp ugt i32 %x, %sel
+  ret i1 %cmp2
+}
+
diff --git a/test/Transforms/InstSimplify/and-icmps-same-ops.ll b/test/Transforms/InstSimplify/and-icmps-same-ops.ll
new file mode 100644
index 0000000..4da7938
--- /dev/null
+++ b/test/Transforms/InstSimplify/and-icmps-same-ops.ll
@@ -0,0 +1,1239 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; There are 10 * 10 combinations of icmp predicates that can be AND'd together.
+; The majority of these can be simplified to always false or just one of the icmps.
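+;
+; For instance, (a == b) && (a > b) can never hold, so @eq_sgt below folds to
+; false, while (a == b) && (a >= b) is implied by the equality alone, so
+; @eq_sge folds to just the eq compare. Pairs that genuinely mix signed and
+; unsigned information (e.g. @sge_ugt) are left unsimplified.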
+
+define i1 @eq_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ne(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_sgt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_slt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ugt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @eq_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ult(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @ne_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_eq(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_sgt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_slt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ugt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ne_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ult(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @sge_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_eq(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_sgt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_slt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sge_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @sgt_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_eq(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_sle(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_slt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sgt_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @sle_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_eq(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_sgt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_slt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @sle_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @slt_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_eq(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_sge(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_sgt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @slt_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @uge_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_eq(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ugt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @uge_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ult(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @ugt_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_eq(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ule(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ugt_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ult(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @ule_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_eq(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ugt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ule_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ult(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+;
+
+define i1 @ult_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_eq(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ne(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_uge(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ugt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+; Check a couple of vector variants to make sure those work too.
+
+define <2 x i1> @ult_uge_vec(<2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: @ult_uge_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %cmp1 = icmp ult <2 x i8> %a, %b
+  %cmp2 = icmp uge <2 x i8> %a, %b
+  %and = and <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %and
+}
+
+define <2 x i1> @ult_ule_vec(<2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: @ult_ule_vec(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <2 x i8> %a, %b
+; CHECK-NEXT:    ret <2 x i1> [[CMP1]]
+;
+  %cmp1 = icmp ult <2 x i8> %a, %b
+  %cmp2 = icmp ule <2 x i8> %a, %b
+  %and = and <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %and
+}
+
+define i1 @ult_uge_swap(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_uge_swap(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %b, %a
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp uge i8 %b, %a
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @ult_ult_swap(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ult_swap(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %b, %a
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ult i8 %b, %a
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index b825ac8..21c9fdd 100644
--- a/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -1,10 +1,10 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
 ; fsub -0.0, (fsub -0.0, X) ==> X
 define float @fsub_-0_-0_x(float %a) {
 ; CHECK-LABEL: @fsub_-0_-0_x(
-; CHECK:         ret float %a
+; CHECK-NEXT:    ret float %a
 ;
   %t1 = fsub float -0.0, %a
   %ret = fsub float -0.0, %t1
@@ -14,7 +14,7 @@
 ; fsub 0.0, (fsub -0.0, X) != X
 define float @fsub_0_-0_x(float %a) {
 ; CHECK-LABEL: @fsub_0_-0_x(
-; CHECK:         [[T1:%.*]] = fsub float 0.000000e+00, %a
+; CHECK-NEXT:    [[T1:%.*]] = fsub float 0.000000e+00, %a
 ; CHECK-NEXT:    [[RET:%.*]] = fsub float -0.000000e+00, [[T1]]
 ; CHECK-NEXT:    ret float [[RET]]
 ;
@@ -26,7 +26,7 @@
 ; fsub -0.0, (fsub 0.0, X) != X
 define float @fsub_-0_0_x(float %a) {
 ; CHECK-LABEL: @fsub_-0_0_x(
-; CHECK:         [[T1:%.*]] = fsub float -0.000000e+00, %a
+; CHECK-NEXT:    [[T1:%.*]] = fsub float -0.000000e+00, %a
 ; CHECK-NEXT:    [[RET:%.*]] = fsub float 0.000000e+00, [[T1]]
 ; CHECK-NEXT:    ret float [[RET]]
 ;
@@ -38,7 +38,7 @@
 ; fsub X, 0 ==> X
 define float @fsub_x_0(float %a) {
 ; CHECK-LABEL: @fsub_x_0(
-; CHECK:         ret float %a
+; CHECK-NEXT:    ret float %a
 ;
   %ret = fsub float %a, 0.0
   ret float %ret
@@ -47,7 +47,7 @@
 ; fadd X, -0 ==> X
 define float @fadd_x_n0(float %a) {
 ; CHECK-LABEL: @fadd_x_n0(
-; CHECK:         ret float %a
+; CHECK-NEXT:    ret float %a
 ;
   %ret = fadd float %a, -0.0
   ret float %ret
@@ -56,12 +56,21 @@
 ; fmul X, 1.0 ==> X
 define double @fmul_X_1(double %a) {
 ; CHECK-LABEL: @fmul_X_1(
-; CHECK:         ret double %a
+; CHECK-NEXT:    ret double %a
 ;
   %b = fmul double 1.000000e+00, %a
   ret double %b
 }
 
+; fdiv X, 1.0 ==> X
+define float @fdiv_x_1(float %a) {
+; CHECK-LABEL: @fdiv_x_1(
+; CHECK-NEXT:    ret float %a
+;
+  %ret = fdiv float %a, 1.0
+  ret float %ret
+}
+
 ; We can't optimize away the fadd in this test because the input
 ; value to the function and subsequently to the fadd may be -0.0.
 ; In that one special case, the result of the fadd should be +0.0
@@ -75,7 +84,13 @@
 
 define float @PR22688(float %x) {
 ; CHECK-LABEL: @PR22688(
-; CHECK:         [[TMP7:%.*]] = fadd float {{%.*}}, 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @sqrtf(float %x)
+; CHECK-NEXT:    [[TMP2:%.*]] = call float @sqrtf(float [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call float @sqrtf(float [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @sqrtf(float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call float @sqrtf(float [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call float @sqrtf(float [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd float [[TMP6]], 0.000000e+00
 ; CHECK-NEXT:    ret float [[TMP7]]
 ;
   %1 = call float @sqrtf(float %x)
diff --git a/test/Transforms/InstSimplify/or-icmps-same-ops.ll b/test/Transforms/InstSimplify/or-icmps-same-ops.ll
new file mode 100644
index 0000000..326b1e1
--- /dev/null
+++ b/test/Transforms/InstSimplify/or-icmps-same-ops.ll
@@ -0,0 +1,1239 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; There are 10 * 10 combinations of icmp predicates that can be OR'd together.
+; The majority of these can be simplified to always true or just one of the icmps.
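+; A rough way to see why most of these fold: each predicate corresponds to a
+; subset of the three possible outcomes {a < b, a == b, a > b} (in its own
+; signedness), and OR'ing two compares of the same operands takes the union of
+; those subsets. InstSimplify only folds when that union is a constant or
+; already matches one of the existing compares: eq | ne covers every outcome
+; and becomes true, eq | sge collapses to the existing sge, while eq | sgt
+; (which would be sge) is left alone because no sge instruction exists here.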
+
+define i1 @eq_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_eq(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ne(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_sge(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_sle(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_uge(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ule(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @eq_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @eq_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @ne_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_eq(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ne(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_sge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_sle(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_uge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ule(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ne_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ne_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @sge_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ne(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_sge(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_sle(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_slt(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sge_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @sge_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sge i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @sgt_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ne(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_sge(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_sgt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_sle(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sgt_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @sgt_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sgt i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @sle_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ne(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_sge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_sgt(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_sle(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @sle_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @sle_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp sle i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @slt_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ne(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_sge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_sle(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_slt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_uge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ule(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @slt_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @slt_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp slt i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @uge_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ne(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_uge(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ule(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @uge_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @uge_ult(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp uge i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @ugt_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ne(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_uge(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ugt(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ule(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ugt_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ugt_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ugt i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @ule_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ne(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_uge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ugt(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ule(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ule_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ule_ult(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP1]]
+;
+  %cmp1 = icmp ule i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+;
+
+define i1 @ult_eq(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_eq(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp eq i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_ne(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ne(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ne i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_sge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_sge(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp sge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_sgt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp sgt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_sle(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_sle(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp sle i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_slt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_slt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp slt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_uge(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_uge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp uge i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_ugt(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ugt(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ugt i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_ule(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ule(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ule i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_ult(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ult(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ult i8 %a, %b
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+; Check a couple of vector variants to make sure those work too.
+
+define <2 x i1> @ult_uge_vec(<2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: @ult_uge_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %cmp1 = icmp ult <2 x i8> %a, %b
+  %cmp2 = icmp uge <2 x i8> %a, %b
+  %or = or <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %or
+}
+
+define <2 x i1> @ult_ule_vec(<2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: @ult_ule_vec(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ule <2 x i8> %a, %b
+; CHECK-NEXT:    ret <2 x i1> [[CMP2]]
+;
+  %cmp1 = icmp ult <2 x i8> %a, %b
+  %cmp2 = icmp ule <2 x i8> %a, %b
+  %or = or <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %or
+}
+
+define i1 @ult_ne_swap(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ne_swap(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 %b, %a
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp ne i8 %b, %a
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @ult_ule_swap(i8 %a, i8 %b) {
+; CHECK-LABEL: @ult_ule_swap(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i8 %a, %b
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i8 %b, %a
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %cmp1 = icmp ult i8 %a, %b
+  %cmp2 = icmp uge i8 %b, %a
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
new file mode 100644
index 0000000..21b59f8
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the scalarized operands for a
+; user-specified vectorization factor when interleaving is disabled. We use the
+; "optsize" attribute to disable all interleaving calculations.
+;
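+; Concretely, the CHECK lines below expect each of the two lanes to get its
+; own predicated block: the lane is extracted from %wide.load (once for the
+; udiv operand and once for the sunk add), the add and udiv run as scalars,
+; the result is inserted back into a vector, and a phi merges the predicated
+; and fall-through values.
+;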
+; CHECK: vector.body:
+; CHECK:   %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
+; CHECK:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK:   %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK:   %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK:   %[[T02:.+]] = add nsw i64 %[[T01]], %x
+; CHECK:   %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
+; CHECK:   %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
+; CHECK:   br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK:   %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK:   %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK:   %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK:   %[[T08:.+]] = add nsw i64 %[[T07]], %x
+; CHECK:   %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
+; CHECK:   %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
+; CHECK:   br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK:   phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %tmp2 = load i64, i64* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i64 %tmp2, %x
+  %tmp4 = udiv i64 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i64 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, 100
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i64 [ %tmp6, %for.inc ]
+  ret i64 %tmp7
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index 6b3a5b6..61ed205 100644
--- a/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -57,9 +57,9 @@
 ; as:
 ;
 ; Cost of store:
-;   (store(4) + extractelement(6)) / 2 = 5
+;   (store(4) + extractelement(3)) / 2 = 3
 ;
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
 ;
 define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) {
@@ -85,3 +85,147 @@
 for.end:
   ret void
 }
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block.  If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of udiv:
+;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+;
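+; As an illustrative expansion of the arithmetic above (assuming the quoted
+; per-operation costs -- add = 2, udiv = 2, extractelement = 3,
+; insertelement = 3 -- and integer division that rounds down): the add is
+; charged one extractelement for its vector operand, and the udiv is
+; additionally charged one insertelement to rebuild the vector result used by
+; the phi:
+;
+;   add:  (2 + 3) / 2     = 2
+;   udiv: (2 + 3 + 3) / 2 = 4
+;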
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block.  If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   (add(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+;   store(4) / 2 = 2
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+;
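+; As an illustrative expansion (assuming the quoted per-operation costs): the
+; add is again charged one extractelement for its vector operand, while the
+; store is charged no extractelement because its stored value is the
+; already-scalarized add, so only the store cost itself is halved:
+;
+;   add:   (2 + 3) / 2 = 2
+;   store: 4 / 2       = 2
+;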
+define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = add nsw i32 %tmp1, %x
+  store i32 %tmp2, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: predication_multi_context
+;
+; This test checks that we correctly compute the cost of multiple predicated
+; instructions in the same block. The sdiv, udiv, and store must be scalarized
+; and predicated. The sub feeding the store is scalarized and sunk inside the
+; store's predicated block. However, the add feeding the sdiv and udiv cannot
+; be sunk and is not scalarized. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+;   add(1) = 1
+; Cost of sdiv:
+;   (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of udiv:
+;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of sub:
+;   (sub(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+;   store(4) / 2 = 2
+;
+; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
+; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK:     Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
+; CHECK:     Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
+;
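+; As an illustrative expansion (assuming the quoted per-operation costs and
+; integer division that rounds down): the add stays vectorized, so it is
+; charged only its vector cost; each division is charged two operand extracts
+; (2 x 3 = 6) plus one insertelement (3) for its result, and the sub and store
+; follow the same pattern as the previous test:
+;
+;   add:        1
+;   sdiv/udiv:  (2 + 6 + 3) / 2 = 5
+;   sub:        (2 + 3) / 2     = 2
+;   store:      4 / 2           = 2
+;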
+define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = add i32 %tmp1, %x
+  %tmp3 = sdiv i32 %tmp1, %tmp2
+  %tmp4 = udiv i32 %tmp3, %tmp2
+  %tmp5 = sub i32 %tmp4, %x
+  store i32 %tmp5, i32* %tmp0, align 4
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/x86-predication.ll b/test/Transforms/LoopVectorize/X86/x86-predication.ll
new file mode 100644
index 0000000..b35fc59
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: predicated_sdiv_masked_load
+;
+; This test ensures that we don't scalarize the predicated load. Since the load
+; can be vectorized with predication, scalarizing it would cause its pointer
+; operand to become non-uniform.
+;
+; CHECK: vector.body:
+; CHECK:   %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32
+; CHECK:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK:   %[[T0:.+]] = extractelement <2 x i32> %wide.masked.load, i32 0
+; CHECK:   %[[T1:.+]] = sdiv i32 %[[T0]], %x
+; CHECK:   %[[T2:.+]] = insertelement <2 x i32> undef, i32 %[[T1]], i32 0
+; CHECK:   br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK:   %[[T3:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T2]], %[[IF0]] ]
+; CHECK:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK:   %[[T4:.+]] = extractelement <2 x i32> %wide.masked.load, i32 1
+; CHECK:   %[[T5:.+]] = sdiv i32 %[[T4]], %x
+; CHECK:   %[[T6:.+]] = insertelement <2 x i32> %[[T3]], i32 %[[T5]], i32 1
+; CHECK:   br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK:   phi <2 x i32> [ %[[T3]], %[[CONT0]] ], [ %[[T6]], %[[IF1]] ]
+; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp7, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp3 = load i32, i32* %tmp2, align 4
+  %tmp4 = sdiv i32 %tmp3, %x
+  %tmp5 = add nsw i32 %tmp4, %tmp1
+  br label %for.inc
+
+for.inc:
+  %tmp6 = phi i32 [ %tmp1, %for.body ], [ %tmp5, %if.then]
+  %tmp7 = add i32 %r, %tmp6
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 10000
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  %tmp8 = phi i32 [ %tmp7, %for.inc ]
+  ret i32 %tmp8
+}
diff --git a/test/Transforms/LoopVectorize/if-pred-non-void.ll b/test/Transforms/LoopVectorize/if-pred-non-void.ll
index a2c2580..dbbf7b3 100644
--- a/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ b/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -207,3 +207,57 @@
   %exitcond = icmp eq i64 %indvars.iv.next, 128
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
+
+
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+; CHECK: vector.body:
+; CHECK:   %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4
+; CHECK:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK:   %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK:   %[[T01:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK:   %[[T02:.+]] = add nsw i32 %[[T01]], %x
+; CHECK:   %[[T03:.+]] = udiv i32 %[[T00]], %[[T02]]
+; CHECK:   %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0
+; CHECK:   br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK:   %[[T05:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK:   %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK:   %[[T07:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK:   %[[T08:.+]] = add nsw i32 %[[T07]], %x
+; CHECK:   %[[T09:.+]] = udiv i32 %[[T06]], %[[T08]]
+; CHECK:   %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1
+; CHECK:   br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK:   phi <2 x i32> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp3 = add nsw i32 %tmp2, %x
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}
diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll
index f19485c..3912375 100644
--- a/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -12,7 +12,6 @@
 ; VEC-LABEL: test
 ; VEC:   %[[v0:.+]] = add i64 %index, 0
 ; VEC:   %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
-; VEC:   %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
 ; VEC:   %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
 ; VEC:   %[[o1:.+]] = or <2 x i1> zeroinitializer, %[[v10]]
 ; VEC:   %[[v11:.+]] = extractelement <2 x i1> %[[o1]], i32 0
@@ -20,9 +19,10 @@
 ; VEC:   br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
 ;
 ; VEC: [[cond]]:
-; VEC:   %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC:   %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; VEC:   %[[v9a:.+]] = add nsw i32 %[[v13]], 20
 ; VEC:   %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
-; VEC:   store i32 %[[v13]], i32* %[[v2]], align 4
+; VEC:   store i32 %[[v9a]], i32* %[[v2]], align 4
 ; VEC:   br label %[[else:.+]]
 ;
 ; VEC: [[else]]:
@@ -31,10 +31,11 @@
 ; VEC:   br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
 ;
 ; VEC: [[cond2]]:
-; VEC:   %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC:   %[[v17:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; VEC:   %[[v9b:.+]] = add nsw i32 %[[v17]], 20
 ; VEC:   %[[v1:.+]] = add i64 %index, 1
 ; VEC:   %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
-; VEC:   store i32 %[[v17]], i32* %[[v4]], align 4
+; VEC:   store i32 %[[v9b]], i32* %[[v4]], align 4
 ; VEC:   br label %[[else2:.+]]
 ;
 ; VEC: [[else2]]:
diff --git a/test/Transforms/LoopVersioning/loop-invariant-bound.ll b/test/Transforms/LoopVersioning/loop-invariant-bound.ll
new file mode 100644
index 0000000..3411adb
--- /dev/null
+++ b/test/Transforms/LoopVersioning/loop-invariant-bound.ll
@@ -0,0 +1,37 @@
+; RUN: opt -loop-versioning -S < %s | FileCheck %s
+; Checks that, when introducing checks, we don't accidentally introduce non-dominating instructions.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%Dual.212 = type { %Dual.213, %Partials.215 }
+%Dual.213 = type { double, %Partials.214 }
+%Partials.214 = type { [2 x double] }
+%Partials.215 = type { [2 x %Dual.213] }
+
+; Function Attrs: sspreq
+define void @"julia_axpy!_65480"(%Dual.212*) {
+top:
+  br label %if24
+
+; CHECK-NOT: %bc = bitcast i64* %v2.sroa.0.0..sroa_cast
+; CHECK: %bound0
+
+if24:                                             ; preds = %if24, %top
+  %"#temp#1.sroa.3.02" = phi i64 [ undef, %top ], [ %2, %if24 ]
+  %"#temp#1.sroa.0.01" = phi i64 [ undef, %top ], [ %1, %if24 ]
+  %1 = add i64 %"#temp#1.sroa.0.01", 1
+  %2 = add i64 %"#temp#1.sroa.3.02", 1
+  ; This pointer is loop invariant. LAA used to reuse it for the memcheck, even though it didn't dominate.
+  %v2.sroa.0.0..sroa_cast = bitcast %Dual.212* %0 to i64*
+  %v2.sroa.0.0.copyload = load i64, i64* %v2.sroa.0.0..sroa_cast, align 1
+  %3 = add i64 %"#temp#1.sroa.0.01", -1
+  %4 = getelementptr inbounds %Dual.212, %Dual.212* undef, i64 %3, i32 1, i32 0, i64 0, i32 1, i32 0, i64 0
+  %5 = bitcast double* %4 to i64*
+  store i64 undef, i64* %5, align 8
+  %notlhs27 = icmp eq i64 %2, undef
+  %notrhs28 = icmp eq i64 %1, undef
+  %6 = or i1 %notrhs28, %notlhs27
+  br i1 %6, label %L41.L335_crit_edge, label %if24
+
+L41.L335_crit_edge:                               ; preds = %if24
+  ret void
+}
diff --git a/test/Transforms/LowerTypeTests/function.ll b/test/Transforms/LowerTypeTests/function.ll
index 7b7a6af..effe769 100644
--- a/test/Transforms/LowerTypeTests/function.ll
+++ b/test/Transforms/LowerTypeTests/function.ll
@@ -1,7 +1,8 @@
-; RUN: opt -S -lowertypetests -mtriple=i686-unknown-linux-gnu < %s | FileCheck --check-prefix=X86 %s
-; RUN: opt -S -lowertypetests -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck --check-prefix=X86 %s
-; RUN: opt -S -lowertypetests -mtriple=arm-unknown-linux-gnu < %s | FileCheck --check-prefix=ARM %s
-; RUN: opt -S -lowertypetests -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck --check-prefix=ARM %s
+; RUN: opt -S -lowertypetests -mtriple=i686-unknown-linux-gnu < %s | FileCheck --check-prefixes=X86,NATIVE %s
+; RUN: opt -S -lowertypetests -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck --check-prefixes=X86,NATIVE %s
+; RUN: opt -S -lowertypetests -mtriple=arm-unknown-linux-gnu < %s | FileCheck --check-prefixes=ARM,NATIVE %s
+; RUN: opt -S -lowertypetests -mtriple=thumb-unknown-linux-gnu < %s | FileCheck --check-prefixes=THUMB,NATIVE %s
+; RUN: opt -S -lowertypetests -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck --check-prefixes=ARM,NATIVE %s
 ; RUN: opt -S -lowertypetests -mtriple=wasm32-unknown-unknown < %s | FileCheck --check-prefix=WASM32 %s
 
 ; Tests that we correctly handle bitsets containing 2 or more functions.
@@ -15,7 +16,7 @@
 ; X86-NEXT: module asm ".type g, function"
 ; X86-NEXT: module asm "g = .cfi.jumptable + 8"
 ; X86-NEXT: module asm ".size g, 8"
-; X86-NEXT: module asm ".section .text.cfi, \22ax\22, @progbits"
+; X86-NEXT: module asm ".section .text.cfi, \22ax\22, %progbits"
 ; X86-NEXT: module asm ".balign 8"
 ; X86-NEXT: module asm ".cfi.jumptable:"
 ; X86-NEXT: module asm "jmp f.cfi@plt"
@@ -34,30 +35,43 @@
 ; ARM-NEXT: module asm ".type g, function"
 ; ARM-NEXT: module asm "g = .cfi.jumptable + 4"
 ; ARM-NEXT: module asm ".size g, 4"
-; ARM-NEXT: module asm ".section .text.cfi, \22ax\22, @progbits"
+; ARM-NEXT: module asm ".section .text.cfi, \22ax\22, %progbits"
 ; ARM-NEXT: module asm ".balign 4"
 ; ARM-NEXT: module asm ".cfi.jumptable:"
 ; ARM-NEXT: module asm "b f.cfi"
 ; ARM-NEXT: module asm "b g.cfi"
 
+; THUMB:      module asm ".globl f"
+; THUMB-NEXT: module asm ".type f, function"
+; THUMB-NEXT: module asm ".thumb_set f, .cfi.jumptable + 0"
+; THUMB-NEXT: module asm ".size f, 4"
+; THUMB-NEXT: module asm ".type g, function"
+; THUMB-NEXT: module asm ".thumb_set g, .cfi.jumptable + 4"
+; THUMB-NEXT: module asm ".size g, 4"
+; THUMB-NEXT: module asm ".section .text.cfi, \22ax\22, %progbits"
+; THUMB-NEXT: module asm ".balign 4"
+; THUMB-NEXT: module asm ".thumb_func"
+; THUMB-NEXT: module asm ".cfi.jumptable:"
+; THUMB-NEXT: module asm "b.w f.cfi"
+; THUMB-NEXT: module asm "b.w g.cfi"
+
+
 ; X86: @.cfi.jumptable = external hidden constant [2 x [8 x i8]]
 ; ARM: @.cfi.jumptable = external hidden constant [2 x [4 x i8]]
+; THUMB: @.cfi.jumptable = external hidden constant [2 x [4 x i8]]
 
 ; WASM32: private constant [0 x i8] zeroinitializer
 @0 = private unnamed_addr constant [2 x void (...)*] [void (...)* bitcast (void ()* @f to void (...)*), void (...)* bitcast (void ()* @g to void (...)*)], align 16
 
-; X86: @llvm.used = appending global [2 x i8*] [i8* bitcast (void ()* @f.cfi to i8*), i8* bitcast (void ()* @g.cfi to i8*)], section "llvm.metadata"
-; ARM: @llvm.used = appending global [2 x i8*] [i8* bitcast (void ()* @f.cfi to i8*), i8* bitcast (void ()* @g.cfi to i8*)], section "llvm.metadata"
+; NATIVE: @llvm.used = appending global [2 x i8*] [i8* bitcast (void ()* @f.cfi to i8*), i8* bitcast (void ()* @g.cfi to i8*)], section "llvm.metadata"
 
-; X86: define internal void @f.cfi()
-; ARM: define internal void @f.cfi()
+; NATIVE: define internal void @f.cfi()
 ; WASM32: define void @f() !type !{{[0-9]+}} !wasm.index ![[I0:[0-9]+]]
 define void @f() !type !0 {
   ret void
 }
 
-; X86: define internal void @g.cfi()
-; ARM: define internal void @g.cfi()
+; NATIVE: define internal void @g.cfi()
 ; WASM32: define internal void @g() !type !{{[0-9]+}} !wasm.index ![[I1:[0-9]+]]
 define internal void @g() !type !0 {
   ret void
@@ -76,10 +90,8 @@
   ret i1 %x
 }
 
-; X86: declare void @f()
-; ARM: declare void @f()
-; X86: declare hidden void @g()
-; ARM: declare hidden void @g()
+; NATIVE: declare void @f()
+; NATIVE: declare hidden void @g()
 
 ; WASM32: ![[I0]] = !{i64 1}
 ; WASM32: ![[I1]] = !{i64 2}
diff --git a/test/Transforms/LowerTypeTests/unsat.ll b/test/Transforms/LowerTypeTests/unsat.ll
new file mode 100644
index 0000000..5bafc9e
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/unsat.ll
@@ -0,0 +1,12 @@
+; FIXME: We should not require -O2 to simplify this to return false.
+; RUN: opt -S -lowertypetests -O2 < %s | FileCheck %s
+
+target datalayout = "e-p:32:32"
+
+declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+define i1 @foo(i8* %p) {
+  %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1")
+  ; CHECK: ret i1 false
+  ret i1 %x
+}
diff --git a/test/Transforms/SCCP/logical-nuke.ll b/test/Transforms/SCCP/logical-nuke.ll
index 4ef52a2..6ca16de4 100644
--- a/test/Transforms/SCCP/logical-nuke.ll
+++ b/test/Transforms/SCCP/logical-nuke.ll
@@ -1,6 +1,6 @@
 ; RUN: opt < %s -sccp -S | FileCheck %s
 
-; Test that SCCP has basic knowledge of when and/or nuke overdefined values.
+; Test that SCCP has basic knowledge of when and/or/mul instructions nuke overdefined values.
 
 ; CHECK-LABEL: test
 ; CHECK: ret i32 0
@@ -29,3 +29,11 @@
   %Y = or i32 %X, undef
   ret i32 %Y
 }
+
+; X * 0 = 0 even if X is overdefined.
+; CHECK-LABEL: test5
+; CHECK: ret i32 0
+define i32 @test5(i32 %foo) {
+  %patatino = mul i32 %foo, 0
+  ret i32 %patatino
+}
diff --git a/test/Transforms/SCCP/undef-resolve.ll b/test/Transforms/SCCP/undef-resolve.ll
index fcfe3f5..dd7f1f3 100644
--- a/test/Transforms/SCCP/undef-resolve.ll
+++ b/test/Transforms/SCCP/undef-resolve.ll
@@ -135,7 +135,7 @@
   %t = ashr i32 undef, 31
   ret i32 %t
 ; CHECK-LABEL: @test6(
-; CHECK: ret i32 -1
+; CHECK: ret i32 0
 }
 
 ; Make sure lshr produces a possible value
@@ -178,5 +178,5 @@
   %shr4 = ashr i32 undef, zext (i1 icmp eq (i32* bitcast (i32 (i1)* @test11 to i32*), i32* @GV) to i32)
   ret i32 %shr4
 ; CHECK-LABEL: @test11(
-; CHECK: ret i32 -1
+; CHECK: ret i32 0
 }
diff --git a/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/test/Transforms/SLPVectorizer/X86/arith-fp.ll
index cdbba35..7eec13e 100644
--- a/test/Transforms/SLPVectorizer/X86/arith-fp.ll
+++ b/test/Transforms/SLPVectorizer/X86/arith-fp.ll
@@ -222,22 +222,15 @@
 
 define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_4f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x double> %a, i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> %b, i32 3
-; CHECK-NEXT:    [[C0:%.*]] = fadd double [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fadd double [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fadd double [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fadd double [[A3]], [[B3]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[C3]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x double> [[R3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
@@ -261,22 +254,15 @@
 
 define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_4f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x double> %a, i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> %b, i32 3
-; CHECK-NEXT:    [[C0:%.*]] = fsub double [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fsub double [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fsub double [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fsub double [[A3]], [[B3]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[C3]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x double> [[R3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
@@ -300,22 +286,15 @@
 
 define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_4f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x double> %a, i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> %b, i32 3
-; CHECK-NEXT:    [[C0:%.*]] = fmul double [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fmul double [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fmul double [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fmul double [[A3]], [[B3]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[C3]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x double> [[R3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
@@ -339,32 +318,15 @@
 
 define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_div_4f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x double> %a, i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> %b, i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> undef, double [[A2]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B3]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP13]], i32 2
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP14]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x double> [[R3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
@@ -388,38 +350,23 @@
 
 define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_8f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x float> %b, i32 7
-; CHECK-NEXT:    [[C0:%.*]] = fadd float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fadd float [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fadd float [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fadd float [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fadd float [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fadd float [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fadd float [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fadd float [[A7]], [[B7]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[C7]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -459,38 +406,23 @@
 
 define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x float> %b, i32 7
-; CHECK-NEXT:    [[C0:%.*]] = fsub float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fsub float [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fsub float [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fsub float [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fsub float [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fsub float [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fsub float [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fsub float [[A7]], [[B7]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[C7]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -530,38 +462,23 @@
 
 define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x float> %b, i32 7
-; CHECK-NEXT:    [[C0:%.*]] = fmul float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fmul float [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fmul float [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fmul float [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fmul float [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fmul float [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fmul float [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fmul float [[A7]], [[B7]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[C7]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -601,56 +518,23 @@
 
 define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_8f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x float> %b, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> undef, float [[B0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[B1]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[B2]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[B3]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = fdiv <4 x float> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> undef, float [[A4]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[A5]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[A6]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[A7]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> undef, float [[B4]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[B5]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[B6]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[B7]], i32 3
-; CHECK-NEXT:    [[TMP18:%.*]] = fdiv <4 x float> [[TMP13]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP20]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP9]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP21]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP9]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP22]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP23]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP25]], i32 6
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP26]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -694,38 +578,23 @@
 
 define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_8f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x double> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x double> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x double> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x double> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x double> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x double> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x double> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x double> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x double> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x double> %b, i32 7
-; CHECK-NEXT:    [[C0:%.*]] = fadd double [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fadd double [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fadd double [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fadd double [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fadd double [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fadd double [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fadd double [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fadd double [[A7]], [[B7]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x double> [[R7]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
@@ -765,38 +634,23 @@
 
 define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x double> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x double> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x double> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x double> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x double> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x double> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x double> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x double> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x double> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x double> %b, i32 7
-; CHECK-NEXT:    [[C0:%.*]] = fsub double [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fsub double [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fsub double [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fsub double [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fsub double [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fsub double [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fsub double [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fsub double [[A7]], [[B7]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x double> [[R7]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
@@ -836,38 +690,23 @@
 
 define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x double> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x double> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x double> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x double> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x double> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x double> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x double> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x double> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x double> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x double> %b, i32 7
-; CHECK-NEXT:    [[C0:%.*]] = fmul double [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fmul double [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fmul double [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fmul double [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fmul double [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fmul double [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fmul double [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fmul double [[A7]], [[B7]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x double> [[R7]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
@@ -907,58 +746,23 @@
 
 define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_div_8f64(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x double> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x double> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x double> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x double> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x double> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x double> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x double> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x double> %a, i32 7
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x double> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x double> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x double> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x double> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <8 x double> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <8 x double> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <8 x double> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <8 x double> %b, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> undef, double [[A2]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B3]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> undef, double [[A4]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[A5]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> undef, double [[B4]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[B5]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = fdiv <2 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> undef, double [[A6]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[A7]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> undef, double [[B6]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[B7]], i32 1
-; CHECK-NEXT:    [[TMP20:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP22]], i32 1
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP23]], i32 2
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP24]], i32 3
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x double> [[TMP15]], i32 0
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP25]], i32 4
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x double> [[TMP15]], i32 1
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP26]], i32 5
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <2 x double> [[TMP20]], i32 0
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP27]], i32 6
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x double> [[TMP20]], i32 1
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP28]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
 ; CHECK-NEXT:    ret <8 x double> [[R7]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
@@ -998,70 +802,39 @@
 
 define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_16f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <16 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <16 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <16 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <16 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <16 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <16 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <16 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <16 x float> %a, i32 7
-; CHECK-NEXT:    [[A8:%.*]] = extractelement <16 x float> %a, i32 8
-; CHECK-NEXT:    [[A9:%.*]] = extractelement <16 x float> %a, i32 9
-; CHECK-NEXT:    [[A10:%.*]] = extractelement <16 x float> %a, i32 10
-; CHECK-NEXT:    [[A11:%.*]] = extractelement <16 x float> %a, i32 11
-; CHECK-NEXT:    [[A12:%.*]] = extractelement <16 x float> %a, i32 12
-; CHECK-NEXT:    [[A13:%.*]] = extractelement <16 x float> %a, i32 13
-; CHECK-NEXT:    [[A14:%.*]] = extractelement <16 x float> %a, i32 14
-; CHECK-NEXT:    [[A15:%.*]] = extractelement <16 x float> %a, i32 15
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <16 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <16 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <16 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <16 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <16 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <16 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <16 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <16 x float> %b, i32 7
-; CHECK-NEXT:    [[B8:%.*]] = extractelement <16 x float> %b, i32 8
-; CHECK-NEXT:    [[B9:%.*]] = extractelement <16 x float> %b, i32 9
-; CHECK-NEXT:    [[B10:%.*]] = extractelement <16 x float> %b, i32 10
-; CHECK-NEXT:    [[B11:%.*]] = extractelement <16 x float> %b, i32 11
-; CHECK-NEXT:    [[B12:%.*]] = extractelement <16 x float> %b, i32 12
-; CHECK-NEXT:    [[B13:%.*]] = extractelement <16 x float> %b, i32 13
-; CHECK-NEXT:    [[B14:%.*]] = extractelement <16 x float> %b, i32 14
-; CHECK-NEXT:    [[B15:%.*]] = extractelement <16 x float> %b, i32 15
-; CHECK-NEXT:    [[C0:%.*]] = fadd float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fadd float [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fadd float [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fadd float [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fadd float [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fadd float [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fadd float [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fadd float [[A7]], [[B7]]
-; CHECK-NEXT:    [[C8:%.*]] = fadd float [[A8]], [[B8]]
-; CHECK-NEXT:    [[C9:%.*]] = fadd float [[A9]], [[B9]]
-; CHECK-NEXT:    [[C10:%.*]] = fadd float [[A10]], [[B10]]
-; CHECK-NEXT:    [[C11:%.*]] = fadd float [[A11]], [[B11]]
-; CHECK-NEXT:    [[C12:%.*]] = fadd float [[A12]], [[B12]]
-; CHECK-NEXT:    [[C13:%.*]] = fadd float [[A13]], [[B13]]
-; CHECK-NEXT:    [[C14:%.*]] = fadd float [[A14]], [[B14]]
-; CHECK-NEXT:    [[C15:%.*]] = fadd float [[A15]], [[B15]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[C7]], i32 7
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[C8]], i32 8
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[C10]], i32 10
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[C11]], i32 11
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[C12]], i32 12
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[C14]], i32 14
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[C15]], i32 15
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
 ; CHECK-NEXT:    ret <16 x float> [[R15]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
@@ -1133,70 +906,39 @@
 
 define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_16f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <16 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <16 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <16 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <16 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <16 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <16 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <16 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <16 x float> %a, i32 7
-; CHECK-NEXT:    [[A8:%.*]] = extractelement <16 x float> %a, i32 8
-; CHECK-NEXT:    [[A9:%.*]] = extractelement <16 x float> %a, i32 9
-; CHECK-NEXT:    [[A10:%.*]] = extractelement <16 x float> %a, i32 10
-; CHECK-NEXT:    [[A11:%.*]] = extractelement <16 x float> %a, i32 11
-; CHECK-NEXT:    [[A12:%.*]] = extractelement <16 x float> %a, i32 12
-; CHECK-NEXT:    [[A13:%.*]] = extractelement <16 x float> %a, i32 13
-; CHECK-NEXT:    [[A14:%.*]] = extractelement <16 x float> %a, i32 14
-; CHECK-NEXT:    [[A15:%.*]] = extractelement <16 x float> %a, i32 15
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <16 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <16 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <16 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <16 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <16 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <16 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <16 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <16 x float> %b, i32 7
-; CHECK-NEXT:    [[B8:%.*]] = extractelement <16 x float> %b, i32 8
-; CHECK-NEXT:    [[B9:%.*]] = extractelement <16 x float> %b, i32 9
-; CHECK-NEXT:    [[B10:%.*]] = extractelement <16 x float> %b, i32 10
-; CHECK-NEXT:    [[B11:%.*]] = extractelement <16 x float> %b, i32 11
-; CHECK-NEXT:    [[B12:%.*]] = extractelement <16 x float> %b, i32 12
-; CHECK-NEXT:    [[B13:%.*]] = extractelement <16 x float> %b, i32 13
-; CHECK-NEXT:    [[B14:%.*]] = extractelement <16 x float> %b, i32 14
-; CHECK-NEXT:    [[B15:%.*]] = extractelement <16 x float> %b, i32 15
-; CHECK-NEXT:    [[C0:%.*]] = fsub float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fsub float [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fsub float [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fsub float [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fsub float [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fsub float [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fsub float [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fsub float [[A7]], [[B7]]
-; CHECK-NEXT:    [[C8:%.*]] = fsub float [[A8]], [[B8]]
-; CHECK-NEXT:    [[C9:%.*]] = fsub float [[A9]], [[B9]]
-; CHECK-NEXT:    [[C10:%.*]] = fsub float [[A10]], [[B10]]
-; CHECK-NEXT:    [[C11:%.*]] = fsub float [[A11]], [[B11]]
-; CHECK-NEXT:    [[C12:%.*]] = fsub float [[A12]], [[B12]]
-; CHECK-NEXT:    [[C13:%.*]] = fsub float [[A13]], [[B13]]
-; CHECK-NEXT:    [[C14:%.*]] = fsub float [[A14]], [[B14]]
-; CHECK-NEXT:    [[C15:%.*]] = fsub float [[A15]], [[B15]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[C7]], i32 7
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[C8]], i32 8
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[C10]], i32 10
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[C11]], i32 11
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[C12]], i32 12
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[C14]], i32 14
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[C15]], i32 15
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
 ; CHECK-NEXT:    ret <16 x float> [[R15]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
@@ -1268,70 +1010,39 @@
 
 define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_16f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <16 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <16 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <16 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <16 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <16 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <16 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <16 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <16 x float> %a, i32 7
-; CHECK-NEXT:    [[A8:%.*]] = extractelement <16 x float> %a, i32 8
-; CHECK-NEXT:    [[A9:%.*]] = extractelement <16 x float> %a, i32 9
-; CHECK-NEXT:    [[A10:%.*]] = extractelement <16 x float> %a, i32 10
-; CHECK-NEXT:    [[A11:%.*]] = extractelement <16 x float> %a, i32 11
-; CHECK-NEXT:    [[A12:%.*]] = extractelement <16 x float> %a, i32 12
-; CHECK-NEXT:    [[A13:%.*]] = extractelement <16 x float> %a, i32 13
-; CHECK-NEXT:    [[A14:%.*]] = extractelement <16 x float> %a, i32 14
-; CHECK-NEXT:    [[A15:%.*]] = extractelement <16 x float> %a, i32 15
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <16 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <16 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <16 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <16 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <16 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <16 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <16 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <16 x float> %b, i32 7
-; CHECK-NEXT:    [[B8:%.*]] = extractelement <16 x float> %b, i32 8
-; CHECK-NEXT:    [[B9:%.*]] = extractelement <16 x float> %b, i32 9
-; CHECK-NEXT:    [[B10:%.*]] = extractelement <16 x float> %b, i32 10
-; CHECK-NEXT:    [[B11:%.*]] = extractelement <16 x float> %b, i32 11
-; CHECK-NEXT:    [[B12:%.*]] = extractelement <16 x float> %b, i32 12
-; CHECK-NEXT:    [[B13:%.*]] = extractelement <16 x float> %b, i32 13
-; CHECK-NEXT:    [[B14:%.*]] = extractelement <16 x float> %b, i32 14
-; CHECK-NEXT:    [[B15:%.*]] = extractelement <16 x float> %b, i32 15
-; CHECK-NEXT:    [[C0:%.*]] = fmul float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fmul float [[A1]], [[B1]]
-; CHECK-NEXT:    [[C2:%.*]] = fmul float [[A2]], [[B2]]
-; CHECK-NEXT:    [[C3:%.*]] = fmul float [[A3]], [[B3]]
-; CHECK-NEXT:    [[C4:%.*]] = fmul float [[A4]], [[B4]]
-; CHECK-NEXT:    [[C5:%.*]] = fmul float [[A5]], [[B5]]
-; CHECK-NEXT:    [[C6:%.*]] = fmul float [[A6]], [[B6]]
-; CHECK-NEXT:    [[C7:%.*]] = fmul float [[A7]], [[B7]]
-; CHECK-NEXT:    [[C8:%.*]] = fmul float [[A8]], [[B8]]
-; CHECK-NEXT:    [[C9:%.*]] = fmul float [[A9]], [[B9]]
-; CHECK-NEXT:    [[C10:%.*]] = fmul float [[A10]], [[B10]]
-; CHECK-NEXT:    [[C11:%.*]] = fmul float [[A11]], [[B11]]
-; CHECK-NEXT:    [[C12:%.*]] = fmul float [[A12]], [[B12]]
-; CHECK-NEXT:    [[C13:%.*]] = fmul float [[A13]], [[B13]]
-; CHECK-NEXT:    [[C14:%.*]] = fmul float [[A14]], [[B14]]
-; CHECK-NEXT:    [[C15:%.*]] = fmul float [[A15]], [[B15]]
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[C0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[C1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[C2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[C3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[C4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[C5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[C6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[C7]], i32 7
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[C8]], i32 8
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[C9]], i32 9
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[C10]], i32 10
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[C11]], i32 11
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[C12]], i32 12
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[C13]], i32 13
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[C14]], i32 14
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[C15]], i32 15
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
 ; CHECK-NEXT:    ret <16 x float> [[R15]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
@@ -1403,106 +1114,39 @@
 
 define <16 x float> @buildvector_div_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_16f32(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <16 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <16 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <16 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <16 x float> %a, i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <16 x float> %a, i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <16 x float> %a, i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <16 x float> %a, i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <16 x float> %a, i32 7
-; CHECK-NEXT:    [[A8:%.*]] = extractelement <16 x float> %a, i32 8
-; CHECK-NEXT:    [[A9:%.*]] = extractelement <16 x float> %a, i32 9
-; CHECK-NEXT:    [[A10:%.*]] = extractelement <16 x float> %a, i32 10
-; CHECK-NEXT:    [[A11:%.*]] = extractelement <16 x float> %a, i32 11
-; CHECK-NEXT:    [[A12:%.*]] = extractelement <16 x float> %a, i32 12
-; CHECK-NEXT:    [[A13:%.*]] = extractelement <16 x float> %a, i32 13
-; CHECK-NEXT:    [[A14:%.*]] = extractelement <16 x float> %a, i32 14
-; CHECK-NEXT:    [[A15:%.*]] = extractelement <16 x float> %a, i32 15
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <16 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <16 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <16 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <16 x float> %b, i32 3
-; CHECK-NEXT:    [[B4:%.*]] = extractelement <16 x float> %b, i32 4
-; CHECK-NEXT:    [[B5:%.*]] = extractelement <16 x float> %b, i32 5
-; CHECK-NEXT:    [[B6:%.*]] = extractelement <16 x float> %b, i32 6
-; CHECK-NEXT:    [[B7:%.*]] = extractelement <16 x float> %b, i32 7
-; CHECK-NEXT:    [[B8:%.*]] = extractelement <16 x float> %b, i32 8
-; CHECK-NEXT:    [[B9:%.*]] = extractelement <16 x float> %b, i32 9
-; CHECK-NEXT:    [[B10:%.*]] = extractelement <16 x float> %b, i32 10
-; CHECK-NEXT:    [[B11:%.*]] = extractelement <16 x float> %b, i32 11
-; CHECK-NEXT:    [[B12:%.*]] = extractelement <16 x float> %b, i32 12
-; CHECK-NEXT:    [[B13:%.*]] = extractelement <16 x float> %b, i32 13
-; CHECK-NEXT:    [[B14:%.*]] = extractelement <16 x float> %b, i32 14
-; CHECK-NEXT:    [[B15:%.*]] = extractelement <16 x float> %b, i32 15
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> undef, float [[B0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[B1]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[B2]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[B3]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = fdiv <4 x float> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> undef, float [[A4]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[A5]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[A6]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[A7]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> undef, float [[B4]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[B5]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[B6]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[B7]], i32 3
-; CHECK-NEXT:    [[TMP18:%.*]] = fdiv <4 x float> [[TMP13]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> undef, float [[A8]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[A9]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[A10]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[A11]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> undef, float [[B8]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[B9]], i32 1
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[B10]], i32 2
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[B11]], i32 3
-; CHECK-NEXT:    [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> undef, float [[A12]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[A13]], i32 1
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[A14]], i32 2
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[A15]], i32 3
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> undef, float [[B12]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[B13]], i32 1
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[B14]], i32 2
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[B15]], i32 3
-; CHECK-NEXT:    [[TMP36:%.*]] = fdiv <4 x float> [[TMP31]], [[TMP35]]
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP37]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP38]], i32 1
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x float> [[TMP9]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP39]], i32 2
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x float> [[TMP9]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP40]], i32 3
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP41]], i32 4
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP42]], i32 5
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP43]], i32 6
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP44]], i32 7
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x float> [[TMP27]], i32 0
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP45]], i32 8
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x float> [[TMP27]], i32 1
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP46]], i32 9
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x float> [[TMP27]], i32 2
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP47]], i32 10
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP27]], i32 3
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP48]], i32 11
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x float> [[TMP36]], i32 0
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP49]], i32 12
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP36]], i32 1
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP50]], i32 13
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x float> [[TMP36]], i32 2
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP51]], i32 14
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <4 x float> [[TMP36]], i32 3
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP52]], i32 15
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
 ; CHECK-NEXT:    ret <16 x float> [[R15]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
diff --git a/test/Transforms/SLPVectorizer/X86/fptosi.ll b/test/Transforms/SLPVectorizer/X86/fptosi.ll
index a06a13e..9e95416 100644
--- a/test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ b/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -489,4 +489,54 @@
   ret void
 }
 
+;
+; FPTOSI BUILDVECTOR
+;
+
+define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 {
+; CHECK-LABEL: @fptosi_4xf64_4i32(
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double %a0 to i32
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double %a1 to i32
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double %a2 to i32
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double %a3 to i32
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %cvt0 = fptosi double %a0 to i32
+  %cvt1 = fptosi double %a1 to i32
+  %cvt2 = fptosi double %a2 to i32
+  %cvt3 = fptosi double %a3 to i32
+  %res0 = insertelement <4 x i32> undef, i32 %cvt0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %cvt1, i32 1
+  %res2 = insertelement <4 x i32> %res1, i32 %cvt2, i32 2
+  %res3 = insertelement <4 x i32> %res2, i32 %cvt3, i32 3
+  ret <4 x i32> %res3
+}
+
+define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 {
+; CHECK-LABEL: @fptosi_4xf32_4i32(
+; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float %a0 to i32
+; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float %a1 to i32
+; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float %a2 to i32
+; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float %a3 to i32
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %cvt0 = fptosi float %a0 to i32
+  %cvt1 = fptosi float %a1 to i32
+  %cvt2 = fptosi float %a2 to i32
+  %cvt3 = fptosi float %a3 to i32
+  %res0 = insertelement <4 x i32> undef, i32 %cvt0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %cvt1, i32 1
+  %res2 = insertelement <4 x i32> %res1, i32 %cvt2, i32 2
+  %res3 = insertelement <4 x i32> %res2, i32 %cvt3, i32 3
+  ret <4 x i32> %res3
+}
+
 attributes #0 = { nounwind }
diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 06587cd..9e4f503 100644
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -616,42 +616,38 @@
 define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; CHECK-LABEL: @multi_tree(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double %w, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double %x, i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double %y, i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double %z, i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 2.000000e+00, double 3.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
-; CHECK-NEXT:    [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
+; CHECK-NEXT:    [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
 ; CHECK-NEXT:    ret <4 x double> [[I4]]
 ;
 ; ZEROTHRESH-LABEL: @multi_tree(
 ; ZEROTHRESH-NEXT:  entry:
-; ZEROTHRESH-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double %w, i32 0
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double %x, i32 1
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 1.000000e+00>
-; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double %y, i32 0
-; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double %z, i32 1
-; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 2.000000e+00, double 3.000000e+00>
-; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP2]]
-; ZEROTHRESH-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; ZEROTHRESH-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3
-; ZEROTHRESH-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; ZEROTHRESH-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
-; ZEROTHRESH-NEXT:    [[TMP9:%.*]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP5]]
-; ZEROTHRESH-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
-; ZEROTHRESH-NEXT:    [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP10]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
-; ZEROTHRESH-NEXT:    [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP11]], i32 0
+; ZEROTHRESH-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
+; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
+; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
+; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
+; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
+; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
+; ZEROTHRESH-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
+; ZEROTHRESH-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
+; ZEROTHRESH-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
+; ZEROTHRESH-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
+; ZEROTHRESH-NEXT:    [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
+; ZEROTHRESH-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
+; ZEROTHRESH-NEXT:    [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
 ; ZEROTHRESH-NEXT:    ret <4 x double> [[I4]]
 ;
 entry:
@@ -673,92 +669,44 @@
 define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: @_vadd256(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <8 x float> %a, i32 0
-; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <8 x float> %b, i32 0
-; CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <8 x float> %a, i32 1
-; CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <8 x float> %b, i32 1
-; CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <8 x float> %a, i32 2
-; CHECK-NEXT:    [[VECEXT6:%.*]] = extractelement <8 x float> %b, i32 2
-; CHECK-NEXT:    [[VECEXT8:%.*]] = extractelement <8 x float> %a, i32 3
-; CHECK-NEXT:    [[VECEXT9:%.*]] = extractelement <8 x float> %b, i32 3
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> undef, float [[VECEXT]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[VECEXT2]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[VECEXT5]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[VECEXT8]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> undef, float [[VECEXT1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[VECEXT3]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[VECEXT6]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[VECEXT9]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]]
-; CHECK-NEXT:    [[VECEXT11:%.*]] = extractelement <8 x float> %a, i32 4
-; CHECK-NEXT:    [[VECEXT12:%.*]] = extractelement <8 x float> %b, i32 4
-; CHECK-NEXT:    [[VECEXT14:%.*]] = extractelement <8 x float> %a, i32 5
-; CHECK-NEXT:    [[VECEXT15:%.*]] = extractelement <8 x float> %b, i32 5
-; CHECK-NEXT:    [[VECEXT17:%.*]] = extractelement <8 x float> %a, i32 6
-; CHECK-NEXT:    [[VECEXT18:%.*]] = extractelement <8 x float> %b, i32 6
-; CHECK-NEXT:    [[VECEXT20:%.*]] = extractelement <8 x float> %a, i32 7
-; CHECK-NEXT:    [[VECEXT21:%.*]] = extractelement <8 x float> %b, i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> undef, float [[VECEXT11]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[VECEXT14]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[VECEXT17]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[VECEXT20]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> undef, float [[VECEXT12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[VECEXT15]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[VECEXT18]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[VECEXT21]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd <4 x float> [[TMP12]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP8]], i32 0
-; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP8]], i32 1
-; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP19]], i32 1
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
-; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP20]], i32 2
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
-; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP17]], i32 0
-; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP22]], i32 4
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP17]], i32 1
-; CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP23]], i32 5
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP17]], i32 2
-; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP24]], i32 6
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x float> [[TMP17]], i32 3
-; CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP25]], i32 7
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
+; CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
+; CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
 ; CHECK-NEXT:    ret <8 x float> [[VECINIT7_I]]
 ;
 ; ZEROTHRESH-LABEL: @_vadd256(
 ; ZEROTHRESH-NEXT:  entry:
-; ZEROTHRESH-NEXT:    [[VECEXT:%.*]] = extractelement <8 x float> %a, i32 0
-; ZEROTHRESH-NEXT:    [[VECEXT1:%.*]] = extractelement <8 x float> %b, i32 0
-; ZEROTHRESH-NEXT:    [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
-; ZEROTHRESH-NEXT:    [[VECEXT2:%.*]] = extractelement <8 x float> %a, i32 1
-; ZEROTHRESH-NEXT:    [[VECEXT3:%.*]] = extractelement <8 x float> %b, i32 1
-; ZEROTHRESH-NEXT:    [[ADD4:%.*]] = fadd float [[VECEXT2]], [[VECEXT3]]
-; ZEROTHRESH-NEXT:    [[VECEXT5:%.*]] = extractelement <8 x float> %a, i32 2
-; ZEROTHRESH-NEXT:    [[VECEXT6:%.*]] = extractelement <8 x float> %b, i32 2
-; ZEROTHRESH-NEXT:    [[ADD7:%.*]] = fadd float [[VECEXT5]], [[VECEXT6]]
-; ZEROTHRESH-NEXT:    [[VECEXT8:%.*]] = extractelement <8 x float> %a, i32 3
-; ZEROTHRESH-NEXT:    [[VECEXT9:%.*]] = extractelement <8 x float> %b, i32 3
-; ZEROTHRESH-NEXT:    [[ADD10:%.*]] = fadd float [[VECEXT8]], [[VECEXT9]]
-; ZEROTHRESH-NEXT:    [[VECEXT11:%.*]] = extractelement <8 x float> %a, i32 4
-; ZEROTHRESH-NEXT:    [[VECEXT12:%.*]] = extractelement <8 x float> %b, i32 4
-; ZEROTHRESH-NEXT:    [[ADD13:%.*]] = fadd float [[VECEXT11]], [[VECEXT12]]
-; ZEROTHRESH-NEXT:    [[VECEXT14:%.*]] = extractelement <8 x float> %a, i32 5
-; ZEROTHRESH-NEXT:    [[VECEXT15:%.*]] = extractelement <8 x float> %b, i32 5
-; ZEROTHRESH-NEXT:    [[ADD16:%.*]] = fadd float [[VECEXT14]], [[VECEXT15]]
-; ZEROTHRESH-NEXT:    [[VECEXT17:%.*]] = extractelement <8 x float> %a, i32 6
-; ZEROTHRESH-NEXT:    [[VECEXT18:%.*]] = extractelement <8 x float> %b, i32 6
-; ZEROTHRESH-NEXT:    [[ADD19:%.*]] = fadd float [[VECEXT17]], [[VECEXT18]]
-; ZEROTHRESH-NEXT:    [[VECEXT20:%.*]] = extractelement <8 x float> %a, i32 7
-; ZEROTHRESH-NEXT:    [[VECEXT21:%.*]] = extractelement <8 x float> %b, i32 7
-; ZEROTHRESH-NEXT:    [[ADD22:%.*]] = fadd float [[VECEXT20]], [[VECEXT21]]
-; ZEROTHRESH-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[ADD]], i32 0
-; ZEROTHRESH-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[ADD4]], i32 1
-; ZEROTHRESH-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[ADD7]], i32 2
-; ZEROTHRESH-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[ADD10]], i32 3
-; ZEROTHRESH-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[ADD13]], i32 4
-; ZEROTHRESH-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[ADD16]], i32 5
-; ZEROTHRESH-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[ADD19]], i32 6
-; ZEROTHRESH-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[ADD22]], i32 7
+; ZEROTHRESH-NEXT:    [[TMP0:%.*]] = fadd <8 x float> %a, %b
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
+; ZEROTHRESH-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
+; ZEROTHRESH-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
+; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
+; ZEROTHRESH-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
+; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
+; ZEROTHRESH-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
+; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
+; ZEROTHRESH-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
+; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
+; ZEROTHRESH-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
+; ZEROTHRESH-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
+; ZEROTHRESH-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
+; ZEROTHRESH-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
+; ZEROTHRESH-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
 ; ZEROTHRESH-NEXT:    ret <8 x float> [[VECINIT7_I]]
 ;
   entry:
diff --git a/test/Transforms/SLPVectorizer/X86/sitofp.ll b/test/Transforms/SLPVectorizer/X86/sitofp.ll
index 6e91a21..3b0d338 100644
--- a/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1175,4 +1175,54 @@
   ret void
 }
 
+;
+; SITOFP BUILDVECTOR
+;
+
+define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
+; CHECK-LABEL: @sitofp_4xi32_4f64(
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 %a0 to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 %a1 to double
+; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 %a2 to double
+; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 %a3 to double
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x double> [[RES3]]
+;
+  %cvt0 = sitofp i32 %a0 to double
+  %cvt1 = sitofp i32 %a1 to double
+  %cvt2 = sitofp i32 %a2 to double
+  %cvt3 = sitofp i32 %a3 to double
+  %res0 = insertelement <4 x double> undef, double %cvt0, i32 0
+  %res1 = insertelement <4 x double> %res0, double %cvt1, i32 1
+  %res2 = insertelement <4 x double> %res1, double %cvt2, i32 2
+  %res3 = insertelement <4 x double> %res2, double %cvt3, i32 3
+  ret <4 x double> %res3
+}
+
+define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
+; CHECK-LABEL: @sitofp_4xi32_4f32(
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 %a0 to float
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 %a1 to float
+; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 %a2 to float
+; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 %a3 to float
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
+; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
+; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[RES3]]
+;
+  %cvt0 = sitofp i32 %a0 to float
+  %cvt1 = sitofp i32 %a1 to float
+  %cvt2 = sitofp i32 %a2 to float
+  %cvt3 = sitofp i32 %a3 to float
+  %res0 = insertelement <4 x float> undef, float %cvt0, i32 0
+  %res1 = insertelement <4 x float> %res0, float %cvt1, i32 1
+  %res2 = insertelement <4 x float> %res1, float %cvt2, i32 2
+  %res3 = insertelement <4 x float> %res2, float %cvt3, i32 3
+  ret <4 x float> %res3
+}
+
 attributes #0 = { nounwind }
diff --git a/test/Transforms/SROA/dbg-single-piece.ll b/test/Transforms/SROA/dbg-single-piece.ll
index 319b7c1..b8301c1 100644
--- a/test/Transforms/SROA/dbg-single-piece.ll
+++ b/test/Transforms/SROA/dbg-single-piece.ll
@@ -13,7 +13,7 @@
 ; CHECK-NOT: call void @llvm.dbg.value
 ; CHECK: call void @llvm.dbg.value(metadata %foo* undef, i64 0, {{.*}}, metadata ![[BIT_PIECE:[0-9]+]]), !dbg
 ; CHECK-NOT: call void @llvm.dbg.value
-; CHECK: ![[BIT_PIECE]] = !DIExpression(DW_OP_bit_piece, 64, 64)
+; CHECK: ![[BIT_PIECE]] = !DIExpression(DW_OP_LLVM_fragment, 64, 64)
   %0 = bitcast %foo* %retval to i8*
   %1 = getelementptr inbounds i8, i8* %0, i64 8
   %2 = bitcast i8* %1 to %foo**
diff --git a/test/Transforms/Util/split-bit-piece.ll b/test/Transforms/Util/split-bit-piece.ll
index 9343214..3d7bcac 100644
--- a/test/Transforms/Util/split-bit-piece.ll
+++ b/test/Transforms/Util/split-bit-piece.ll
@@ -14,7 +14,7 @@
   %v2 = alloca i64, align 8
   store i32 %hs, i32* %hs.addr, align 4
 ; CHECK: call void @llvm.dbg.value(metadata i32 %hs, i64 0, metadata !{{[0-9]+}}, metadata ![[EXPR:[0-9]+]])
-; CHECK: ![[EXPR]] = !DIExpression(DW_OP_bit_piece, 0
+; CHECK: ![[EXPR]] = !DIExpression(DW_OP_LLVM_fragment, 0
   call void @llvm.dbg.declare(metadata i64* %v1, metadata !9, metadata !12), !dbg !13
   %0 = load i32, i32* %hs.addr, align 4
   %conv = sext i32 %0 to i64
diff --git a/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll b/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll
index 97445ef..4885be7 100644
--- a/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll
+++ b/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll
@@ -3,14 +3,15 @@
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt = global [2 x i8*] [i8* zeroinitializer, i8* bitcast (void (i8*)* @vf to i8*)], !type !0
+@vt1 = global [2 x i8*] [i8* zeroinitializer, i8* bitcast (void (i8*)* @vf to i8*)], !type !0
+@vt2 = global i8* bitcast (void (i8*)* @vf to i8*), !type !1
 
 define void @vf(i8* %this) {
   ret void
 }
 
-; CHECK: define void @unaligned
-define void @unaligned(i8* %obj) {
+; CHECK: define void @unaligned1
+define void @unaligned1(i8* %obj) {
   %vtableptr = bitcast i8* %obj to [1 x i8*]**
   %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
   %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
@@ -25,6 +26,22 @@
   ret void
 }
 
+; CHECK: define void @unaligned2
+define void @unaligned2(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid2")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr i8, i8* %vtablei8, i32 1
+  %fptrptr_casted = bitcast i8* %fptrptr to i8**
+  %fptr = load i8*, i8** %fptrptr_casted
+  %fptr_casted = bitcast i8* %fptr to void (i8*)*
+  ; CHECK: call void %
+  call void %fptr_casted(i8* %obj)
+  ret void
+}
+
 ; CHECK: define void @outofbounds
 define void @outofbounds(i8* %obj) {
   %vtableptr = bitcast i8* %obj to [1 x i8*]**
@@ -61,3 +78,4 @@
 declare void @llvm.assume(i1)
 
 !0 = !{i32 0, !"typeid"}
+!1 = !{i32 0, !"typeid2"}
diff --git a/test/Transforms/WholeProgramDevirt/non-array-vtable.ll b/test/Transforms/WholeProgramDevirt/pointer-vtable.ll
similarity index 96%
rename from test/Transforms/WholeProgramDevirt/non-array-vtable.ll
rename to test/Transforms/WholeProgramDevirt/pointer-vtable.ll
index e9c2db7..5e76a5a 100644
--- a/test/Transforms/WholeProgramDevirt/non-array-vtable.ll
+++ b/test/Transforms/WholeProgramDevirt/pointer-vtable.ll
@@ -19,7 +19,7 @@
   %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to void (i8*)*
-  ; CHECK: call void %
+  ; CHECK: call void @vf(
   call void %fptr_casted(i8* %obj)
   ret void
 }
diff --git a/test/Transforms/WholeProgramDevirt/soa-vtable.ll b/test/Transforms/WholeProgramDevirt/soa-vtable.ll
new file mode 100644
index 0000000..3b6afc5
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/soa-vtable.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%vtTy = type { [2 x void (i8*)*], [2 x void (i8*)*] }
+
+@vt = constant %vtTy { [2 x void (i8*)*] [void (i8*)* null, void (i8*)* @vf1], [2 x void (i8*)*] [void (i8*)* null, void (i8*)* @vf2] }, !type !0, !type !1
+
+define void @vf1(i8* %this) {
+  ret void
+}
+
+define void @vf2(i8* %this) {
+  ret void
+}
+
+; CHECK: define void @call1
+define void @call1(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid1")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to void (i8*)*
+  ; CHECK: call void @vf1(
+  call void %fptr_casted(i8* %obj)
+  ret void
+}
+
+; CHECK: define void @call2
+define void @call2(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid2")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to void (i8*)*
+  ; CHECK: call void @vf2(
+  call void %fptr_casted(i8* %obj)
+  ret void
+}
+
+declare i1 @llvm.type.test(i8*, metadata)
+declare void @llvm.assume(i1)
+
+!0 = !{i32 8, !"typeid1"}
+!1 = !{i32 24, !"typeid2"}
diff --git a/test/Transforms/WholeProgramDevirt/struct-vtable.ll b/test/Transforms/WholeProgramDevirt/struct-vtable.ll
new file mode 100644
index 0000000..81e41d4
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/struct-vtable.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%vtTy = type { void (i8*)* }
+
+@vt = constant %vtTy { void (i8*)* @vf }, !type !0
+
+define void @vf(i8* %this) {
+  ret void
+}
+
+; CHECK: define void @call
+define void @call(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to void (i8*)*
+  ; CHECK: call void @vf(
+  call void %fptr_casted(i8* %obj)
+  ret void
+}
+
+; CHECK: define void @call_oob
+define void @call_oob(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 4
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to void (i8*)*
+  ; CHECK: call void %
+  call void %fptr_casted(i8* %obj)
+  ret void
+}
+
+; CHECK: define void @call_unaligned
+define void @call_unaligned(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr i8, i8* %vtablei8, i32 1
+  %fptrptr_casted = bitcast i8* %fptrptr to i8**
+  %fptr = load i8*, i8** %fptrptr_casted
+  %fptr_casted = bitcast i8* %fptr to void (i8*)*
+  ; CHECK: call void %
+  call void %fptr_casted(i8* %obj)
+  ret void
+}
+
+declare i1 @llvm.type.test(i8*, metadata)
+declare void @llvm.assume(i1)
+
+!0 = !{i32 0, !"typeid"}
diff --git a/test/tools/llvm-readobj/program-headers.test b/test/tools/llvm-readobj/program-headers.test
index 9bcd133..e507442 100644
--- a/test/tools/llvm-readobj/program-headers.test
+++ b/test/tools/llvm-readobj/program-headers.test
@@ -13,7 +13,10 @@
 ##
 ## test.s is an empty file.
 ## linker.script:
-## PHDRS { text PT_LOAD FILEHDR PHDRS; foo 0x65a3dbe6; bar 0x65a3dbe7; }
+## PHDRS { text PT_LOAD FILEHDR PHDRS; foo 0x65a3dbe6; bar 0x65a3dbe7; zed 0x65a41be6; }
+##   Where 0x65a3dbe6 is the value of PT_OPENBSD_RANDOMIZE,
+##         0x65a3dbe7 is the value of PT_OPENBSD_WXNEEDED,
+##         0x65a41be6 is the value of PT_OPENBSD_BOOTDATA.
 ## SECTIONS { . = SIZEOF_HEADERS; .all : { *(.*) } : text }
 RUN: llvm-readobj -program-headers %p/../../Object/Inputs/openbsd-phdrs.elf-x86-64 \
 RUN:     | FileCheck %s -check-prefix OPENBSD-X86-64
@@ -175,3 +178,14 @@
 OPENBSD-X86-64-NEXT:     ]
 OPENBSD-X86-64-NEXT:     Alignment:
 OPENBSD-X86-64-NEXT:   }
+OPENBSD-X86-64-NEXT:   ProgramHeader {
+OPENBSD-X86-64-NEXT:     Type:  PT_OPENBSD_BOOTDATA
+OPENBSD-X86-64-NEXT:     Offset:
+OPENBSD-X86-64-NEXT:     VirtualAddress:
+OPENBSD-X86-64-NEXT:     PhysicalAddress:
+OPENBSD-X86-64-NEXT:     FileSize:
+OPENBSD-X86-64-NEXT:     MemSize:
+OPENBSD-X86-64-NEXT:     Flags [
+OPENBSD-X86-64-NEXT:     ]
+OPENBSD-X86-64-NEXT:     Alignment:
+OPENBSD-X86-64-NEXT:   }
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 1a6ff63..aa0beb4 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -264,6 +264,7 @@
   initializeLowerIntrinsicsPass(*Registry);
   initializeCountingFunctionInserterPass(*Registry);
   initializeUnreachableBlockElimLegacyPassPass(*Registry);
+  initializeConstantHoistingLegacyPassPass(*Registry);
 
   // Register the target printer for --version.
   cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt
index 1ecbfff..f02e193 100644
--- a/tools/lli/CMakeLists.txt
+++ b/tools/lli/CMakeLists.txt
@@ -7,7 +7,6 @@
   Core
   ExecutionEngine
   IRReader
-  Instrumentation
   Interpreter
   MC
   MCJIT
diff --git a/tools/llvm-lto2/llvm-lto2.cpp b/tools/llvm-lto2/llvm-lto2.cpp
index 01756d2..e8bf7a3 100644
--- a/tools/llvm-lto2/llvm-lto2.cpp
+++ b/tools/llvm-lto2/llvm-lto2.cpp
@@ -17,6 +17,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/LTO/Caching.h"
+#include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/LTO/LTO.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetSelect.h"
@@ -31,6 +32,11 @@
                            "(default = '-O2')"),
              cl::Prefix, cl::ZeroOrMore, cl::init('2'));
 
+static cl::opt<char> CGOptLevel(
+    "cg-opt-level",
+    cl::desc("Codegen optimization level (0, 1, 2 or 3, default = '2')"),
+    cl::init('2'));
+
 static cl::list<std::string> InputFilenames(cl::Positional, cl::OneOrMore,
                                             cl::desc("<input bitcode files>"));
 
@@ -74,6 +80,15 @@
              "A resolution for each symbol must be specified."),
     cl::ZeroOrMore);
 
+static cl::opt<std::string> OverrideTriple(
+    "override-triple",
+    cl::desc("Replace target triples in input files with this triple"));
+
+static cl::opt<std::string> DefaultTriple(
+    "default-triple",
+    cl::desc(
+        "Replace unspecified target triples in input files with this triple"));
+
 static void check(Error E, std::string Msg) {
   if (!E)
     return;
@@ -146,6 +161,13 @@
     exit(1);
   };
 
+  Conf.CPU = MCPU;
+  Conf.Options = InitTargetOptionsFromCodeGenFlags();
+  Conf.MAttrs = MAttrs;
+  if (auto RM = getRelocModel())
+    Conf.RelocModel = *RM;
+  Conf.CodeModel = CMModel;
+
   if (SaveTemps)
     check(Conf.addSaveTemps(OutputFilename + "."),
           "Config::addSaveTemps failed");
@@ -155,6 +177,26 @@
   Conf.AAPipeline = AAPipeline;
 
   Conf.OptLevel = OptLevel - '0';
+  switch (CGOptLevel) {
+  case '0':
+    Conf.CGOptLevel = CodeGenOpt::None;
+    break;
+  case '1':
+    Conf.CGOptLevel = CodeGenOpt::Less;
+    break;
+  case '2':
+    Conf.CGOptLevel = CodeGenOpt::Default;
+    break;
+  case '3':
+    Conf.CGOptLevel = CodeGenOpt::Aggressive;
+    break;
+  default:
+    llvm::errs() << "invalid cg optimization level: " << CGOptLevel << '\n';
+    return 1;
+  }
+
+  Conf.OverrideTriple = OverrideTriple;
+  Conf.DefaultTriple = DefaultTriple;
 
   ThinBackend Backend;
   if (ThinLTODistributedIndexes)
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 8a904a8..497fb19 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -394,9 +394,22 @@
 }
 
 static int fillCommandLineSymbols(MCAsmParser &Parser) {
-  for (auto &I: DefineSymbol)
-    if (Parser.getContext().setSymbolValue(Parser.getStreamer(), I))
+  for (auto &I: DefineSymbol) {
+    auto Pair = StringRef(I).split('=');
+    auto Sym = Pair.first;
+    auto Val = Pair.second;
+
+    if (Sym.empty() || Val.empty()) {
+      errs() << "error: defsym must be of the form: sym=value: " << I << "\n";
       return 1;
+    }
+    int64_t Value;
+    if (Val.getAsInteger(0, Value)) {
+      errs() << "error: Value is not an integer: " << Val << "\n";
+      return 1;
+    }
+    Parser.getContext().setSymbolValue(Parser.getStreamer(), Sym, Value);
+  }
   return 0;
 }
 
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.cpp b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
index 2287e0d..98c67ec9 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.cpp
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
@@ -350,11 +350,15 @@
 Error LLVMOutputStyle::dumpGlobalsStream() {
   if (!opts::raw::DumpGlobals)
     return Error::success();
+  if (!File.hasPDBGlobalsStream()) {
+    P.printString("Globals Stream not present");
+    return Error::success();
+  }
 
-  DictScope D(P, "Globals Stream");
   auto Globals = File.getPDBGlobalsStream();
   if (!Globals)
     return Globals.takeError();
+  DictScope D(P, "Globals Stream");
 
   auto Dbi = File.getPDBDbiStream();
   if (!Dbi)
@@ -447,6 +451,10 @@
 Error LLVMOutputStyle::dumpInfoStream() {
   if (!opts::raw::DumpHeaders)
     return Error::success();
+  if (!File.hasPDBInfoStream()) {
+    P.printString("PDB Stream not present");
+    return Error::success();
+  }
   auto IS = File.getPDBInfoStream();
   if (!IS)
     return IS.takeError();
@@ -485,11 +493,19 @@
   StringRef Label;
   StringRef VerLabel;
   if (StreamIdx == StreamTPI) {
+    if (!File.hasPDBTpiStream()) {
+      P.printString("Type Info Stream (TPI) not present");
+      return Error::success();
+    }
     DumpRecordBytes = opts::raw::DumpTpiRecordBytes;
     DumpRecords = opts::raw::DumpTpiRecords;
     Label = "Type Info Stream (TPI)";
     VerLabel = "TPI Version";
   } else if (StreamIdx == StreamIPI) {
+    if (!File.hasPDBIpiStream()) {
+      P.printString("Type Info Stream (IPI) not present");
+      return Error::success();
+    }
     DumpRecordBytes = opts::raw::DumpIpiRecordBytes;
     DumpRecords = opts::raw::DumpIpiRecords;
     Label = "Type Info Stream (IPI)";
@@ -556,6 +572,10 @@
                      opts::raw::DumpModuleFiles || opts::raw::DumpLineInfo;
   if (!opts::raw::DumpHeaders && !DumpModules)
     return Error::success();
+  if (!File.hasPDBDbiStream()) {
+    P.printString("DBI Stream not present");
+    return Error::success();
+  }
 
   auto DS = File.getPDBDbiStream();
   if (!DS)
@@ -742,6 +762,10 @@
 Error LLVMOutputStyle::dumpSectionContribs() {
   if (!opts::raw::DumpSectionContribs)
     return Error::success();
+  if (!File.hasPDBDbiStream()) {
+    P.printString("DBI Stream not present");
+    return Error::success();
+  }
 
   auto Dbi = File.getPDBDbiStream();
   if (!Dbi)
@@ -789,6 +813,10 @@
 Error LLVMOutputStyle::dumpSectionMap() {
   if (!opts::raw::DumpSectionMap)
     return Error::success();
+  if (!File.hasPDBDbiStream()) {
+    P.printString("DBI Stream not present");
+    return Error::success();
+  }
 
   auto Dbi = File.getPDBDbiStream();
   if (!Dbi)
@@ -813,11 +841,15 @@
 Error LLVMOutputStyle::dumpPublicsStream() {
   if (!opts::raw::DumpPublics)
     return Error::success();
+  if (!File.hasPDBPublicsStream()) {
+    P.printString("Publics Stream not present");
+    return Error::success();
+  }
 
-  DictScope D(P, "Publics Stream");
   auto Publics = File.getPDBPublicsStream();
   if (!Publics)
     return Publics.takeError();
+  DictScope D(P, "Publics Stream");
 
   auto Dbi = File.getPDBDbiStream();
   if (!Dbi)
@@ -856,6 +888,10 @@
 Error LLVMOutputStyle::dumpSectionHeaders() {
   if (!opts::raw::DumpSectionHeaders)
     return Error::success();
+  if (!File.hasPDBDbiStream()) {
+    P.printString("DBI Stream not present");
+    return Error::success();
+  }
 
   auto Dbi = File.getPDBDbiStream();
   if (!Dbi)
@@ -885,6 +921,10 @@
 Error LLVMOutputStyle::dumpFpoStream() {
   if (!opts::raw::DumpFpo)
     return Error::success();
+  if (!File.hasPDBDbiStream()) {
+    P.printString("DBI Stream not present");
+    return Error::success();
+  }
 
   auto Dbi = File.getPDBDbiStream();
   if (!Dbi)
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 5d7f922..dbeb03d 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -1160,6 +1160,7 @@
 
   LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_RANDOMIZE);
   LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_WXNEEDED);
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_BOOTDATA);
 
   default: return "";
   }
diff --git a/tools/obj2yaml/CMakeLists.txt b/tools/obj2yaml/CMakeLists.txt
index 9b89552..0fab6d1 100644
--- a/tools/obj2yaml/CMakeLists.txt
+++ b/tools/obj2yaml/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
   Object
   ObjectYAML
   Support
@@ -7,6 +8,7 @@
 add_llvm_tool(obj2yaml
   obj2yaml.cpp
   coff2yaml.cpp
+  dwarf2yaml.cpp
   elf2yaml.cpp
   macho2yaml.cpp
   Error.cpp
diff --git a/tools/obj2yaml/dwarf2yaml.cpp b/tools/obj2yaml/dwarf2yaml.cpp
new file mode 100644
index 0000000..ca55702
--- /dev/null
+++ b/tools/obj2yaml/dwarf2yaml.cpp
@@ -0,0 +1,77 @@
+//===------ dwarf2yaml.cpp - obj2yaml conversion tool -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Error.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
+
+using namespace llvm;
+
+void dumpDebugAbbrev(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
+  auto AbbrevSetPtr = DCtx.getDebugAbbrev();
+  if (AbbrevSetPtr) {
+    for (auto AbbrvDeclSet : *AbbrevSetPtr) {
+      for (auto AbbrvDecl : AbbrvDeclSet.second) {
+        DWARFYAML::Abbrev Abbrv;
+        Abbrv.Code = AbbrvDecl.getCode();
+        Abbrv.Tag = AbbrvDecl.getTag();
+        Abbrv.Children = AbbrvDecl.hasChildren() ? dwarf::DW_CHILDREN_yes
+                                                 : dwarf::DW_CHILDREN_no;
+        for (auto Attribute : AbbrvDecl.attributes()) {
+          DWARFYAML::AttributeAbbrev AttAbrv;
+          AttAbrv.Attribute = Attribute.Attr;
+          AttAbrv.Form = Attribute.Form;
+          Abbrv.Attributes.push_back(AttAbrv);
+        }
+        Y.AbbrevDecls.push_back(Abbrv);
+      }
+    }
+  }
+}
+
+void dumpDebugStrings(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
+  StringRef RemainingTable = DCtx.getStringSection();
+  while (RemainingTable.size() > 0) {
+    auto SymbolPair = RemainingTable.split('\0');
+    RemainingTable = SymbolPair.second;
+    Y.DebugStrings.push_back(SymbolPair.first);
+  }
+}
+
+void dumpDebugARanges(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
+  DataExtractor ArangesData(DCtx.getARangeSection(), DCtx.isLittleEndian(), 0);
+  uint32_t Offset = 0;
+  DWARFDebugArangeSet Set;
+
+  while (Set.extract(ArangesData, &Offset)) {
+    DWARFYAML::ARange Range;
+    Range.Length = Set.getHeader().Length;
+    Range.Version = Set.getHeader().Version;
+    Range.CuOffset = Set.getHeader().CuOffset;
+    Range.AddrSize = Set.getHeader().AddrSize;
+    Range.SegSize = Set.getHeader().SegSize;
+    for (auto Descriptor : Set.descriptors()) {
+      DWARFYAML::ARangeDescriptor Desc;
+      Desc.Address = Descriptor.Address;
+      Desc.Length = Descriptor.Length;
+      Range.Descriptors.push_back(Desc);
+    }
+    Y.ARanges.push_back(Range);
+  }
+}
+
+std::error_code dwarf2yaml(DWARFContextInMemory &DCtx,
+                           DWARFYAML::Data &Y) {
+  dumpDebugAbbrev(DCtx, Y);
+  dumpDebugStrings(DCtx, Y);
+  dumpDebugARanges(DCtx, Y);
+
+  return obj2yaml_error::success;
+}
diff --git a/tools/obj2yaml/macho2yaml.cpp b/tools/obj2yaml/macho2yaml.cpp
index 3d32aa1..9cd0546 100644
--- a/tools/obj2yaml/macho2yaml.cpp
+++ b/tools/obj2yaml/macho2yaml.cpp
@@ -9,6 +9,7 @@
 
 #include "Error.h"
 #include "obj2yaml.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/ObjectYAML/ObjectYAML.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -34,6 +35,10 @@
                        ArrayRef<uint8_t> OpcodeBuffer, bool Lazy = false);
   void dumpExportTrie(std::unique_ptr<MachOYAML::Object> &Y);
   void dumpSymbols(std::unique_ptr<MachOYAML::Object> &Y);
+  void dumpDebugAbbrev(DWARFContextInMemory &DCtx,
+                        std::unique_ptr<MachOYAML::Object> &Y);
+  void dumpDebugStrings(DWARFContextInMemory &DCtx,
+                        std::unique_ptr<MachOYAML::Object> &Y);
 
 public:
   MachODumper(const object::MachOObjectFile &O) : Obj(O) {}
@@ -163,6 +168,10 @@
   dumpHeader(Y);
   dumpLoadCommands(Y);
   dumpLinkEdit(Y);
+
+  DWARFContextInMemory DICtx(Obj);
+  if (auto Err = dwarf2yaml(DICtx, Y->DWARF))
+    return errorCodeToError(Err);
   return std::move(Y);
 }
 
diff --git a/tools/obj2yaml/obj2yaml.h b/tools/obj2yaml/obj2yaml.h
index 28c7475..70d4ebd 100644
--- a/tools/obj2yaml/obj2yaml.h
+++ b/tools/obj2yaml/obj2yaml.h
@@ -24,4 +24,15 @@
 std::error_code macho2yaml(llvm::raw_ostream &Out,
                            const llvm::object::Binary &Obj);
 
+// Forward decls for dwarf2yaml
+namespace llvm {
+class DWARFContextInMemory;
+namespace DWARFYAML {
+struct Data;
+}
+}
+
+std::error_code dwarf2yaml(llvm::DWARFContextInMemory &DCtx,
+                           llvm::DWARFYAML::Data &Y);
+
 #endif
diff --git a/tools/yaml2obj/CMakeLists.txt b/tools/yaml2obj/CMakeLists.txt
index 885a69f..5e72649 100644
--- a/tools/yaml2obj/CMakeLists.txt
+++ b/tools/yaml2obj/CMakeLists.txt
@@ -8,6 +8,7 @@
 add_llvm_tool(yaml2obj
   yaml2obj.cpp
   yaml2coff.cpp
+  yaml2dwarf.cpp
   yaml2elf.cpp
   yaml2macho.cpp
   )
diff --git a/tools/yaml2obj/yaml2dwarf.cpp b/tools/yaml2obj/yaml2dwarf.cpp
new file mode 100644
index 0000000..fcc5833
--- /dev/null
+++ b/tools/yaml2obj/yaml2dwarf.cpp
@@ -0,0 +1,68 @@
+//===- yaml2dwarf - Convert YAML to DWARF binary data ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief The DWARF component of yaml2obj.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/DWARFYAML.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void ZeroFillBytes(raw_ostream &OS, size_t Size) {
+  std::vector<uint8_t> FillData;
+  FillData.insert(FillData.begin(), Size, 0);
+  OS.write(reinterpret_cast<char *>(FillData.data()), Size);
+}
+
+void yaml2debug_str(raw_ostream &OS, const DWARFYAML::Data &DI) {
+  for (auto Str : DI.DebugStrings) {
+    OS.write(Str.data(), Str.size());
+    OS.write('\0');
+  }
+}
+
+void yaml2debug_abbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
+  for (auto AbbrevDecl : DI.AbbrevDecls) {
+    encodeULEB128(AbbrevDecl.Code, OS);
+    encodeULEB128(AbbrevDecl.Tag, OS);
+    OS.write(AbbrevDecl.Children);
+    for (auto Attr : AbbrevDecl.Attributes) {
+      encodeULEB128(Attr.Attribute, OS);
+      encodeULEB128(Attr.Form, OS);
+    }
+    encodeULEB128(0, OS);
+    encodeULEB128(0, OS);
+  }
+}
+
+void yaml2debug_aranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
+  for (auto Range : DI.ARanges) {
+    auto HeaderStart = OS.tell();
+    OS.write(reinterpret_cast<char *>(&Range.Length), 4);
+    OS.write(reinterpret_cast<char *>(&Range.Version), 2);
+    OS.write(reinterpret_cast<char *>(&Range.CuOffset), 4);
+    OS.write(reinterpret_cast<char *>(&Range.AddrSize), 1);
+    OS.write(reinterpret_cast<char *>(&Range.SegSize), 1);
+
+    auto HeaderSize = OS.tell() - HeaderStart;
+    auto FirstDescriptor = alignTo(HeaderSize, Range.AddrSize * 2);
+    ZeroFillBytes(OS, FirstDescriptor - HeaderSize);
+
+    for (auto Descriptor : Range.Descriptors) {
+      OS.write(reinterpret_cast<char *>(&Descriptor.Address), Range.AddrSize);
+      OS.write(reinterpret_cast<char *>(&Descriptor.Length), Range.AddrSize);
+    }
+    ZeroFillBytes(OS, Range.AddrSize * 2);
+  }
+}
diff --git a/tools/yaml2obj/yaml2macho.cpp b/tools/yaml2obj/yaml2macho.cpp
index fb29e20..76dec4b 100644
--- a/tools/yaml2obj/yaml2macho.cpp
+++ b/tools/yaml2obj/yaml2macho.cpp
@@ -41,6 +41,8 @@
   Error writeLoadCommands(raw_ostream &OS);
   Error writeSectionData(raw_ostream &OS);
   Error writeLinkEditData(raw_ostream &OS);
+  Error writeDWARFData(raw_ostream &OS,
+                       std::vector<MachOYAML::Section> &Sections);
   void writeBindOpcodes(raw_ostream &OS,
                         std::vector<MachOYAML::BindOpcode> &BindOpcodes);
   // LinkEdit writers
@@ -240,6 +242,9 @@
       if (0 == strncmp(&segname[0], "__LINKEDIT", 16)) {
         if (auto Err = writeLinkEditData(OS))
           return Err;
+      } else if (0 == strncmp(&segname[0], "__DWARF", 16)) {
+        if (auto Err = writeDWARFData(OS, LC.Sections))
+          return Err;
       } else {
         // Zero Fill any data between the end of the last thing we wrote and the
         // start of this section.
@@ -252,7 +257,8 @@
           // the
           // start of this section.
           assert(
-              OS.tell() - fileStart <= Sec.offset &&
+              (OS.tell() - fileStart <= Sec.offset ||
+               Sec.offset == (uint32_t)0) &&
               "Wrote too much data somewhere, section offsets don't line up.");
           currOffset = OS.tell() - fileStart;
           if (currOffset < Sec.offset) {
@@ -378,6 +384,22 @@
   return Error::success();
 }
 
+Error MachOWriter::writeDWARFData(raw_ostream &OS,
+                                  std::vector<MachOYAML::Section> &Sections) {
+  for (auto Section : Sections) {
+    ZeroToOffset(OS, Section.offset);
+    if (0 == strncmp(&Section.sectname[0], "__debug_str", 16)) {
+      yaml2debug_str(OS, Obj.DWARF);
+    } else if (0 == strncmp(&Section.sectname[0], "__debug_abbrev", 16)) {
+      yaml2debug_abbrev(OS, Obj.DWARF);
+    }
+    else if (0 == strncmp(&Section.sectname[0], "__debug_aranges", 16)) {
+      yaml2debug_aranges(OS, Obj.DWARF);
+    }
+  }
+  return Error::success();
+}
+
 Error MachOWriter::writeRebaseOpcodes(raw_ostream &OS) {
   MachOYAML::LinkEditData &LinkEdit = Obj.LinkEdit;
 
diff --git a/tools/yaml2obj/yaml2obj.h b/tools/yaml2obj/yaml2obj.h
index b5025e8..cd481c0 100644
--- a/tools/yaml2obj/yaml2obj.h
+++ b/tools/yaml2obj/yaml2obj.h
@@ -23,6 +23,10 @@
 struct Object;
 }
 
+namespace DWARFYAML {
+struct Data;
+}
+
 namespace yaml {
 class Input;
 struct YamlObjectFile;
@@ -33,4 +37,11 @@
 int yaml2elf(llvm::ELFYAML::Object &Doc, llvm::raw_ostream &Out);
 int yaml2macho(llvm::yaml::YamlObjectFile &Doc, llvm::raw_ostream &Out);
 
+void yaml2debug_abbrev(llvm::raw_ostream &OS,
+                       const llvm::DWARFYAML::Data &DI);
+void yaml2debug_str(llvm::raw_ostream &OS,
+                    const llvm::DWARFYAML::Data &DI);
+
+void yaml2debug_aranges(llvm::raw_ostream &OS, const llvm::DWARFYAML::Data &DI);
+
 #endif
diff --git a/unittests/DebugInfo/DWARF/CMakeLists.txt b/unittests/DebugInfo/DWARF/CMakeLists.txt
index 4bec17c..eafca4a 100644
--- a/unittests/DebugInfo/DWARF/CMakeLists.txt
+++ b/unittests/DebugInfo/DWARF/CMakeLists.txt
@@ -1,8 +1,15 @@
 set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  AsmPrinter
   DebugInfoDWARF
+  MC
+  Object
+  Support
   )
 
 set(DebugInfoSources
+  DwarfGenerator.cpp
+  DWARFDebugInfoTest.cpp
   DWARFFormValueTest.cpp
   )
 
diff --git a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
new file mode 100644
index 0000000..f2a1a14
--- /dev/null
+++ b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
@@ -0,0 +1,791 @@
+//===- llvm/unittest/DebugInfo/DWARFDebugInfoTest.cpp ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfGenerator.h"
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+#include <climits>
+
+using namespace llvm;
+using namespace dwarf;
+
+namespace {
+
+void initLLVMIfNeeded() {
+  static bool gInitialized = false;
+  if (!gInitialized) {
+    gInitialized = true;
+    InitializeAllTargets();
+    InitializeAllTargetMCs();
+    InitializeAllAsmPrinters();
+    InitializeAllAsmParsers();
+  }
+}
+
+Triple getHostTripleForAddrSize(uint8_t AddrSize) {
+  Triple PT(Triple::normalize(LLVM_HOST_TRIPLE));
+
+  if (AddrSize == 8 && PT.isArch32Bit())
+    return PT.get64BitArchVariant();
+  if (AddrSize == 4 && PT.isArch64Bit())
+    return PT.get32BitArchVariant();
+  return PT;
+}
+
+/// Take any llvm::Expected and check for and handle any errors.
+///
+/// \param Expected a llvm::Expected instance to check.
+/// \returns true if there were errors, false otherwise.
+template <typename T>
+static bool HandleExpectedError(T &Expected) {
+  std::string ErrorMsg;
+  handleAllErrors(Expected.takeError(), [&](const llvm::ErrorInfoBase &EI) {
+    ErrorMsg = EI.message();
+  });
+  if (!ErrorMsg.empty()) {
+    ::testing::AssertionFailure() << "error: " << ErrorMsg;
+    return true;
+  }
+  return false;
+}
+
+template <uint16_t Version, class AddrType, class RefAddrType>
+void TestAllForms() {
+  // Test that we can decode all DW_FORM values correctly.
+
+  const uint8_t AddrSize = sizeof(AddrType);
+  const AddrType AddrValue = (AddrType)0x0123456789abcdefULL;
+  const uint8_t BlockData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
+  const uint32_t BlockSize = sizeof(BlockData);
+  const RefAddrType RefAddr = 0x12345678;
+  const uint8_t Data1 = 0x01U;
+  const uint16_t Data2 = 0x2345U;
+  const uint32_t Data4 = 0x6789abcdU;
+  const uint64_t Data8 = 0x0011223344556677ULL;
+  const uint64_t Data8_2 = 0xAABBCCDDEEFF0011ULL;
+  const int64_t SData = INT64_MIN;
+  const uint64_t UData[] = {UINT64_MAX - 1, UINT64_MAX - 2, UINT64_MAX - 3,
+                            UINT64_MAX - 4, UINT64_MAX - 5, UINT64_MAX - 6,
+                            UINT64_MAX - 7, UINT64_MAX - 8, UINT64_MAX - 9};
+#define UDATA_1 18446744073709551614ULL
+  const uint32_t Dwarf32Values[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  const char *StringValue = "Hello";
+  const char *StrpValue = "World";
+  initLLVMIfNeeded();
+  Triple Triple = getHostTripleForAddrSize(AddrSize);
+  auto ExpectedDG = dwarfgen::Generator::create(Triple, Version);
+  if (HandleExpectedError(ExpectedDG))
+    return;
+  dwarfgen::Generator *DG = ExpectedDG.get().get();
+  dwarfgen::CompileUnit &CU = DG->addCompileUnit();
+  dwarfgen::DIE CUDie = CU.getUnitDIE();
+  uint16_t Attr = DW_AT_lo_user;
+
+  //----------------------------------------------------------------------
+  // Test address forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_addr = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_addr, DW_FORM_addr, AddrValue);
+
+  //----------------------------------------------------------------------
+  // Test block forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_block = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_block, DW_FORM_block, BlockData, BlockSize);
+
+  const auto Attr_DW_FORM_block1 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_block1, DW_FORM_block1, BlockData, BlockSize);
+
+  const auto Attr_DW_FORM_block2 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_block2, DW_FORM_block2, BlockData, BlockSize);
+
+  const auto Attr_DW_FORM_block4 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_block4, DW_FORM_block4, BlockData, BlockSize);
+
+  //----------------------------------------------------------------------
+  // Test data forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_data1 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_data1, DW_FORM_data1, Data1);
+
+  const auto Attr_DW_FORM_data2 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_data2, DW_FORM_data2, Data2);
+
+  const auto Attr_DW_FORM_data4 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_data4, DW_FORM_data4, Data4);
+
+  const auto Attr_DW_FORM_data8 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_data8, DW_FORM_data8, Data8);
+
+  //----------------------------------------------------------------------
+  // Test string forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_string = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_string, DW_FORM_string, StringValue);
+
+  const auto Attr_DW_FORM_strp = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_strp, DW_FORM_strp, StrpValue);
+
+  //----------------------------------------------------------------------
+  // Test reference forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_ref_addr = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_ref_addr, DW_FORM_ref_addr, RefAddr);
+
+  const auto Attr_DW_FORM_ref1 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_ref1, DW_FORM_ref1, Data1);
+
+  const auto Attr_DW_FORM_ref2 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_ref2, DW_FORM_ref2, Data2);
+
+  const auto Attr_DW_FORM_ref4 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_ref4, DW_FORM_ref4, Data4);
+
+  const auto Attr_DW_FORM_ref8 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_ref8, DW_FORM_ref8, Data8);
+
+  const auto Attr_DW_FORM_ref_sig8 = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_ref_sig8, DW_FORM_ref_sig8, Data8_2);
+
+  const auto Attr_DW_FORM_ref_udata = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_ref_udata, DW_FORM_ref_udata, UData[0]);
+
+  //----------------------------------------------------------------------
+  // Test flag forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_flag_true = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_flag_true, DW_FORM_flag, true);
+
+  const auto Attr_DW_FORM_flag_false = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_flag_false, DW_FORM_flag, false);
+
+  const auto Attr_DW_FORM_flag_present = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_flag_present, DW_FORM_flag_present);
+
+  //----------------------------------------------------------------------
+  // Test SLEB128 based forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_sdata = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_sdata, DW_FORM_sdata, SData);
+
+  //----------------------------------------------------------------------
+  // Test ULEB128 based forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_udata = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_udata, DW_FORM_udata, UData[0]);
+
+  //----------------------------------------------------------------------
+  // Test DWARF32/DWARF64 forms
+  //----------------------------------------------------------------------
+  const auto Attr_DW_FORM_GNU_ref_alt = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_GNU_ref_alt, DW_FORM_GNU_ref_alt,
+                     Dwarf32Values[0]);
+
+  const auto Attr_DW_FORM_sec_offset = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_DW_FORM_sec_offset, DW_FORM_sec_offset,
+                     Dwarf32Values[1]);
+
+  //----------------------------------------------------------------------
+  // Add an address at the end to make sure we can decode this value
+  //----------------------------------------------------------------------
+  const auto Attr_Last = static_cast<dwarf::Attribute>(Attr++);
+  CUDie.addAttribute(Attr_Last, DW_FORM_addr, AddrValue);
+
+  //----------------------------------------------------------------------
+  // Generate the DWARF
+  //----------------------------------------------------------------------
+  StringRef FileBytes = DG->generate();
+  MemoryBufferRef FileBuffer(FileBytes, "dwarf");
+  auto Obj = object::ObjectFile::createObjectFile(FileBuffer);
+  EXPECT_TRUE((bool)Obj);
+  DWARFContextInMemory DwarfContext(*Obj.get());
+  uint32_t NumCUs = DwarfContext.getNumCompileUnits();
+  EXPECT_EQ(NumCUs, 1u);
+  DWARFCompileUnit *U = DwarfContext.getCompileUnitAtIndex(0);
+  auto DiePtr = U->getUnitDIE(false);
+  EXPECT_TRUE(DiePtr != nullptr);
+
+  //----------------------------------------------------------------------
+  // Test address forms
+  //----------------------------------------------------------------------
+  EXPECT_EQ(DiePtr->getAttributeValueAsAddress(U, Attr_DW_FORM_addr, 0),
+            AddrValue);
+
+  //----------------------------------------------------------------------
+  // Test block forms
+  //----------------------------------------------------------------------
+  DWARFFormValue FormValue;
+  ArrayRef<uint8_t> ExtractedBlockData;
+  Optional<ArrayRef<uint8_t>> BlockDataOpt;
+
+  EXPECT_TRUE(DiePtr->getAttributeValue(U, Attr_DW_FORM_block, FormValue));
+  BlockDataOpt = FormValue.getAsBlock();
+  EXPECT_TRUE(BlockDataOpt.hasValue());
+  ExtractedBlockData = BlockDataOpt.getValue();
+  EXPECT_EQ(ExtractedBlockData.size(), BlockSize);
+  EXPECT_TRUE(memcmp(ExtractedBlockData.data(), BlockData, BlockSize) == 0);
+
+  EXPECT_TRUE(DiePtr->getAttributeValue(U, Attr_DW_FORM_block1, FormValue));
+  BlockDataOpt = FormValue.getAsBlock();
+  EXPECT_TRUE(BlockDataOpt.hasValue());
+  ExtractedBlockData = BlockDataOpt.getValue();
+  EXPECT_EQ(ExtractedBlockData.size(), BlockSize);
+  EXPECT_TRUE(memcmp(ExtractedBlockData.data(), BlockData, BlockSize) == 0);
+
+  EXPECT_TRUE(DiePtr->getAttributeValue(U, Attr_DW_FORM_block2, FormValue));
+  BlockDataOpt = FormValue.getAsBlock();
+  EXPECT_TRUE(BlockDataOpt.hasValue());
+  ExtractedBlockData = BlockDataOpt.getValue();
+  EXPECT_EQ(ExtractedBlockData.size(), BlockSize);
+  EXPECT_TRUE(memcmp(ExtractedBlockData.data(), BlockData, BlockSize) == 0);
+
+  EXPECT_TRUE(DiePtr->getAttributeValue(U, Attr_DW_FORM_block4, FormValue));
+  BlockDataOpt = FormValue.getAsBlock();
+  EXPECT_TRUE(BlockDataOpt.hasValue());
+  ExtractedBlockData = BlockDataOpt.getValue();
+  EXPECT_EQ(ExtractedBlockData.size(), BlockSize);
+  EXPECT_TRUE(memcmp(ExtractedBlockData.data(), BlockData, BlockSize) == 0);
+
+  //----------------------------------------------------------------------
+  // Test data forms
+  //----------------------------------------------------------------------
+  EXPECT_EQ(
+      DiePtr->getAttributeValueAsUnsignedConstant(U, Attr_DW_FORM_data1, 0),
+      Data1);
+  EXPECT_EQ(
+      DiePtr->getAttributeValueAsUnsignedConstant(U, Attr_DW_FORM_data2, 0),
+      Data2);
+  EXPECT_EQ(
+      DiePtr->getAttributeValueAsUnsignedConstant(U, Attr_DW_FORM_data4, 0),
+      Data4);
+  EXPECT_EQ(
+      DiePtr->getAttributeValueAsUnsignedConstant(U, Attr_DW_FORM_data8, 0),
+      Data8);
+
+  //----------------------------------------------------------------------
+  // Test string forms
+  //----------------------------------------------------------------------
+  const char *ExtractedStringValue =
+      DiePtr->getAttributeValueAsString(U, Attr_DW_FORM_string, nullptr);
+  EXPECT_TRUE(ExtractedStringValue != nullptr);
+  EXPECT_TRUE(strcmp(StringValue, ExtractedStringValue) == 0);
+
+  const char *ExtractedStrpValue =
+      DiePtr->getAttributeValueAsString(U, Attr_DW_FORM_strp, nullptr);
+  EXPECT_TRUE(ExtractedStrpValue != nullptr);
+  EXPECT_TRUE(strcmp(StrpValue, ExtractedStrpValue) == 0);
+
+  //----------------------------------------------------------------------
+  // Test reference forms
+  //----------------------------------------------------------------------
+  EXPECT_EQ(DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_ref_addr, 0),
+            RefAddr);
+  EXPECT_EQ(DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_ref1, 0),
+            Data1);
+  EXPECT_EQ(DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_ref2, 0),
+            Data2);
+  EXPECT_EQ(DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_ref4, 0),
+            Data4);
+  EXPECT_EQ(DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_ref8, 0),
+            Data8);
+  EXPECT_EQ(DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_ref_sig8, 0),
+            Data8_2);
+  EXPECT_EQ(DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_ref_udata, 0),
+            UData[0]);
+
+  //----------------------------------------------------------------------
+  // Test flag forms
+  //----------------------------------------------------------------------
+  EXPECT_EQ(DiePtr->getAttributeValueAsUnsignedConstant(
+                U, Attr_DW_FORM_flag_true, 0ULL),
+            1ULL);
+  EXPECT_EQ(DiePtr->getAttributeValueAsUnsignedConstant(
+                U, Attr_DW_FORM_flag_false, 1ULL),
+            0ULL);
+  EXPECT_EQ(DiePtr->getAttributeValueAsUnsignedConstant(
+                U, Attr_DW_FORM_flag_present, 0ULL),
+            1ULL);
+
+  // TODO: test Attr_DW_FORM_implicit_const extraction
+
+  //----------------------------------------------------------------------
+  // Test SLEB128 based forms
+  //----------------------------------------------------------------------
+  EXPECT_EQ(DiePtr->getAttributeValueAsSignedConstant(U, Attr_DW_FORM_sdata, 0),
+            SData);
+
+  //----------------------------------------------------------------------
+  // Test ULEB128 based forms
+  //----------------------------------------------------------------------
+  EXPECT_EQ(
+      DiePtr->getAttributeValueAsUnsignedConstant(U, Attr_DW_FORM_udata, 0),
+      UData[0]);
+
+  //----------------------------------------------------------------------
+  // Test DWARF32/DWARF64 forms
+  //----------------------------------------------------------------------
+  EXPECT_EQ(
+      DiePtr->getAttributeValueAsReference(U, Attr_DW_FORM_GNU_ref_alt, 0),
+      Dwarf32Values[0]);
+  EXPECT_EQ(
+      DiePtr->getAttributeValueAsSectionOffset(U, Attr_DW_FORM_sec_offset, 0),
+      Dwarf32Values[1]);
+
+  //----------------------------------------------------------------------
+  // Add an address at the end to make sure we can decode this value
+  //----------------------------------------------------------------------
+  EXPECT_EQ(DiePtr->getAttributeValueAsAddress(U, Attr_Last, 0), AddrValue);
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version2Addr4AllForms) {
+  // Test that we can decode all forms for DWARF32, version 2, with 4 byte
+  // addresses.
+  typedef uint32_t AddrType;
+  // DW_FORM_ref_addr values are address-sized in DWARF32 version 2.
+  typedef AddrType RefAddrType;
+  TestAllForms<2, AddrType, RefAddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version2Addr8AllForms) {
+  // Test that we can decode all forms for DWARF32, version 2, with 8 byte
+  // addresses.
+  typedef uint64_t AddrType;
+  // DW_FORM_ref_addr values are address-sized in DWARF32 version 2.
+  typedef AddrType RefAddrType;
+  TestAllForms<2, AddrType, RefAddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version3Addr4AllForms) {
+  // Test that we can decode all forms for DWARF32, version 3, with 4 byte
+  // addresses.
+  typedef uint32_t AddrType;
+  // DW_FORM_ref_addr values are 4 bytes in DWARF32 for version 3 and later.
+  typedef uint32_t RefAddrType;
+  TestAllForms<3, AddrType, RefAddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version3Addr8AllForms) {
+  // Test that we can decode all forms for DWARF32, version 3, with 8 byte
+  // addresses.
+  typedef uint64_t AddrType;
+  // DW_FORM_ref_addr values are 4 bytes in DWARF32 for version 3 and later.
+  typedef uint32_t RefAddrType;
+  TestAllForms<3, AddrType, RefAddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version4Addr4AllForms) {
+  // Test that we can decode all forms for DWARF32, version 4, with 4 byte
+  // addresses.
+  typedef uint32_t AddrType;
+  // DW_FORM_ref_addr values are 4 bytes in DWARF32 for version 3 and later.
+  typedef uint32_t RefAddrType;
+  TestAllForms<4, AddrType, RefAddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version4Addr8AllForms) {
+  // Test that we can decode all forms for DWARF32, version 4, with 8 byte
+  // addresses.
+  typedef uint64_t AddrType;
+  // DW_FORM_ref_addr values are 4 bytes in DWARF32 for version 3 and later.
+  typedef uint32_t RefAddrType;
+  TestAllForms<4, AddrType, RefAddrType>();
+}
+
+template <uint16_t Version, class AddrType> void TestChildren() {
+  // Test that we can decode DIE children and siblings correctly for the given
+  // DWARF version and address size, including a DW_FORM_ref_addr reference
+  // from a child DIE to another DIE in the same compile unit.
+
+  const uint8_t AddrSize = sizeof(AddrType);
+  initLLVMIfNeeded();
+  Triple Triple = getHostTripleForAddrSize(AddrSize);
+  auto ExpectedDG = dwarfgen::Generator::create(Triple, Version);
+  if (HandleExpectedError(ExpectedDG))
+    return;
+  dwarfgen::Generator *DG = ExpectedDG.get().get();
+  dwarfgen::CompileUnit &CU = DG->addCompileUnit();
+  dwarfgen::DIE CUDie = CU.getUnitDIE();
+
+  CUDie.addAttribute(DW_AT_name, DW_FORM_strp, "/tmp/main.c");
+  CUDie.addAttribute(DW_AT_language, DW_FORM_data2, DW_LANG_C);
+
+  dwarfgen::DIE SubprogramDie = CUDie.addChild(DW_TAG_subprogram);
+  SubprogramDie.addAttribute(DW_AT_name, DW_FORM_strp, "main");
+  SubprogramDie.addAttribute(DW_AT_low_pc, DW_FORM_addr, 0x1000U);
+  SubprogramDie.addAttribute(DW_AT_high_pc, DW_FORM_addr, 0x2000U);
+
+  dwarfgen::DIE IntDie = CUDie.addChild(DW_TAG_base_type);
+  IntDie.addAttribute(DW_AT_name, DW_FORM_strp, "int");
+  IntDie.addAttribute(DW_AT_encoding, DW_FORM_data1, DW_ATE_signed);
+  IntDie.addAttribute(DW_AT_byte_size, DW_FORM_data1, 4);
+
+  dwarfgen::DIE ArgcDie = SubprogramDie.addChild(DW_TAG_formal_parameter);
+  ArgcDie.addAttribute(DW_AT_name, DW_FORM_strp, "argc");
+  // ArgcDie.addAttribute(DW_AT_type, DW_FORM_ref4, IntDie);
+  ArgcDie.addAttribute(DW_AT_type, DW_FORM_ref_addr, IntDie);
+
+  StringRef FileBytes = DG->generate();
+  MemoryBufferRef FileBuffer(FileBytes, "dwarf");
+  auto Obj = object::ObjectFile::createObjectFile(FileBuffer);
+  EXPECT_TRUE((bool)Obj);
+  DWARFContextInMemory DwarfContext(*Obj.get());
+
+  // Verify the number of compile units is correct.
+  uint32_t NumCUs = DwarfContext.getNumCompileUnits();
+  EXPECT_EQ(NumCUs, 1u);
+  DWARFCompileUnit *U = DwarfContext.getCompileUnitAtIndex(0);
+
+  // Verify the compile unit DIE is valid.
+  auto DiePtr = U->getUnitDIE(false);
+  EXPECT_TRUE(DiePtr != nullptr);
+  // DiePtr->dump(llvm::outs(), U, UINT32_MAX);
+
+  // Verify the first child of the compile unit DIE is our subprogram.
+  auto SubprogramDiePtr = DiePtr->getFirstChild();
+  EXPECT_TRUE(SubprogramDiePtr != nullptr);
+  EXPECT_EQ(SubprogramDiePtr->getTag(), DW_TAG_subprogram);
+
+  // Verify the first child of the subprogram is our formal parameter.
+  auto ArgcDiePtr = SubprogramDiePtr->getFirstChild();
+  EXPECT_TRUE(ArgcDiePtr != nullptr);
+  EXPECT_EQ(ArgcDiePtr->getTag(), DW_TAG_formal_parameter);
+
+  // Verify our formal parameter has a NULL tag sibling.
+  auto NullDiePtr = ArgcDiePtr->getSibling();
+  EXPECT_TRUE(NullDiePtr != nullptr);
+  if (NullDiePtr) {
+    EXPECT_EQ(NullDiePtr->getTag(), DW_TAG_null);
+    EXPECT_TRUE(NullDiePtr->getSibling() == nullptr);
+    EXPECT_TRUE(NullDiePtr->getFirstChild() == nullptr);
+  }
+
+  // Verify the sibling of our subprogram is our integer base type.
+  auto IntDiePtr = SubprogramDiePtr->getSibling();
+  EXPECT_TRUE(IntDiePtr != nullptr);
+  EXPECT_EQ(IntDiePtr->getTag(), DW_TAG_base_type);
+
+  // Verify the sibling of our integer base type is a NULL tag.
+  NullDiePtr = IntDiePtr->getSibling();
+  EXPECT_TRUE(NullDiePtr != nullptr);
+  if (NullDiePtr) {
+    EXPECT_EQ(NullDiePtr->getTag(), DW_TAG_null);
+    EXPECT_TRUE(NullDiePtr->getSibling() == nullptr);
+    EXPECT_TRUE(NullDiePtr->getFirstChild() == nullptr);
+  }
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version2Addr4Children) {
+  // Test that we can decode DIE children for DWARF32, version 2, with 4 byte
+  // addresses.
+  typedef uint32_t AddrType;
+  TestChildren<2, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version2Addr8Children) {
+  // Test that we can decode DIE children for DWARF32, version 2, with 8 byte
+  // addresses.
+  typedef uint64_t AddrType;
+  TestChildren<2, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version3Addr4Children) {
+  // Test that we can decode DIE children for DWARF32, version 3, with 4 byte
+  // addresses.
+  typedef uint32_t AddrType;
+  TestChildren<3, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version3Addr8Children) {
+  // Test that we can decode DIE children for DWARF32, version 3, with 8 byte
+  // addresses.
+  typedef uint64_t AddrType;
+  TestChildren<3, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version4Addr4Children) {
+  // Test that we can decode DIE children for DWARF32, version 4, with 4 byte
+  // addresses.
+  typedef uint32_t AddrType;
+  TestChildren<4, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version4Addr8Children) {
+  // Test that we can decode DIE children for DWARF32, version 4, with 8 byte
+  // addresses.
+  typedef uint64_t AddrType;
+  TestChildren<4, AddrType>();
+}
+
+template <uint16_t Version, class AddrType> void TestReferences() {
+  // Test that we can decode DW_FORM_refXXX values correctly in DWARF.
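+  // DW_FORM_ref1/2/4/8 encode compile-unit-relative offsets, while
+  // DW_FORM_ref_addr encodes a .debug_info section offset and can therefore
+  // reference DIEs in other compile units.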
+
+  const uint8_t AddrSize = sizeof(AddrType);
+  initLLVMIfNeeded();
+  Triple Triple = getHostTripleForAddrSize(AddrSize);
+  auto ExpectedDG = dwarfgen::Generator::create(Triple, Version);
+  if (HandleExpectedError(ExpectedDG))
+    return;
+  dwarfgen::Generator *DG = ExpectedDG.get().get();
+  dwarfgen::CompileUnit &CU1 = DG->addCompileUnit();
+  dwarfgen::CompileUnit &CU2 = DG->addCompileUnit();
+
+  dwarfgen::DIE CU1Die = CU1.getUnitDIE();
+  CU1Die.addAttribute(DW_AT_name, DW_FORM_strp, "/tmp/main.c");
+  CU1Die.addAttribute(DW_AT_language, DW_FORM_data2, DW_LANG_C);
+
+  dwarfgen::DIE CU1TypeDie = CU1Die.addChild(DW_TAG_base_type);
+  CU1TypeDie.addAttribute(DW_AT_name, DW_FORM_strp, "int");
+  CU1TypeDie.addAttribute(DW_AT_encoding, DW_FORM_data1, DW_ATE_signed);
+  CU1TypeDie.addAttribute(DW_AT_byte_size, DW_FORM_data1, 4);
+
+  dwarfgen::DIE CU1Ref1Die = CU1Die.addChild(DW_TAG_variable);
+  CU1Ref1Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU1Ref1");
+  CU1Ref1Die.addAttribute(DW_AT_type, DW_FORM_ref1, CU1TypeDie);
+
+  dwarfgen::DIE CU1Ref2Die = CU1Die.addChild(DW_TAG_variable);
+  CU1Ref2Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU1Ref2");
+  CU1Ref2Die.addAttribute(DW_AT_type, DW_FORM_ref2, CU1TypeDie);
+
+  dwarfgen::DIE CU1Ref4Die = CU1Die.addChild(DW_TAG_variable);
+  CU1Ref4Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU1Ref4");
+  CU1Ref4Die.addAttribute(DW_AT_type, DW_FORM_ref4, CU1TypeDie);
+
+  dwarfgen::DIE CU1Ref8Die = CU1Die.addChild(DW_TAG_variable);
+  CU1Ref8Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU1Ref8");
+  CU1Ref8Die.addAttribute(DW_AT_type, DW_FORM_ref8, CU1TypeDie);
+
+  dwarfgen::DIE CU1RefAddrDie = CU1Die.addChild(DW_TAG_variable);
+  CU1RefAddrDie.addAttribute(DW_AT_name, DW_FORM_strp, "CU1RefAddr");
+  CU1RefAddrDie.addAttribute(DW_AT_type, DW_FORM_ref_addr, CU1TypeDie);
+
+  dwarfgen::DIE CU2Die = CU2.getUnitDIE();
+  CU2Die.addAttribute(DW_AT_name, DW_FORM_strp, "/tmp/foo.c");
+  CU2Die.addAttribute(DW_AT_language, DW_FORM_data2, DW_LANG_C);
+
+  dwarfgen::DIE CU2TypeDie = CU2Die.addChild(DW_TAG_base_type);
+  CU2TypeDie.addAttribute(DW_AT_name, DW_FORM_strp, "float");
+  CU2TypeDie.addAttribute(DW_AT_encoding, DW_FORM_data1, DW_ATE_float);
+  CU2TypeDie.addAttribute(DW_AT_byte_size, DW_FORM_data1, 4);
+
+  dwarfgen::DIE CU2Ref1Die = CU2Die.addChild(DW_TAG_variable);
+  CU2Ref1Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU2Ref1");
+  CU2Ref1Die.addAttribute(DW_AT_type, DW_FORM_ref1, CU2TypeDie);
+
+  dwarfgen::DIE CU2Ref2Die = CU2Die.addChild(DW_TAG_variable);
+  CU2Ref2Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU2Ref2");
+  CU2Ref2Die.addAttribute(DW_AT_type, DW_FORM_ref2, CU2TypeDie);
+
+  dwarfgen::DIE CU2Ref4Die = CU2Die.addChild(DW_TAG_variable);
+  CU2Ref4Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU2Ref4");
+  CU2Ref4Die.addAttribute(DW_AT_type, DW_FORM_ref4, CU2TypeDie);
+
+  dwarfgen::DIE CU2Ref8Die = CU2Die.addChild(DW_TAG_variable);
+  CU2Ref8Die.addAttribute(DW_AT_name, DW_FORM_strp, "CU2Ref8");
+  CU2Ref8Die.addAttribute(DW_AT_type, DW_FORM_ref8, CU2TypeDie);
+
+  dwarfgen::DIE CU2RefAddrDie = CU2Die.addChild(DW_TAG_variable);
+  CU2RefAddrDie.addAttribute(DW_AT_name, DW_FORM_strp, "CU2RefAddr");
+  CU2RefAddrDie.addAttribute(DW_AT_type, DW_FORM_ref_addr, CU2TypeDie);
+
+  // Refer to a type in CU1 from CU2
+  dwarfgen::DIE CU2ToCU1RefAddrDie = CU2Die.addChild(DW_TAG_variable);
+  CU2ToCU1RefAddrDie.addAttribute(DW_AT_name, DW_FORM_strp, "CU2ToCU1RefAddr");
+  CU2ToCU1RefAddrDie.addAttribute(DW_AT_type, DW_FORM_ref_addr, CU1TypeDie);
+
+  // Refer to a type in CU2 from CU1
+  dwarfgen::DIE CU1ToCU2RefAddrDie = CU1Die.addChild(DW_TAG_variable);
+  CU1ToCU2RefAddrDie.addAttribute(DW_AT_name, DW_FORM_strp, "CU1ToCU2RefAddr");
+  CU1ToCU2RefAddrDie.addAttribute(DW_AT_type, DW_FORM_ref_addr, CU2TypeDie);
+
+  StringRef FileBytes = DG->generate();
+  MemoryBufferRef FileBuffer(FileBytes, "dwarf");
+  auto Obj = object::ObjectFile::createObjectFile(FileBuffer);
+  EXPECT_TRUE((bool)Obj);
+  DWARFContextInMemory DwarfContext(*Obj.get());
+
+  // Verify the number of compile units is correct.
+  uint32_t NumCUs = DwarfContext.getNumCompileUnits();
+  EXPECT_EQ(NumCUs, 2u);
+  DWARFCompileUnit *U1 = DwarfContext.getCompileUnitAtIndex(0);
+  DWARFCompileUnit *U2 = DwarfContext.getCompileUnitAtIndex(1);
+
+  // Verify that each compile unit DIE is valid.
+  auto Unit1DiePtr = U1->getUnitDIE(false);
+  EXPECT_TRUE(Unit1DiePtr != nullptr);
+  // Unit1DiePtr->dump(llvm::outs(), U1, UINT32_MAX);
+
+  auto Unit2DiePtr = U2->getUnitDIE(false);
+  EXPECT_TRUE(Unit2DiePtr != nullptr);
+  // Unit2DiePtr->dump(llvm::outs(), U2, UINT32_MAX);
+
+  // Verify the first child of the compile unit 1 DIE is our int base type.
+  auto CU1TypeDiePtr = Unit1DiePtr->getFirstChild();
+  EXPECT_TRUE(CU1TypeDiePtr != nullptr);
+  EXPECT_EQ(CU1TypeDiePtr->getTag(), DW_TAG_base_type);
+  EXPECT_EQ(
+      CU1TypeDiePtr->getAttributeValueAsUnsignedConstant(U1, DW_AT_encoding, 0),
+      DW_ATE_signed);
+
+  // Verify the first child of the compile unit 2 DIE is our float base type.
+  auto CU2TypeDiePtr = Unit2DiePtr->getFirstChild();
+  EXPECT_TRUE(CU2TypeDiePtr != nullptr);
+  EXPECT_EQ(CU2TypeDiePtr->getTag(), DW_TAG_base_type);
+  EXPECT_EQ(
+      CU2TypeDiePtr->getAttributeValueAsUnsignedConstant(U2, DW_AT_encoding, 0),
+      DW_ATE_float);
+
+  // Verify the sibling of the base type DIE is our Ref1 DIE and that its
+  // DW_AT_type points to our base type DIE.
+  auto CU1Ref1DiePtr = CU1TypeDiePtr->getSibling();
+  EXPECT_TRUE(CU1Ref1DiePtr != nullptr);
+  EXPECT_EQ(CU1Ref1DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU1Ref1DiePtr->getAttributeValueAsReference(U1, DW_AT_type, -1ULL),
+            CU1TypeDiePtr->getOffset());
+  // Verify the sibling is our Ref2 DIE and that its DW_AT_type points to our
+  // base type DIE in CU1.
+  auto CU1Ref2DiePtr = CU1Ref1DiePtr->getSibling();
+  EXPECT_TRUE(CU1Ref2DiePtr != nullptr);
+  EXPECT_EQ(CU1Ref2DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU1Ref2DiePtr->getAttributeValueAsReference(U1, DW_AT_type, -1ULL),
+            CU1TypeDiePtr->getOffset());
+
+  // Verify the sibling is our Ref4 DIE and that its DW_AT_type points to our
+  // base type DIE in CU1.
+  auto CU1Ref4DiePtr = CU1Ref2DiePtr->getSibling();
+  EXPECT_TRUE(CU1Ref4DiePtr != nullptr);
+  EXPECT_EQ(CU1Ref4DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU1Ref4DiePtr->getAttributeValueAsReference(U1, DW_AT_type, -1ULL),
+            CU1TypeDiePtr->getOffset());
+
+  // Verify the sibling is our Ref8 DIE and that its DW_AT_type points to our
+  // base type DIE in CU1.
+  auto CU1Ref8DiePtr = CU1Ref4DiePtr->getSibling();
+  EXPECT_TRUE(CU1Ref8DiePtr != nullptr);
+  EXPECT_EQ(CU1Ref8DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU1Ref8DiePtr->getAttributeValueAsReference(U1, DW_AT_type, -1ULL),
+            CU1TypeDiePtr->getOffset());
+
+  // Verify the sibling is our RefAddr DIE and that its DW_AT_type points to our
+  // base type DIE in CU1.
+  auto CU1RefAddrDiePtr = CU1Ref8DiePtr->getSibling();
+  EXPECT_TRUE(CU1RefAddrDiePtr != nullptr);
+  EXPECT_EQ(CU1RefAddrDiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(
+      CU1RefAddrDiePtr->getAttributeValueAsReference(U1, DW_AT_type, -1ULL),
+      CU1TypeDiePtr->getOffset());
+
+  // Verify the sibling of the RefAddr DIE is our CU1ToCU2RefAddr DIE and that
+  // its DW_AT_type points to the base type DIE in CU2.
+  auto CU1ToCU2RefAddrDiePtr = CU1RefAddrDiePtr->getSibling();
+  EXPECT_TRUE(CU1ToCU2RefAddrDiePtr != nullptr);
+  EXPECT_EQ(CU1ToCU2RefAddrDiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU1ToCU2RefAddrDiePtr->getAttributeValueAsReference(U1, DW_AT_type,
+                                                                -1ULL),
+            CU2TypeDiePtr->getOffset());
+
+  // Verify the sibling of the base type DIE is our Ref1 DIE and that its
+  // DW_AT_type points to our base type DIE.
+  auto CU2Ref1DiePtr = CU2TypeDiePtr->getSibling();
+  EXPECT_TRUE(CU2Ref1DiePtr != nullptr);
+  EXPECT_EQ(CU2Ref1DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU2Ref1DiePtr->getAttributeValueAsReference(U2, DW_AT_type, -1ULL),
+            CU2TypeDiePtr->getOffset());
+  // Verify the sibling is our Ref2 DIE and that its DW_AT_type points to our
+  // base type DIE in CU2.
+  auto CU2Ref2DiePtr = CU2Ref1DiePtr->getSibling();
+  EXPECT_TRUE(CU2Ref2DiePtr != nullptr);
+  EXPECT_EQ(CU2Ref2DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU2Ref2DiePtr->getAttributeValueAsReference(U2, DW_AT_type, -1ULL),
+            CU2TypeDiePtr->getOffset());
+
+  // Verify the sibling is our Ref4 DIE and that its DW_AT_type points to our
+  // base type DIE in CU2.
+  auto CU2Ref4DiePtr = CU2Ref2DiePtr->getSibling();
+  EXPECT_TRUE(CU2Ref4DiePtr != nullptr);
+  EXPECT_EQ(CU2Ref4DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU2Ref4DiePtr->getAttributeValueAsReference(U2, DW_AT_type, -1ULL),
+            CU2TypeDiePtr->getOffset());
+
+  // Verify the sibling is our Ref8 DIE and that its DW_AT_type points to our
+  // base type DIE in CU2.
+  auto CU2Ref8DiePtr = CU2Ref4DiePtr->getSibling();
+  EXPECT_TRUE(CU2Ref8DiePtr != nullptr);
+  EXPECT_EQ(CU2Ref8DiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU2Ref8DiePtr->getAttributeValueAsReference(U2, DW_AT_type, -1ULL),
+            CU2TypeDiePtr->getOffset());
+
+  // Verify the sibling is our RefAddr DIE and that its DW_AT_type points to our
+  // base type DIE in CU2.
+  auto CU2RefAddrDiePtr = CU2Ref8DiePtr->getSibling();
+  EXPECT_TRUE(CU2RefAddrDiePtr != nullptr);
+  EXPECT_EQ(CU2RefAddrDiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(
+      CU2RefAddrDiePtr->getAttributeValueAsReference(U2, DW_AT_type, -1ULL),
+      CU2TypeDiePtr->getOffset());
+
+  // Verify the sibling of the RefAddr DIE is our CU2ToCU1RefAddr DIE and that
+  // its DW_AT_type points to the base type DIE in CU1.
+  auto CU2ToCU1RefAddrDiePtr = CU2RefAddrDiePtr->getSibling();
+  EXPECT_TRUE(CU2ToCU1RefAddrDiePtr != nullptr);
+  EXPECT_EQ(CU2ToCU1RefAddrDiePtr->getTag(), DW_TAG_variable);
+  EXPECT_EQ(CU2ToCU1RefAddrDiePtr->getAttributeValueAsReference(U2, DW_AT_type,
+                                                                -1ULL),
+            CU1TypeDiePtr->getOffset());
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version2Addr4References) {
+  // Test that we can decode DW_FORM_refXXX references for DWARF32, version 2,
+  // with 4 byte addresses.
+  typedef uint32_t AddrType;
+  TestReferences<2, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version2Addr8References) {
+  // Test that we can decode DW_FORM_refXXX references for DWARF32, version 2,
+  // with 8 byte addresses.
+  typedef uint64_t AddrType;
+  TestReferences<2, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version3Addr4References) {
+  // Test that we can decode DW_FORM_refXXX references for DWARF32, version 3,
+  // with 4 byte addresses.
+  typedef uint32_t AddrType;
+  TestReferences<3, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version3Addr8References) {
+  // Test that we can decode DW_FORM_refXXX references for DWARF32, version 3,
+  // with 8 byte addresses.
+  typedef uint64_t AddrType;
+  TestReferences<3, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version4Addr4References) {
+  // Test that we can decode DW_FORM_refXXX references for DWARF32, version 4,
+  // with 4 byte addresses.
+  typedef uint32_t AddrType;
+  TestReferences<4, AddrType>();
+}
+
+TEST(DWARFDebugInfo, TestDWARF32Version4Addr8References) {
+  // Test that we can decode DW_FORM_refXXX references for DWARF32, version 4,
+  // with 8 byte addresses.
+  typedef uint64_t AddrType;
+  TestReferences<4, AddrType>();
+}
+
+} // end anonymous namespace
diff --git a/unittests/DebugInfo/DWARF/DwarfGenerator.cpp b/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
new file mode 100644
index 0000000..0503987
--- /dev/null
+++ b/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
@@ -0,0 +1,264 @@
+//===--- unittests/DebugInfo/DWARF/DwarfGenerator.cpp -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../lib/CodeGen/AsmPrinter/DwarfStringPool.h"
+#include "DwarfGenerator.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/IR/LegacyPassManagers.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+namespace {} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+/// dwarfgen::DIE implementation.
+//===----------------------------------------------------------------------===//
+unsigned dwarfgen::DIE::computeSizeAndOffsets(unsigned Offset) {
+  auto &DG = CU->getGenerator();
+  return Die->computeOffsetsAndAbbrevs(DG.getAsmPrinter(), DG.getAbbrevSet(),
+                                       Offset);
+}
+
+void dwarfgen::DIE::addAttribute(uint16_t A, dwarf::Form Form, uint64_t U) {
+  auto &DG = CU->getGenerator();
+  Die->addValue(DG.getAllocator(), static_cast<dwarf::Attribute>(A), Form,
+                DIEInteger(U));
+}
+
+void dwarfgen::DIE::addAttribute(uint16_t A, dwarf::Form Form,
+                                 StringRef String) {
+  auto &DG = CU->getGenerator();
+  if (Form == DW_FORM_string) {
+    Die->addValue(DG.getAllocator(), static_cast<dwarf::Attribute>(A), Form,
+                  new (DG.getAllocator()) DIEInlineString(String));
+  } else {
+    Die->addValue(
+        DG.getAllocator(), static_cast<dwarf::Attribute>(A), Form,
+        DIEString(DG.getStringPool().getEntry(*DG.getAsmPrinter(), String)));
+  }
+}
+
+void dwarfgen::DIE::addAttribute(uint16_t A, dwarf::Form Form,
+                                 dwarfgen::DIE &RefDie) {
+  auto &DG = CU->getGenerator();
+  Die->addValue(DG.getAllocator(), static_cast<dwarf::Attribute>(A), Form,
+                DIEEntry(*RefDie.Die));
+}
+
+void dwarfgen::DIE::addAttribute(uint16_t A, dwarf::Form Form, const void *P,
+                                 size_t S) {
+  auto &DG = CU->getGenerator();
+  DIEBlock *Block = new (DG.getAllocator()) DIEBlock;
+  for (size_t I = 0; I < S; ++I)
+    Block->addValue(DG.getAllocator(), (dwarf::Attribute)0,
+                    dwarf::DW_FORM_data1, DIEInteger(((uint8_t *)P)[I]));
+
+  Block->ComputeSize(DG.getAsmPrinter());
+  Die->addValue(DG.getAllocator(), static_cast<dwarf::Attribute>(A), Form,
+                Block);
+}
+
+void dwarfgen::DIE::addAttribute(uint16_t A, dwarf::Form Form) {
+  auto &DG = CU->getGenerator();
+  assert(Form == DW_FORM_flag_present);
+  Die->addValue(DG.getAllocator(), static_cast<dwarf::Attribute>(A), Form,
+                DIEInteger(1));
+}
+
+dwarfgen::DIE dwarfgen::DIE::addChild(dwarf::Tag Tag) {
+  auto &DG = CU->getGenerator();
+  return dwarfgen::DIE(CU,
+                       &Die->addChild(llvm::DIE::get(DG.getAllocator(), Tag)));
+}
+
+dwarfgen::DIE dwarfgen::CompileUnit::getUnitDIE() {
+  return dwarfgen::DIE(this, &DU.getUnitDie());
+}
+
+//===----------------------------------------------------------------------===//
+/// dwarfgen::Generator implementation.
+//===----------------------------------------------------------------------===//
+
+dwarfgen::Generator::Generator()
+    : MAB(nullptr), MCE(nullptr), MS(nullptr), StringPool(nullptr),
+      Abbreviations(Allocator) {}
+dwarfgen::Generator::~Generator() = default;
+
+llvm::Expected<std::unique_ptr<dwarfgen::Generator>>
+dwarfgen::Generator::create(Triple TheTriple, uint16_t DwarfVersion) {
+  std::unique_ptr<dwarfgen::Generator> GenUP(new dwarfgen::Generator());
+  llvm::Error error = GenUP->init(TheTriple, DwarfVersion);
+  if (error)
+    return Expected<std::unique_ptr<dwarfgen::Generator>>(std::move(error));
+  return Expected<std::unique_ptr<dwarfgen::Generator>>(std::move(GenUP));
+}
+
+llvm::Error dwarfgen::Generator::init(Triple TheTriple, uint16_t V) {
+  Version = V;
+  std::string ErrorStr;
+  std::string TripleName;
+
+  // Get the target.
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(TripleName, TheTriple, ErrorStr);
+  if (!TheTarget)
+    return make_error<StringError>(ErrorStr, inconvertibleErrorCode());
+
+  TripleName = TheTriple.getTriple();
+
+  // Create all the MC Objects.
+  MRI.reset(TheTarget->createMCRegInfo(TripleName));
+  if (!MRI)
+    return make_error<StringError>(Twine("no register info for target ") +
+                                       TripleName,
+                                   inconvertibleErrorCode());
+
+  MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName));
+  if (!MAI)
+    return make_error<StringError>("no asm info for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  MOFI.reset(new MCObjectFileInfo);
+  MC.reset(new MCContext(MAI.get(), MRI.get(), MOFI.get()));
+  MOFI->InitMCObjectFileInfo(TheTriple, /*PIC*/ false, CodeModel::Default, *MC);
+
+  MCTargetOptions Options;
+  MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, "", Options);
+  if (!MAB)
+    return make_error<StringError>("no asm backend for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  MII.reset(TheTarget->createMCInstrInfo());
+  if (!MII)
+    return make_error<StringError>("no instr info info for target " +
+                                       TripleName,
+                                   inconvertibleErrorCode());
+
+  MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+  if (!MSTI)
+    return make_error<StringError>("no subtarget info for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *MC);
+  if (!MCE)
+    return make_error<StringError>("no code emitter for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  Stream = make_unique<raw_svector_ostream>(FileBytes);
+
+  MCTargetOptions MCOptions = InitMCTargetOptionsFromFlags();
+  MS = TheTarget->createMCObjectStreamer(
+      TheTriple, *MC, *MAB, *Stream, MCE, *MSTI, MCOptions.MCRelaxAll,
+      MCOptions.MCIncrementalLinkerCompatible,
+      /*DWARFMustBeAtTheEnd*/ false);
+  if (!MS)
+    return make_error<StringError>("no object streamer for target " +
+                                       TripleName,
+                                   inconvertibleErrorCode());
+
+  // Finally create the AsmPrinter we'll use to emit the DIEs.
+  TM.reset(TheTarget->createTargetMachine(TripleName, "", "", TargetOptions(),
+                                          None));
+  if (!TM)
+    return make_error<StringError>("no target machine for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  Asm.reset(TheTarget->createAsmPrinter(*TM, std::unique_ptr<MCStreamer>(MS)));
+  if (!Asm)
+    return make_error<StringError>("no asm printer for target " + TripleName,
+                                   inconvertibleErrorCode());
+
+  // Set the DWARF version correctly on all classes that we use.
+  MC->setDwarfVersion(Version);
+  Asm->setDwarfVersion(Version);
+
+  StringPool = new DwarfStringPool(Allocator, *Asm, StringRef());
+
+  return Error::success();
+}
+
+StringRef dwarfgen::Generator::generate() {
+  // The offset of the first CU in the .debug_info section starts at 0.
+  unsigned SecOffset = 0;
+
+  // Iterate over each compile unit and set the size and offsets for each
+  // DIE within each compile unit. All offsets are CU relative.
+  for (auto &CU : CompileUnits) {
+    // Set the absolute .debug_info offset for this compile unit.
+    CU->setOffset(SecOffset);
+    // The DIEs contain compile unit relative offsets.
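+    // The initial value of 11 is the size of the DWARF32 compile unit header:
+    // 4-byte unit_length + 2-byte version + 4-byte debug_abbrev_offset +
+    // 1-byte address_size. DIE offsets start right after it.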
+    unsigned CUOffset = 11;
+    CUOffset = CU->getUnitDIE().computeSizeAndOffsets(CUOffset);
+    // Update our absolute .debug_info offset.
+    SecOffset += CUOffset;
+    CU->setLength(CUOffset - 4);
+  }
+  Abbreviations.Emit(Asm.get(), MOFI->getDwarfAbbrevSection());
+  StringPool->emit(*Asm, MOFI->getDwarfStrSection());
+  MS->SwitchSection(MOFI->getDwarfInfoSection());
+  for (auto &CU : CompileUnits) {
+    uint16_t Version = CU->getVersion();
+    auto Length = CU->getLength();
+    MC->setDwarfVersion(Version);
+    assert(Length != -1U);
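+    // Emit the DWARF32 compile unit header (unit_length, version,
+    // debug_abbrev_offset of 0, address_size) followed by the unit DIE tree.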
+    Asm->EmitInt32(Length);
+    Asm->EmitInt16(Version);
+    Asm->EmitInt32(0);
+    Asm->EmitInt8(CU->getAddressSize());
+    Asm->emitDwarfDIE(*CU->getUnitDIE().Die);
+  }
+
+  MS->Finish();
+  if (FileBytes.empty())
+    return StringRef();
+  return StringRef(FileBytes.data(), FileBytes.size());
+}
+
+bool dwarfgen::Generator::saveFile(StringRef Path) {
+  if (FileBytes.empty())
+    return false;
+  std::error_code EC;
+  raw_fd_ostream Strm(Path, EC, sys::fs::F_None);
+  if (EC)
+    return false;
+  Strm.write(FileBytes.data(), FileBytes.size());
+  Strm.close();
+  return true;
+}
+
+dwarfgen::CompileUnit &dwarfgen::Generator::addCompileUnit() {
+  CompileUnits.push_back(std::unique_ptr<CompileUnit>(
+      new CompileUnit(*this, Version, Asm->getPointerSize())));
+  return *CompileUnits.back();
+}
diff --git a/unittests/DebugInfo/DWARF/DwarfGenerator.h b/unittests/DebugInfo/DWARF/DwarfGenerator.h
new file mode 100644
index 0000000..f3d2413
--- /dev/null
+++ b/unittests/DebugInfo/DWARF/DwarfGenerator.h
@@ -0,0 +1,231 @@
+//===--- unittests/DebugInfo/DWARF/DwarfGenerator.h -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A file that can generate DWARF debug info for unit tests.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UNITTESTS_DEBUG_INFO_DWARF_DWARFGENERATOR_H
+#define LLVM_UNITTESTS_DEBUG_INFO_DWARF_DWARFGENERATOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/Support/Error.h"
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace llvm {
+
+class AsmPrinter;
+class DIE;
+class DIEAbbrev;
+class DwarfStringPool;
+class MCAsmBackend;
+class MCAsmInfo;
+class MCCodeEmitter;
+class MCContext;
+struct MCDwarfLineTableParams;
+class MCInstrInfo;
+class MCObjectFileInfo;
+class MCRegisterInfo;
+class MCStreamer;
+class MCSubtargetInfo;
+class raw_fd_ostream;
+class TargetMachine;
+class Triple;
+
+namespace dwarfgen {
+
+class Generator;
+class CompileUnit;
+
+/// A DWARF debug information entry class used to generate DWARF DIEs.
+///
+/// This class is used to quickly generate DWARF debug information by creating
+/// child DIEs or adding attributes to the current DIE. Instances of this class
+/// are created from the compile unit (dwarfgen::CompileUnit::getUnitDIE()) or
+/// by calling dwarfgen::DIE::addChild(...) and using the returned DIE object.
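+///
+/// A minimal usage sketch (DG is assumed to be a dwarfgen::Generator *; the
+/// attribute values are illustrative only):
+/// \code
+///   dwarfgen::CompileUnit &CU = DG->addCompileUnit();
+///   dwarfgen::DIE CUDie = CU.getUnitDIE();
+///   CUDie.addAttribute(dwarf::DW_AT_name, dwarf::DW_FORM_strp, "/tmp/main.c");
+///   dwarfgen::DIE Child = CUDie.addChild(dwarf::DW_TAG_base_type);
+///   Child.addAttribute(dwarf::DW_AT_name, dwarf::DW_FORM_strp, "int");
+/// \endcode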
+class DIE {
+  dwarfgen::CompileUnit *CU;
+  llvm::DIE *Die;
+
+protected:
+  friend class Generator;
+  friend class CompileUnit;
+
+  DIE(CompileUnit *U = nullptr, llvm::DIE *D = nullptr) : CU(U), Die(D) {}
+
+  /// Called with a compile/type unit relative offset prior to generating the
+  /// DWARF debug info.
+  ///
+  /// \param CUOffset the compile/type unit relative offset where the
+  /// abbreviation code for this DIE will be encoded.
+  unsigned computeSizeAndOffsets(unsigned CUOffset);
+
+public:
+  /// Add an attribute value that has no value.
+  ///
+  /// \param Attr a dwarf::Attribute enumeration value or any uint16_t that
+  /// represents a user defined DWARF attribute.
+  /// \param Form the dwarf::Form to use when encoding the attribute. This is
+  /// only used with the DW_FORM_flag_present form encoding.
+  void addAttribute(uint16_t Attr, dwarf::Form Form);
+
+  /// Add an attribute value to be encoded as a DIEInteger
+  ///
+  /// \param Attr a dwarf::Attribute enumeration value or any uint16_t that
+  /// represents a user defined DWARF attribute.
+  /// \param Form the dwarf::Form to use when encoding the attribute.
+  /// \param U the unsigned integer to encode.
+  void addAttribute(uint16_t Attr, dwarf::Form Form, uint64_t U);
+
+  /// Add an attribute value to be encoded as a DIEString or DIEInlineString.
+  ///
+  /// \param Attr a dwarf::Attribute enumeration value or any uint16_t that
+  /// represents a user defined DWARF attribute.
+  /// \param Form the dwarf::Form to use when encoding the attribute. The form
+  /// must be one of DW_FORM_strp or DW_FORM_string.
+  /// \param String the string to encode.
+  void addAttribute(uint16_t Attr, dwarf::Form Form, StringRef String);
+
+  /// Add an attribute value to be encoded as a DIEEntry.
+  ///
+  /// DIEEntry attributes refer to other llvm::DIE objects that have been
+  /// created.
+  ///
+  /// \param Attr a dwarf::Attribute enumeration value or any uint16_t that
+  /// represents a user defined DWARF attribute.
+  /// \param Form the dwarf::Form to use when encoding the attribute. The form
+  /// should be one of the reference forms, such as DW_FORM_ref1, DW_FORM_ref4
+  /// or DW_FORM_ref_addr.
+  /// \param RefDie the DIE that this attribute refers to.
+  void addAttribute(uint16_t Attr, dwarf::Form Form, dwarfgen::DIE &RefDie);
+
+  /// Add an attribute value to be encoded as a DIEBlock.
+  ///
+  /// DIEBlock attributes refer to binary data that is stored as the
+  /// attribute's value.
+  ///
+  /// \param Attr a dwarf::Attribute enumeration value or any uint16_t that
+  /// represents a user defined DWARF attribute.
+  /// \param Form the dwarf::Form to use when encoding the attribute. The form
+  /// should be one of the block forms, such as DW_FORM_block, DW_FORM_block1,
+  /// DW_FORM_block2 or DW_FORM_block4.
+  /// \param P a pointer to the data to store as the attribute value.
+  /// \param S the size in bytes of the data pointed to by P.
+  void addAttribute(uint16_t Attr, dwarf::Form Form, const void *P, size_t S);
+
+  /// Add a new child to this DIE object.
+  ///
+  /// \param Tag the dwarf::Tag to assign to the llvm::DIE object.
+  /// \returns the newly created DIE object that is now a child owned by this
+  /// object.
+  dwarfgen::DIE addChild(dwarf::Tag Tag);
+};
+
+/// A DWARF compile unit used to generate DWARF compile/type units.
+///
+/// Instances of these classes are created by instances of the Generator
+/// class. All information required to generate a DWARF compile unit is
+/// contained inside this class.
+class CompileUnit {
+  Generator &DG;
+  DIEUnit DU;
+
+public:
+  CompileUnit(Generator &D, uint16_t V, uint8_t A)
+      : DG(D), DU(V, A, dwarf::DW_TAG_compile_unit) {}
+  DIE getUnitDIE();
+  Generator &getGenerator() { return DG; }
+  uint64_t getOffset() const { return DU.getDebugSectionOffset(); }
+  uint64_t getLength() const { return DU.getLength(); }
+  uint16_t getVersion() const { return DU.getDwarfVersion(); }
+  uint16_t getAddressSize() const { return DU.getAddressSize(); }
+  void setOffset(uint64_t Offset) { DU.setDebugSectionOffset(Offset); }
+  void setLength(uint64_t Length) { DU.setLength(Length); }
+};
+
+/// A DWARF generator.
+///
+/// Generate DWARF for unit tests by creating an instance of this class and
+/// calling Generator::addCompileUnit(), and then getting the dwarfgen::DIE from
+/// the returned compile unit and adding attributes and children to each DIE.
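+///
+/// A minimal sketch of the intended flow (the triple and DWARF version are
+/// illustrative; error handling is elided):
+/// \code
+///   auto ExpectedDG = dwarfgen::Generator::create(Triple("x86_64--"), 4);
+///   dwarfgen::Generator *DG = ExpectedDG.get().get();
+///   dwarfgen::CompileUnit &CU = DG->addCompileUnit();
+///   CU.getUnitDIE().addAttribute(dwarf::DW_AT_name, dwarf::DW_FORM_strp,
+///                                "/tmp/main.c");
+///   StringRef ELFBytes = DG->generate();
+/// \endcode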
+class Generator {
+  std::unique_ptr<MCRegisterInfo> MRI;
+  std::unique_ptr<MCAsmInfo> MAI;
+  std::unique_ptr<MCObjectFileInfo> MOFI;
+  std::unique_ptr<MCContext> MC;
+  MCAsmBackend *MAB; // Owned by MCStreamer
+  std::unique_ptr<MCInstrInfo> MII;
+  std::unique_ptr<MCSubtargetInfo> MSTI;
+  MCCodeEmitter *MCE; // Owned by MCStreamer
+  MCStreamer *MS;     // Owned by AsmPrinter
+  std::unique_ptr<TargetMachine> TM;
+  std::unique_ptr<AsmPrinter> Asm;
+  DwarfStringPool *StringPool; // Owned by Allocator
+  std::vector<std::unique_ptr<CompileUnit>> CompileUnits;
+  BumpPtrAllocator Allocator;
+  DIEAbbrevSet Abbreviations;
+
+  SmallString<4096> FileBytes;
+  /// The stream into which the DWARF is generated as an ELF file.
+  std::unique_ptr<raw_svector_ostream> Stream;
+  /// The DWARF version to generate.
+  uint16_t Version;
+
+  /// Private constructor; call Generator::create(...) to get an
+  /// llvm::Expected<std::unique_ptr<Generator>>.
+  Generator();
+
+  /// Create the streamer and set up the output buffer.
+  llvm::Error init(Triple TheTriple, uint16_t DwarfVersion);
+
+public:
+  /// Create a DWARF generator or get an appropriate error.
+  ///
+  /// \param TheTriple the triple to use when creating any required support
+  /// classes needed to emit the DWARF.
+  /// \param DwarfVersion the version of DWARF to emit.
+  ///
+  /// \returns an llvm::Expected that either contains a unique_ptr to a
+  /// Generator or an llvm::Error.
+  static llvm::Expected<std::unique_ptr<Generator>>
+  create(Triple TheTriple, uint16_t DwarfVersion);
+
+  ~Generator();
+
+  /// Generate all DWARF sections and return the bytes of an ELF file that
+  /// contains the generated DWARF.
+  StringRef generate();
+
+  /// Add a compile unit to be generated.
+  ///
+  /// \returns a dwarfgen::CompileUnit whose unit dwarfgen::DIE can be used to
+  /// add attributes and child DIE objects.
+  dwarfgen::CompileUnit &addCompileUnit();
+
+  BumpPtrAllocator &getAllocator() { return Allocator; }
+  AsmPrinter *getAsmPrinter() const { return Asm.get(); }
+  DIEAbbrevSet &getAbbrevSet() { return Abbreviations; }
+  DwarfStringPool &getStringPool() { return *StringPool; }
+
+  /// Save the generated DWARF file to disk.
+  ///
+  /// \param Path the path to save the ELF file to.
+  bool saveFile(StringRef Path);
+};
+
+} // end namespace dwarfgen
+
+} // end namespace llvm
+
+#endif // LLVM_UNITTESTS_DEBUG_INFO_DWARF_DWARFGENERATOR_H
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index 0a3442e..ab6785c 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -2011,19 +2011,20 @@
   // Valid constructions.
   EXPECT_VALID(dwarf::DW_OP_plus, 6);
   EXPECT_VALID(dwarf::DW_OP_deref);
-  EXPECT_VALID(dwarf::DW_OP_bit_piece, 3, 7);
+  EXPECT_VALID(dwarf::DW_OP_LLVM_fragment, 3, 7);
   EXPECT_VALID(dwarf::DW_OP_plus, 6, dwarf::DW_OP_deref);
   EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_plus, 6);
-  EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_bit_piece, 3, 7);
-  EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_plus, 6, dwarf::DW_OP_bit_piece, 3, 7);
+  EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_LLVM_fragment, 3, 7);
+  EXPECT_VALID(dwarf::DW_OP_deref, dwarf::DW_OP_plus, 6,
+               dwarf::DW_OP_LLVM_fragment, 3, 7);
 
   // Invalid constructions.
   EXPECT_INVALID(~0u);
   EXPECT_INVALID(dwarf::DW_OP_plus);
-  EXPECT_INVALID(dwarf::DW_OP_bit_piece);
-  EXPECT_INVALID(dwarf::DW_OP_bit_piece, 3);
-  EXPECT_INVALID(dwarf::DW_OP_bit_piece, 3, 7, dwarf::DW_OP_plus, 3);
-  EXPECT_INVALID(dwarf::DW_OP_bit_piece, 3, 7, dwarf::DW_OP_deref);
+  EXPECT_INVALID(dwarf::DW_OP_LLVM_fragment);
+  EXPECT_INVALID(dwarf::DW_OP_LLVM_fragment, 3);
+  EXPECT_INVALID(dwarf::DW_OP_LLVM_fragment, 3, 7, dwarf::DW_OP_plus, 3);
+  EXPECT_INVALID(dwarf::DW_OP_LLVM_fragment, 3, 7, dwarf::DW_OP_deref);
 
 #undef EXPECT_VALID
 #undef EXPECT_INVALID
diff --git a/unittests/Support/FormatVariadicTest.cpp b/unittests/Support/FormatVariadicTest.cpp
index 7dd23cf..ca6df4c 100644
--- a/unittests/Support/FormatVariadicTest.cpp
+++ b/unittests/Support/FormatVariadicTest.cpp
@@ -13,6 +13,35 @@
 
 using namespace llvm;
 
+// Compile-time tests for the uses_format_member template
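+// uses_format_member<T> should be true only when T's format() member can be
+// invoked on a value with that cv-qualification; a non-const format()
+// therefore disqualifies const-qualified Ts, while a const format() works for
+// all of them.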
+namespace {
+struct ConstFormat {
+  void format(raw_ostream &OS, StringRef Opt) const { OS << "ConstFormat"; }
+};
+
+struct Format {
+  void format(raw_ostream &OS, StringRef Opt) { OS << "Format"; }
+};
+
+using detail::uses_format_member;
+
+static_assert(uses_format_member<Format>::value, "");
+static_assert(uses_format_member<Format &>::value, "");
+static_assert(uses_format_member<Format &&>::value, "");
+static_assert(!uses_format_member<const Format>::value, "");
+static_assert(!uses_format_member<const Format &>::value, "");
+static_assert(!uses_format_member<const volatile Format>::value, "");
+static_assert(!uses_format_member<const volatile Format &>::value, "");
+
+static_assert(uses_format_member<ConstFormat>::value, "");
+static_assert(uses_format_member<ConstFormat &>::value, "");
+static_assert(uses_format_member<ConstFormat &&>::value, "");
+static_assert(uses_format_member<const ConstFormat>::value, "");
+static_assert(uses_format_member<const ConstFormat &>::value, "");
+static_assert(uses_format_member<const volatile ConstFormat>::value, "");
+static_assert(uses_format_member<const volatile ConstFormat &>::value, "");
+}
+
 TEST(FormatVariadicTest, EmptyFormatString) {
   auto Replacements = formatv_object_base::parseFormatString("");
   EXPECT_EQ(0U, Replacements.size());
@@ -511,7 +540,7 @@
 
   public:
     explicit Negative(int N) : N(N) {}
-    void format(raw_ostream &S, StringRef Options) { S << -N; }
+    void format(raw_ostream &S, StringRef Options) const { S << -N; }
   };
 
   EXPECT_EQ("-7", formatv("{0}", Negative(7)).str());
@@ -535,4 +564,27 @@
 
   SmallString<4> S2 = formatv("{0} {1}", 1, 2);
   EXPECT_EQ("1 2", S2);
-}
\ No newline at end of file
+}
+
+TEST(FormatVariadicTest, FormatMember) {
+  EXPECT_EQ("Format", formatv("{0}", Format()).str());
+
+  Format var;
+  EXPECT_EQ("Format", formatv("{0}", var).str());
+  EXPECT_EQ("Format", formatv("{0}", std::move(var)).str());
+
+  // Not supposed to compile
+  // const Format cvar{};
+  // EXPECT_EQ("Format", formatv("{0}", cvar).str());
+}
+
+TEST(FormatVariadicTest, FormatMemberConst) {
+  EXPECT_EQ("ConstFormat", formatv("{0}", ConstFormat()).str());
+
+  ConstFormat var;
+  EXPECT_EQ("ConstFormat", formatv("{0}", var).str());
+  EXPECT_EQ("ConstFormat", formatv("{0}", std::move(var)).str());
+
+  const ConstFormat cvar{};
+  EXPECT_EQ("ConstFormat", formatv("{0}", cvar).str());
+}
diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp
index 8040be0..c84313a 100644
--- a/unittests/Support/TargetParserTest.cpp
+++ b/unittests/Support/TargetParserTest.cpp
@@ -16,111 +16,397 @@
 using namespace llvm;
 
 namespace {
-static const unsigned kAArch64ArchExtKinds[] = {
-#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) ID,
-#include "llvm/Support/AArch64TargetParser.def"
-#undef AARCH64_ARCH_EXT_NAME
-};
-
-template <typename T> struct ArchNames {
-  const char *Name;
-  unsigned DefaultFPU;
-  unsigned ArchBaseExtensions;
-  T ID;
-  ARMBuildAttrs::CPUArch ArchAttr;
-};
-ArchNames<AArch64::ArchKind> kAArch64ARCHNames[] = {
-#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU,        \
-                     ARCH_BASE_EXT)                                            \
-  {NAME, ARM::ARCH_FPU, ARCH_BASE_EXT, AArch64::ArchKind::ID, ARCH_ATTR},
-#include "llvm/Support/AArch64TargetParser.def"
-#undef AARCH64_ARCH
-};
-ArchNames<ARM::ArchKind> kARMARCHNames[] = {
-#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU,            \
-                 ARCH_BASE_EXT)                                                \
-  {NAME, ARM::ARCH_FPU, ARCH_BASE_EXT, ARM::ID, ARCH_ATTR},
-#include "llvm/Support/ARMTargetParser.def"
-#undef ARM_ARCH
-};
-
-template <typename T> struct CpuNames {
-  const char *Name;
-  T ID;
-  unsigned DefaultFPU;
-  unsigned DefaultExt;
-};
-CpuNames<AArch64::ArchKind> kAArch64CPUNames[] = {
-#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)       \
-  {NAME, AArch64::ArchKind::ID, ARM::DEFAULT_FPU, DEFAULT_EXT},
-#include "llvm/Support/AArch64TargetParser.def"
-#undef AARCH64_CPU_NAME
-};
-CpuNames<ARM::ArchKind> kARMCPUNames[] = {
-#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)           \
-  {NAME, ARM::ID, ARM::DEFAULT_FPU, DEFAULT_EXT},
-#include "llvm/Support/ARMTargetParser.def"
-#undef ARM_CPU_NAME
-};
-
 const char *ARMArch[] = {
-    "armv2",        "armv2a",      "armv3",    "armv3m",       "armv4",
-    "armv4t",       "armv5",       "armv5t",   "armv5e",       "armv5te",
-    "armv5tej",     "armv6",       "armv6j",   "armv6k",       "armv6hl",
-    "armv6t2",      "armv6kz",     "armv6z",   "armv6zk",      "armv6-m",
-    "armv6m",       "armv6sm",     "armv6s-m", "armv7-a",      "armv7",
-    "armv7a",       "armv7hl",     "armv7l",   "armv7-r",      "armv7r",
-    "armv7-m",      "armv7m",      "armv7k",   "armv7s",       "armv7e-m",
-    "armv7em",      "armv8-a",     "armv8",    "armv8a",       "armv8.1-a",
-    "armv8.1a",     "armv8.2-a",   "armv8.2a", "armv8-m.base", "armv8m.base",
-    "armv8-m.main", "armv8m.main", "iwmmxt",   "iwmmxt2",      "xscale"};
+    "armv2",        "armv2a",      "armv3",        "armv3m",      "armv4",
+    "armv4t",       "armv5",       "armv5t",       "armv5e",      "armv5te",
+    "armv5tej",     "armv6",       "armv6j",       "armv6k",      "armv6hl",
+    "armv6t2",      "armv6kz",     "armv6z",       "armv6zk",     "armv6-m",
+    "armv6m",       "armv6sm",     "armv6s-m",     "armv7-a",     "armv7",
+    "armv7a",       "armv7hl",     "armv7l",       "armv7-r",     "armv7r",
+    "armv7-m",      "armv7m",      "armv7k",       "armv7s",      "armv7e-m",
+    "armv7em",      "armv8-a",     "armv8",        "armv8a",      "armv8.1-a",
+    "armv8.1a",     "armv8.2-a",   "armv8.2a",     "armv8-r",     "armv8r",
+    "armv8-m.base", "armv8m.base", "armv8-m.main", "armv8m.main", "iwmmxt",
+    "iwmmxt2",      "xscale"};
 
-template <typename T, size_t N>
-bool contains(const T (&array)[N], const T element) {
-  return std::find(std::begin(array), std::end(array), element) !=
-         std::end(array);
+bool testARMCPU(StringRef CPUName, StringRef ExpectedArch,
+                StringRef ExpectedFPU, unsigned ExpectedFlags,
+                StringRef CPUAttr) {
+  unsigned ArchKind = ARM::parseCPUArch(CPUName);
+  bool pass = ARM::getArchName(ArchKind).equals(ExpectedArch);
+  unsigned FPUKind = ARM::getDefaultFPU(CPUName, ArchKind);
+  pass &= ARM::getFPUName(FPUKind).equals(ExpectedFPU);
+
+  unsigned ExtKind = ARM::getDefaultExtensions(CPUName, ArchKind);
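+  // Some CPUs report AEK_NONE together with real extension bits; strip it
+  // before comparing against the expected flags.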
+  if (ExtKind > 1 && (ExtKind & ARM::AEK_NONE))
+    pass &= ((ExtKind ^ ARM::AEK_NONE) == ExpectedFlags);
+  else
+    pass &= (ExtKind == ExpectedFlags);
+
+  pass &= ARM::getCPUAttr(ArchKind).equals(CPUAttr);
+
+  return pass;
 }
 
-template <size_t N>
-bool contains(const char *(&array)[N], const char *element) {
-  return std::find_if(std::begin(array), std::end(array), [&](const char *S) {
-           return ::strcmp(S, element) == 0;
-         }) != std::end(array);
+TEST(TargetParserTest, testARMCPU) {
+  EXPECT_TRUE(testARMCPU("invalid", "invalid", "invalid",
+                         ARM::AEK_NONE, ""));
+  EXPECT_TRUE(testARMCPU("generic", "invalid", "none",
+                         ARM::AEK_NONE, ""));
+
+  EXPECT_TRUE(testARMCPU("arm2", "armv2", "none",
+                         ARM::AEK_NONE, "2"));
+  EXPECT_TRUE(testARMCPU("arm3", "armv2a", "none",
+                         ARM::AEK_NONE, "2A"));
+  EXPECT_TRUE(testARMCPU("arm6", "armv3", "none",
+                         ARM::AEK_NONE, "3"));
+  EXPECT_TRUE(testARMCPU("arm7m", "armv3m", "none",
+                         ARM::AEK_NONE, "3M"));
+  EXPECT_TRUE(testARMCPU("arm8", "armv4", "none",
+                         ARM::AEK_NONE, "4"));
+  EXPECT_TRUE(testARMCPU("arm810", "armv4", "none",
+                         ARM::AEK_NONE, "4"));
+  EXPECT_TRUE(testARMCPU("strongarm", "armv4", "none",
+                         ARM::AEK_NONE, "4"));
+  EXPECT_TRUE(testARMCPU("strongarm110", "armv4", "none",
+                         ARM::AEK_NONE, "4"));
+  EXPECT_TRUE(testARMCPU("strongarm1100", "armv4", "none",
+                         ARM::AEK_NONE, "4"));
+  EXPECT_TRUE(testARMCPU("strongarm1110", "armv4", "none",
+                         ARM::AEK_NONE, "4"));
+  EXPECT_TRUE(testARMCPU("arm7tdmi", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm7tdmi-s", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm710t", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm720t", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm9", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm9tdmi", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm920", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm920t", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm922t", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm9312", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm940t", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("ep9312", "armv4t", "none",
+                         ARM::AEK_NONE, "4T"));
+  EXPECT_TRUE(testARMCPU("arm10tdmi", "armv5t", "none",
+                         ARM::AEK_NONE, "5T"));
+  EXPECT_TRUE(testARMCPU("arm1020t", "armv5t", "none",
+                         ARM::AEK_NONE, "5T"));
+  EXPECT_TRUE(testARMCPU("arm9e", "armv5te", "none",
+                         ARM::AEK_DSP, "5TE"));
+  EXPECT_TRUE(testARMCPU("arm946e-s", "armv5te", "none",
+                         ARM::AEK_DSP, "5TE"));
+  EXPECT_TRUE(testARMCPU("arm966e-s", "armv5te", "none",
+                         ARM::AEK_DSP, "5TE"));
+  EXPECT_TRUE(testARMCPU("arm968e-s", "armv5te", "none",
+                         ARM::AEK_DSP, "5TE"));
+  EXPECT_TRUE(testARMCPU("arm10e", "armv5te", "none",
+                         ARM::AEK_DSP, "5TE"));
+  EXPECT_TRUE(testARMCPU("arm1020e", "armv5te", "none",
+                         ARM::AEK_DSP, "5TE"));
+  EXPECT_TRUE(testARMCPU("arm1022e", "armv5te", "none",
+                         ARM::AEK_DSP, "5TE"));
+  EXPECT_TRUE(testARMCPU("arm926ej-s", "armv5tej", "none",
+                         ARM::AEK_DSP, "5TEJ"));
+  EXPECT_TRUE(testARMCPU("arm1136j-s", "armv6", "none",
+                         ARM::AEK_DSP, "6"));
+  EXPECT_TRUE(testARMCPU("arm1136jf-s", "armv6", "vfpv2",
+                         ARM::AEK_DSP, "6"));
+  EXPECT_TRUE(testARMCPU("arm1136jz-s", "armv6", "none",
+                         ARM::AEK_DSP, "6"));
+  EXPECT_TRUE(testARMCPU("arm1176j-s", "armv6k", "none",
+                         ARM::AEK_DSP, "6K"));
+  EXPECT_TRUE(testARMCPU("arm1176jz-s", "armv6kz", "none",
+                         ARM::AEK_SEC | ARM::AEK_DSP, "6KZ"));
+  EXPECT_TRUE(testARMCPU("mpcore", "armv6k", "vfpv2",
+                         ARM::AEK_DSP, "6K"));
+  EXPECT_TRUE(testARMCPU("mpcorenovfp", "armv6k", "none",
+                         ARM::AEK_DSP, "6K"));
+  EXPECT_TRUE(testARMCPU("arm1176jzf-s", "armv6kz", "vfpv2",
+                         ARM::AEK_SEC | ARM::AEK_DSP, "6KZ"));
+  EXPECT_TRUE(testARMCPU("arm1156t2-s", "armv6t2", "none",
+                         ARM::AEK_DSP, "6T2"));
+  EXPECT_TRUE(testARMCPU("arm1156t2f-s", "armv6t2", "vfpv2",
+                         ARM::AEK_DSP, "6T2"));
+  EXPECT_TRUE(testARMCPU("cortex-m0", "armv6-m", "none",
+                         ARM::AEK_NONE, "6-M"));
+  EXPECT_TRUE(testARMCPU("cortex-m0plus", "armv6-m", "none",
+                         ARM::AEK_NONE, "6-M"));
+  EXPECT_TRUE(testARMCPU("cortex-m1", "armv6-m", "none",
+                         ARM::AEK_NONE, "6-M"));
+  EXPECT_TRUE(testARMCPU("sc000", "armv6-m", "none",
+                         ARM::AEK_NONE, "6-M"));
+  EXPECT_TRUE(testARMCPU("cortex-a5", "armv7-a", "neon-vfpv4",
+                         ARM::AEK_MP | ARM::AEK_SEC | ARM::AEK_DSP, "7-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a7", "armv7-a", "neon-vfpv4",
+                         ARM::AEK_HWDIV | ARM::AEK_HWDIVARM | ARM::AEK_MP |
+                         ARM::AEK_SEC | ARM::AEK_VIRT | ARM::AEK_DSP,
+                         "7-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a8", "armv7-a", "neon",
+                         ARM::AEK_SEC | ARM::AEK_DSP, "7-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a9", "armv7-a", "neon-fp16",
+                         ARM::AEK_MP | ARM::AEK_SEC | ARM::AEK_DSP, "7-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a12", "armv7-a", "neon-vfpv4",
+                         ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT |
+                         ARM::AEK_HWDIVARM | ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "7-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a15", "armv7-a", "neon-vfpv4",
+                         ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT |
+                         ARM::AEK_HWDIVARM | ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "7-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a17", "armv7-a", "neon-vfpv4",
+                         ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT |
+                         ARM::AEK_HWDIVARM | ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "7-A"));
+  EXPECT_TRUE(testARMCPU("krait", "armv7-a", "neon-vfpv4",
+                         ARM::AEK_HWDIVARM | ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "7-A"));
+  EXPECT_TRUE(testARMCPU("cortex-r4", "armv7-r", "none",
+                         ARM::AEK_HWDIV | ARM::AEK_DSP, "7-R"));
+  EXPECT_TRUE(testARMCPU("cortex-r4f", "armv7-r", "vfpv3-d16",
+                         ARM::AEK_HWDIV | ARM::AEK_DSP, "7-R"));
+  EXPECT_TRUE(testARMCPU("cortex-r5", "armv7-r", "vfpv3-d16",
+                         ARM::AEK_MP | ARM::AEK_HWDIVARM | ARM::AEK_HWDIV |
+                         ARM::AEK_DSP, "7-R"));
+  EXPECT_TRUE(testARMCPU("cortex-r7", "armv7-r", "vfpv3-d16-fp16",
+                         ARM::AEK_MP | ARM::AEK_HWDIVARM | ARM::AEK_HWDIV |
+                         ARM::AEK_DSP, "7-R"));
+  EXPECT_TRUE(testARMCPU("cortex-r8", "armv7-r", "vfpv3-d16-fp16",
+                         ARM::AEK_MP | ARM::AEK_HWDIVARM | ARM::AEK_HWDIV |
+                         ARM::AEK_DSP, "7-R"));
+  EXPECT_TRUE(testARMCPU("cortex-r52", "armv8-r", "neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_MP | ARM::AEK_VIRT |
+                         ARM::AEK_HWDIVARM | ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-R"));
+  EXPECT_TRUE(testARMCPU("sc300", "armv7-m", "none",
+                         ARM::AEK_HWDIV, "7-M"));
+  EXPECT_TRUE(testARMCPU("cortex-m3", "armv7-m", "none",
+                         ARM::AEK_HWDIV, "7-M"));
+  EXPECT_TRUE(testARMCPU("cortex-m4", "armv7e-m", "fpv4-sp-d16",
+                         ARM::AEK_HWDIV | ARM::AEK_DSP, "7E-M"));
+  EXPECT_TRUE(testARMCPU("cortex-m7", "armv7e-m", "fpv5-d16",
+                         ARM::AEK_HWDIV | ARM::AEK_DSP, "7E-M"));
+  EXPECT_TRUE(testARMCPU("cortex-a32", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a35", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a53", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a57", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a72", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("cortex-a73", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("cyclone", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("exynos-m1", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("exynos-m2", "armv8-a", "crypto-neon-fp-armv8",
+                         ARM::AEK_CRC | ARM::AEK_SEC | ARM::AEK_MP |
+                         ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+                         ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "8-A"));
+  EXPECT_TRUE(testARMCPU("iwmmxt", "iwmmxt", "none",
+                         ARM::AEK_NONE, "iwmmxt"));
+  EXPECT_TRUE(testARMCPU("xscale", "xscale", "none",
+                         ARM::AEK_NONE, "xscale"));
+  EXPECT_TRUE(testARMCPU("swift", "armv7s", "neon-vfpv4",
+                         ARM::AEK_HWDIVARM | ARM::AEK_HWDIV | ARM::AEK_DSP,
+                         "7-S"));
 }
 
-TEST(TargetParserTest, ARMArchName) {
-  for (ARM::ArchKind AK = static_cast<ARM::ArchKind>(0);
-       AK <= ARM::ArchKind::AK_LAST;
-       AK = static_cast<ARM::ArchKind>(static_cast<unsigned>(AK) + 1))
-    EXPECT_TRUE(AK == ARM::AK_LAST ? ARM::getArchName(AK).empty()
-                                   : !ARM::getArchName(AK).empty());
+bool testARMArch(StringRef Arch, StringRef DefaultCPU, StringRef SubArch,
+                 unsigned ArchAttr) {
+  unsigned ArchKind = ARM::parseArch(Arch);
+  return (ArchKind != ARM::AK_INVALID) &
+         ARM::getDefaultCPU(Arch).equals(DefaultCPU) &
+         ARM::getSubArch(ArchKind).equals(SubArch) &
+         (ARM::getArchAttr(ArchKind) == ArchAttr);
 }
 
-TEST(TargetParserTest, ARMCPUAttr) {
-  for (ARM::ArchKind AK = static_cast<ARM::ArchKind>(0);
-       AK <= ARM::ArchKind::AK_LAST;
-       AK = static_cast<ARM::ArchKind>(static_cast<unsigned>(AK) + 1))
-    EXPECT_TRUE((AK == ARM::AK_INVALID || AK == ARM::AK_LAST)
-                    ? ARM::getCPUAttr(AK).empty()
-                    : !ARM::getCPUAttr(AK).empty());
+TEST(TargetParserTest, testARMArch) {
+  EXPECT_TRUE(
+      testARMArch("armv2", "arm2", "v2",
+                          ARMBuildAttrs::CPUArch::Pre_v4));
+  EXPECT_TRUE(
+      testARMArch("armv2a", "arm3", "v2a",
+                          ARMBuildAttrs::CPUArch::Pre_v4));
+  EXPECT_TRUE(
+      testARMArch("armv3", "arm6", "v3",
+                          ARMBuildAttrs::CPUArch::Pre_v4));
+  EXPECT_TRUE(
+      testARMArch("armv3m", "arm7m", "v3m",
+                          ARMBuildAttrs::CPUArch::Pre_v4));
+  EXPECT_TRUE(
+      testARMArch("armv4", "strongarm", "v4",
+                          ARMBuildAttrs::CPUArch::v4));
+  EXPECT_TRUE(
+      testARMArch("armv4t", "arm7tdmi", "v4t",
+                          ARMBuildAttrs::CPUArch::v4T));
+  EXPECT_TRUE(
+      testARMArch("armv5t", "arm10tdmi", "v5",
+                          ARMBuildAttrs::CPUArch::v5T));
+  EXPECT_TRUE(
+      testARMArch("armv5te", "arm1022e", "v5e",
+                          ARMBuildAttrs::CPUArch::v5TE));
+  EXPECT_TRUE(
+      testARMArch("armv5tej", "arm926ej-s", "v5e",
+                          ARMBuildAttrs::CPUArch::v5TEJ));
+  EXPECT_TRUE(
+      testARMArch("armv6", "arm1136jf-s", "v6",
+                          ARMBuildAttrs::CPUArch::v6));
+  EXPECT_TRUE(
+      testARMArch("armv6k", "arm1176j-s", "v6k",
+                          ARMBuildAttrs::CPUArch::v6K));
+  EXPECT_TRUE(
+      testARMArch("armv6t2", "arm1156t2-s", "v6t2",
+                          ARMBuildAttrs::CPUArch::v6T2));
+  EXPECT_TRUE(
+      testARMArch("armv6kz", "arm1176jzf-s", "v6kz",
+                          ARMBuildAttrs::CPUArch::v6KZ));
+  EXPECT_TRUE(
+      testARMArch("armv6-m", "cortex-m0", "v6m",
+                          ARMBuildAttrs::CPUArch::v6_M));
+  EXPECT_TRUE(
+      testARMArch("armv7-a", "cortex-a8", "v7",
+                          ARMBuildAttrs::CPUArch::v7));
+  EXPECT_TRUE(
+      testARMArch("armv7-r", "cortex-r4", "v7r",
+                          ARMBuildAttrs::CPUArch::v7));
+  EXPECT_TRUE(
+      testARMArch("armv7-m", "cortex-m3", "v7m",
+                          ARMBuildAttrs::CPUArch::v7));
+  EXPECT_TRUE(
+      testARMArch("armv7e-m", "cortex-m4", "v7em",
+                          ARMBuildAttrs::CPUArch::v7E_M));
+  EXPECT_TRUE(
+      testARMArch("armv8-a", "cortex-a53", "v8",
+                          ARMBuildAttrs::CPUArch::v8_A));
+  EXPECT_TRUE(
+      testARMArch("armv8.1-a", "generic", "v8.1a",
+                          ARMBuildAttrs::CPUArch::v8_A));
+  EXPECT_TRUE(
+      testARMArch("armv8.2-a", "generic", "v8.2a",
+                          ARMBuildAttrs::CPUArch::v8_A));
+  EXPECT_TRUE(
+      testARMArch("armv8-r", "cortex-r52", "v8r",
+                          ARMBuildAttrs::CPUArch::v8_R));
+  EXPECT_TRUE(
+      testARMArch("armv8-m.base", "generic", "v8m.base",
+                          ARMBuildAttrs::CPUArch::v8_M_Base));
+  EXPECT_TRUE(
+      testARMArch("armv8-m.main", "generic", "v8m.main",
+                          ARMBuildAttrs::CPUArch::v8_M_Main));
+  EXPECT_TRUE(
+      testARMArch("iwmmxt", "iwmmxt", "",
+                          ARMBuildAttrs::CPUArch::v5TE));
+  EXPECT_TRUE(
+      testARMArch("iwmmxt2", "generic", "",
+                          ARMBuildAttrs::CPUArch::v5TE));
+  EXPECT_TRUE(
+      testARMArch("xscale", "xscale", "v5e",
+                          ARMBuildAttrs::CPUArch::v5TE));
+  EXPECT_TRUE(
+      testARMArch("armv7s", "swift", "v7s",
+                          ARMBuildAttrs::CPUArch::v7));
+  EXPECT_TRUE(
+      testARMArch("armv7k", "generic", "v7k",
+                          ARMBuildAttrs::CPUArch::v7));
 }
 
-TEST(TargetParserTest, ARMSubArch) {
-  for (ARM::ArchKind AK = static_cast<ARM::ArchKind>(0);
-       AK <= ARM::ArchKind::AK_LAST;
-       AK = static_cast<ARM::ArchKind>(static_cast<unsigned>(AK) + 1))
-    EXPECT_TRUE((AK == ARM::AK_INVALID || AK == ARM::AK_IWMMXT ||
-                 AK == ARM::AK_IWMMXT2 || AK == ARM::AK_LAST)
-                    ? ARM::getSubArch(AK).empty()
-                    : !ARM::getSubArch(AK).empty());
+bool testARMExtension(StringRef CPUName, unsigned ArchKind, StringRef ArchExt) {
+  return ARM::getDefaultExtensions(CPUName, ArchKind) &
+         ARM::parseArchExt(ArchExt);
 }
 
-TEST(TargetParserTest, ARMFPUName) {
-  for (ARM::FPUKind FK = static_cast<ARM::FPUKind>(0);
-       FK <= ARM::FPUKind::FK_LAST;
-       FK = static_cast<ARM::FPUKind>(static_cast<unsigned>(FK) + 1))
-    EXPECT_TRUE(FK == ARM::FK_LAST ? ARM::getFPUName(FK).empty()
-                                   : !ARM::getFPUName(FK).empty());
+TEST(TargetParserTest, testARMExtension) {
+  EXPECT_FALSE(testARMExtension("arm2", 0, "thumb"));
+  EXPECT_FALSE(testARMExtension("arm3", 0, "thumb"));
+  EXPECT_FALSE(testARMExtension("arm6", 0, "thumb"));
+  EXPECT_FALSE(testARMExtension("arm7m", 0, "thumb"));
+  EXPECT_FALSE(testARMExtension("strongarm", 0, "dsp"));
+  EXPECT_FALSE(testARMExtension("arm7tdmi", 0, "dsp"));
+  EXPECT_FALSE(testARMExtension("arm10tdmi", 0, "simd"));
+  EXPECT_FALSE(testARMExtension("arm1022e", 0, "simd"));
+  EXPECT_FALSE(testARMExtension("arm926ej-s", 0, "simd"));
+  EXPECT_FALSE(testARMExtension("arm1136jf-s", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("arm1176j-s", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("arm1156t2-s", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("arm1176jzf-s", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("cortex-m0", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("cortex-a8", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("cortex-r4", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("cortex-m3", 0, "crypto"));
+  EXPECT_FALSE(testARMExtension("cortex-a53", 0, "ras"));
+  EXPECT_FALSE(testARMExtension("cortex-r52", 0, "ras"));
+  EXPECT_FALSE(testARMExtension("iwmmxt", 0, "crc"));
+  EXPECT_FALSE(testARMExtension("xscale", 0, "crc"));
+  EXPECT_FALSE(testARMExtension("swift", 0, "crc"));
+
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV2, "thumb"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV2A, "thumb"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV3, "thumb"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV3M, "thumb"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV4, "dsp"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV4T, "dsp"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV5T, "simd"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV5TE, "simd"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV5TEJ, "simd"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV6, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV6K, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV6T2, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV6KZ, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV6M, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV7A, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV7R, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV7M, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV7EM, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV8A, "ras"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV8_1A, "ras"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV8_2A, "spe"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV8R, "ras"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV8MBaseline, "crc"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV8MMainline, "crc"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_IWMMXT, "crc"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_IWMMXT2, "crc"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_XSCALE, "crc"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV7S, "crypto"));
+  EXPECT_FALSE(testARMExtension("generic", ARM::AK_ARMV7K, "crypto"));
 }
 
 TEST(TargetParserTest, ARMFPUVersion) {
@@ -153,31 +439,6 @@
       EXPECT_LE(0U, ARM::getFPURestriction(FK));
 }
 
-TEST(TargetParserTest, ARMDefaultFPU) {
-  for (ARM::ArchKind AK = static_cast<ARM::ArchKind>(0);
-       AK < ARM::ArchKind::AK_LAST;
-       AK = static_cast<ARM::ArchKind>(static_cast<unsigned>(AK) + 1))
-    EXPECT_EQ(kARMARCHNames[AK].DefaultFPU,
-              ARM::getDefaultFPU(StringRef("generic"), AK));
-
-  for (const auto &ARMCPUName : kARMCPUNames)
-    EXPECT_EQ(ARMCPUName.DefaultFPU, ARM::getDefaultFPU(ARMCPUName.Name, 0));
-}
-
-TEST(TargetParserTest, ARMDefaultExtensions) {
-  for (ARM::ArchKind AK = static_cast<ARM::ArchKind>(0);
-       AK < ARM::ArchKind::AK_LAST;
-       AK = static_cast<ARM::ArchKind>(static_cast<unsigned>(AK) + 1))
-    EXPECT_EQ(kARMARCHNames[AK].ArchBaseExtensions,
-              ARM::getDefaultExtensions(StringRef("generic"), AK));
-
-  for (const auto &ARMCPUName : kARMCPUNames) {
-    unsigned DefaultExt =
-        kARMARCHNames[ARMCPUName.ID].ArchBaseExtensions | ARMCPUName.DefaultExt;
-    EXPECT_EQ(DefaultExt, ARM::getDefaultExtensions(ARMCPUName.Name, 0));
-  }
-}
-
 TEST(TargetParserTest, ARMExtensionFeatures) {
   std::vector<StringRef> Features;
   unsigned Extensions = ARM::AEK_CRC | ARM::AEK_CRYPTO | ARM::AEK_DSP |
@@ -199,15 +460,6 @@
                     : ARM::getFPUFeatures(FK, Features));
 }
 
-TEST(TargetParserTest, ARMArchAttr) {
-  for (ARM::ArchKind AK = static_cast<ARM::ArchKind>(0);
-       AK <= ARM::ArchKind::AK_LAST;
-       AK = static_cast<ARM::ArchKind>(static_cast<unsigned>(AK) + 1))
-    EXPECT_TRUE(AK == ARM::AK_LAST
-                    ? (ARMBuildAttrs::CPUArch::Pre_v4 == ARM::getArchAttr(AK))
-                    : (kARMARCHNames[AK].ArchAttr == ARM::getArchAttr(AK)));
-}
-
 TEST(TargetParserTest, ARMArchExtFeature) {
   const char *ArchExt[][4] = {{"crc", "nocrc", "+crc", "-crc"},
                               {"crypto", "nocrypto", "+crypto", "-crypto"},
@@ -232,11 +484,6 @@
   }
 }
 
-TEST(TargetParserTest, ARMDefaultCPU) {
-  for (unsigned i = 0; i < array_lengthof(ARMArch); i++)
-    EXPECT_FALSE(ARM::getDefaultCPU(ARMArch[i]).empty());
-}
-
 TEST(TargetParserTest, ARMparseHWDiv) {
   const char *hwdiv[] = {"thumb", "arm", "arm,thumb", "thumb,arm"};
 
@@ -244,90 +491,6 @@
     EXPECT_NE(ARM::AEK_INVALID, ARM::parseHWDiv((StringRef)hwdiv[i]));
 }
 
-TEST(TargetParserTest, ARMparseFPU) {
-  const char *FPU[] = {"vfp",
-                       "vfpv2",
-                       "vfp2",
-                       "vfpv3",
-                       "vfp3",
-                       "vfpv3-fp16",
-                       "vfpv3-d16",
-                       "vfp3-d16",
-                       "vfpv3-d16-fp16",
-                       "vfpv3xd",
-                       "vfpv3xd-fp16",
-                       "vfpv4",
-                       "vfp4",
-                       "vfpv4-d16",
-                       "vfp4-d16",
-                       "fp4-dp-d16",
-                       "fpv4-dp-d16",
-                       "fpv4-sp-d16",
-                       "fp4-sp-d16",
-                       "vfpv4-sp-d16",
-                       "fpv5-d16",
-                       "fp5-dp-d16",
-                       "fpv5-dp-d16",
-                       "fpv5-sp-d16",
-                       "fp5-sp-d16",
-                       "fp-armv8",
-                       "neon",
-                       "neon-vfpv3",
-                       "neon-fp16",
-                       "neon-vfpv4",
-                       "neon-fp-armv8",
-                       "crypto-neon-fp-armv8",
-                       "softvfp"};
-
-  for (unsigned i = 0; i < array_lengthof(FPU); i++)
-    EXPECT_NE(ARM::FK_INVALID, ARM::parseFPU((StringRef)FPU[i]));
-}
-
-TEST(TargetParserTest, ARMparseArch) {
-  for (unsigned i = 0; i < array_lengthof(ARMArch); i++)
-    EXPECT_NE(ARM::AEK_INVALID, ARM::parseArch(ARMArch[i]));
-}
-
-TEST(TargetParserTest, ARMparseArchExt) {
-  const char *ArchExt[] = {"none",     "crc",   "crypto", "dsp",    "fp",
-                           "idiv",     "mp",    "simd",   "sec",    "virt",
-                           "fp16",     "ras",   "os",     "iwmmxt", "iwmmxt2",
-                           "maverick", "xscale"};
-
-  for (unsigned i = 0; i < array_lengthof(ArchExt); i++)
-    EXPECT_NE(ARM::AEK_INVALID, ARM::parseArchExt(ArchExt[i]));
-}
-
-TEST(TargetParserTest, ARMparseCPUArch) {
-  const char *CPU[] = {
-      "arm2",          "arm3",          "arm6",        "arm7m",
-      "arm8",          "arm810",        "strongarm",   "strongarm110",
-      "strongarm1100", "strongarm1110", "arm7tdmi",    "arm7tdmi-s",
-      "arm710t",       "arm720t",       "arm9",        "arm9tdmi",
-      "arm920",        "arm920t",       "arm922t",     "arm9312",
-      "arm940t",       "ep9312",        "arm10tdmi",   "arm1020t",
-      "arm9e",         "arm946e-s",     "arm966e-s",   "arm968e-s",
-      "arm10e",        "arm1020e",      "arm1022e",    "arm926ej-s",
-      "arm1136j-s",    "arm1136jf-s",   "arm1136jz-s", "arm1176j-s",
-      "arm1176jz-s",   "mpcore",        "mpcorenovfp", "arm1176jzf-s",
-      "arm1156t2-s",   "arm1156t2f-s",  "cortex-m0",   "cortex-m0plus",
-      "cortex-m1",     "sc000",         "cortex-a5",   "cortex-a7",
-      "cortex-a8",     "cortex-a9",     "cortex-a12",  "cortex-a15",
-      "cortex-a17",    "krait",         "cortex-r4",   "cortex-r4f",
-      "cortex-r5",     "cortex-r7",     "cortex-r8",   "sc300",
-      "cortex-m3",     "cortex-m4",     "cortex-m7",   "cortex-a32",
-      "cortex-a35",    "cortex-a53",    "cortex-a57",  "cortex-a72",
-      "cortex-a73",    "cyclone",       "exynos-m1",   "exynos-m2",   
-      "iwmmxt",        "xscale",        "swift",       "cortex-r52"};
-
-  for (const auto &ARMCPUName : kARMCPUNames) {
-    if (contains(CPU, ARMCPUName.Name))
-      EXPECT_NE(ARM::AK_INVALID, ARM::parseCPUArch(ARMCPUName.Name));
-    else
-      EXPECT_EQ(ARM::AK_INVALID, ARM::parseCPUArch(ARMCPUName.Name));
-  }
-}
-
 TEST(TargetParserTest, ARMparseArchEndianAndISA) {
   const char *Arch[] = {
       "v2",    "v2a",    "v3",    "v3m",  "v4",   "v4t",  "v5",    "v5t",
@@ -383,6 +546,7 @@
       EXPECT_EQ(ARM::PK_M, ARM::parseArchProfile(ARMArch[i]));
       continue;
     case ARM::AK_ARMV7R:
+    case ARM::AK_ARMV8R:
       EXPECT_EQ(ARM::PK_R, ARM::parseArchProfile(ARMArch[i]));
       continue;
     case ARM::AK_ARMV7A:
@@ -405,29 +569,109 @@
       EXPECT_EQ(5u, ARM::parseArchVersion(ARMArch[i]));
 }
 
-TEST(TargetParserTest, AArch64DefaultFPU) {
-  for (unsigned AK = 0; AK < static_cast<unsigned>(AArch64::ArchKind::AK_LAST);
-       AK++)
-    EXPECT_EQ(kAArch64ARCHNames[AK].DefaultFPU,
-              AArch64::getDefaultFPU(StringRef("generic"), AK));
+bool testAArch64CPU(StringRef CPUName, StringRef ExpectedArch,
+                    StringRef ExpectedFPU, unsigned ExpectedFlags,
+                    StringRef CPUAttr) {
+  unsigned ArchKind = AArch64::parseCPUArch(CPUName);
+  bool pass = AArch64::getArchName(ArchKind).equals(ExpectedArch);
+  unsigned FPUKind = AArch64::getDefaultFPU(CPUName, ArchKind);
+  pass &= AArch64::getFPUName(FPUKind).equals(ExpectedFPU);
 
-  for (const auto &AArch64CPUName : kAArch64CPUNames)
-    EXPECT_EQ(AArch64CPUName.DefaultFPU,
-              AArch64::getDefaultFPU(AArch64CPUName.Name,
-                                     static_cast<unsigned>(AArch64CPUName.ID)));
+  unsigned ExtKind = AArch64::getDefaultExtensions(CPUName, ArchKind);
+  if (ExtKind > 1 && (ExtKind & AArch64::AEK_NONE))
+    pass &= ((ExtKind ^ AArch64::AEK_NONE) == ExpectedFlags);
+  else
+    pass &= (ExtKind == ExpectedFlags);
+
+  pass &= AArch64::getCPUAttr(ArchKind).equals(CPUAttr);
+
+  return pass;
 }
 
-TEST(TargetParserTest, AArch64DefaultExt) {
-  for (unsigned AK = 0; AK < static_cast<unsigned>(AArch64::ArchKind::AK_LAST);
-       AK++)
-    EXPECT_EQ(kAArch64ARCHNames[AK].ArchBaseExtensions,
-              AArch64::getDefaultExtensions(StringRef("generic"), AK));
+TEST(TargetParserTest, testAArch64CPU) {
+  EXPECT_TRUE(testAArch64CPU(
+      "invalid", "invalid", "invalid",
+      AArch64::AEK_INVALID, ""));
+  EXPECT_TRUE(testAArch64CPU(
+      "generic", "invalid", "none",
+      AArch64::AEK_NONE, ""));
 
-  for (const auto &AArch64CPUName : kAArch64CPUNames)
-    EXPECT_EQ(
-        AArch64CPUName.DefaultExt,
-        AArch64::getDefaultExtensions(
-            AArch64CPUName.Name, static_cast<unsigned>(AArch64CPUName.ID)));
+  EXPECT_TRUE(testAArch64CPU(
+      "cortex-a35", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "cortex-a53", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "cortex-a57", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "cortex-a72", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "cortex-a73", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "cyclone", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "exynos-m1", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "exynos-m2", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "falkor", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "kryo", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "vulcan", "armv8.1-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8.1-A"));
+}
+
+bool testAArch64Arch(StringRef Arch, StringRef DefaultCPU, StringRef SubArch,
+                     unsigned ArchAttr) {
+  unsigned ArchKind = AArch64::parseArch(Arch);
+  return (ArchKind != static_cast<unsigned>(AArch64::ArchKind::AK_INVALID)) &
+         AArch64::getDefaultCPU(Arch).equals(DefaultCPU) &
+         AArch64::getSubArch(ArchKind).equals(SubArch) &
+         (AArch64::getArchAttr(ArchKind) == ArchAttr);
+}
+
+TEST(TargetParserTest, testAArch64Arch) {
+  EXPECT_TRUE(testAArch64Arch("armv8-a", "cortex-a53", "v8",
+                              ARMBuildAttrs::CPUArch::v8_A));
+  EXPECT_TRUE(testAArch64Arch("armv8.1-a", "generic", "v8.1a",
+                              ARMBuildAttrs::CPUArch::v8_A));
+  EXPECT_TRUE(testAArch64Arch("armv8.2-a", "generic", "v8.2a",
+                              ARMBuildAttrs::CPUArch::v8_A));
+}
+
+bool testAArch64Extension(StringRef CPUName, unsigned ArchKind,
+                          StringRef ArchExt) {
+  return AArch64::getDefaultExtensions(CPUName, ArchKind) &
+         AArch64::parseArchExt(ArchExt);
+}
+
+TEST(TargetParserTest, testAArch64Extension) {
+  EXPECT_FALSE(testAArch64Extension("cortex-a35", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("cortex-a53", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("cortex-a57", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("cortex-a72", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("cortex-a73", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("cyclone", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("exynos-m1", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("kryo", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("vulcan", 0, "ras"));
+
+  EXPECT_FALSE(testAArch64Extension(
+      "generic", static_cast<unsigned>(AArch64::ArchKind::AK_ARMV8A), "ras"));
+  EXPECT_FALSE(testAArch64Extension(
+      "generic", static_cast<unsigned>(AArch64::ArchKind::AK_ARMV8_1A), "ras"));
+  EXPECT_FALSE(testAArch64Extension(
+      "generic", static_cast<unsigned>(AArch64::ArchKind::AK_ARMV8_2A), "spe"));
 }
 
 TEST(TargetParserTest, AArch64ExtensionFeatures) {
@@ -453,50 +697,6 @@
                     : AArch64::getArchFeatures(AK, Features));
 }
 
-TEST(TargetParserTest, AArch64ArchName) {
-  for (unsigned AK = 0; AK < static_cast<unsigned>(AArch64::ArchKind::AK_LAST);
-       AK++)
-    EXPECT_TRUE(AK == static_cast<unsigned>(AArch64::ArchKind::AK_LAST)
-                    ? AArch64::getArchName(AK).empty()
-                    : !AArch64::getArchName(AK).empty());
-}
-
-TEST(TargetParserTest, AArch64CPUAttr) {
-  for (unsigned AK = 0; AK < static_cast<unsigned>(AArch64::ArchKind::AK_LAST);
-       AK++)
-    EXPECT_TRUE((AK == static_cast<unsigned>(AArch64::ArchKind::AK_INVALID) ||
-                 AK == static_cast<unsigned>(AArch64::ArchKind::AK_LAST))
-                    ? AArch64::getCPUAttr(AK).empty()
-                    : !AArch64::getCPUAttr(AK).empty());
-}
-
-TEST(TargetParserTest, AArch64SubArch) {
-  for (unsigned AK = 0; AK < static_cast<unsigned>(AArch64::ArchKind::AK_LAST);
-       AK++)
-    EXPECT_TRUE((AK == static_cast<unsigned>(AArch64::ArchKind::AK_INVALID) ||
-                 AK == static_cast<unsigned>(AArch64::ArchKind::AK_LAST))
-                    ? AArch64::getSubArch(AK).empty()
-                    : !AArch64::getSubArch(AK).empty());
-}
-
-TEST(TargetParserTest, AArch64ArchAttr) {
-  for (unsigned AK = 0; AK < static_cast<unsigned>(AArch64::ArchKind::AK_LAST);
-       AK++)
-    EXPECT_TRUE(
-        AK == static_cast<unsigned>(AArch64::ArchKind::AK_LAST)
-            ? (ARMBuildAttrs::CPUArch::v8_A == AArch64::getArchAttr(AK))
-            : (kAArch64ARCHNames[AK].ArchAttr == AArch64::getArchAttr(AK)));
-}
-
-TEST(TargetParserTest, AArch64ArchExtName) {
-  for (AArch64::ArchExtKind AEK = static_cast<AArch64::ArchExtKind>(0);
-       AEK <= AArch64::ArchExtKind::AEK_RAS;
-       AEK = static_cast<AArch64::ArchExtKind>(static_cast<unsigned>(AEK) + 1))
-    EXPECT_TRUE(contains(kAArch64ArchExtKinds, static_cast<unsigned>(AEK))
-                    ? !AArch64::getArchExtName(AEK).empty()
-                    : AArch64::getArchExtName(AEK).empty());
-}
-
 TEST(TargetParserTest, AArch64ArchExtFeature) {
   const char *ArchExt[][4] = {{"crc", "nocrc", "+crc", "-crc"},
                               {"crypto", "nocrypto", "+crypto", "-crypto"},
@@ -507,58 +707,10 @@
                               {"ras", "noras", "+ras", "-ras"}};
 
   for (unsigned i = 0; i < array_lengthof(ArchExt); i++) {
-    EXPECT_EQ(StringRef(ArchExt[i][2]), AArch64::getArchExtFeature(ArchExt[i][0]));
-    EXPECT_EQ(StringRef(ArchExt[i][3]), AArch64::getArchExtFeature(ArchExt[i][1]));
+    EXPECT_EQ(StringRef(ArchExt[i][2]),
+              AArch64::getArchExtFeature(ArchExt[i][0]));
+    EXPECT_EQ(StringRef(ArchExt[i][3]),
+              AArch64::getArchExtFeature(ArchExt[i][1]));
   }
 }
-
-TEST(TargetParserTest, AArch64DefaultCPU) {
-  const char *Arch[] = {"armv8a",    "armv8-a",  "armv8",    "armv8.1a",
-                        "armv8.1-a", "armv8.2a", "armv8.2-a"};
-
-  for (unsigned i = 0; i < array_lengthof(Arch); i++)
-    EXPECT_FALSE(AArch64::getDefaultCPU(Arch[i]).empty());
-}
-
-TEST(TargetParserTest, AArch64parseArch) {
-  const char *Arch[] = {"armv8",     "armv8a",   "armv8-a",  "armv8.1a",
-                        "armv8.1-a", "armv8.2a", "armv8.2-a"};
-
-  for (unsigned i = 0; i < array_lengthof(Arch); i++)
-    EXPECT_NE(static_cast<unsigned>(AArch64::ArchKind::AK_INVALID),
-              AArch64::parseArch(Arch[i]));
-  EXPECT_EQ(static_cast<unsigned>(AArch64::ArchKind::AK_INVALID),
-            AArch64::parseArch("aarch64"));
-  EXPECT_EQ(static_cast<unsigned>(AArch64::ArchKind::AK_INVALID),
-            AArch64::parseArch("arm64"));
-}
-
-TEST(TargetParserTest, AArch64parseArchExt) {
-  const char *ArchExt[] = {"none", "crc",  "crypto",  "fp",
-                           "simd", "fp16", "profile", "ras"};
-
-  for (unsigned i = 0; i < array_lengthof(ArchExt); i++)
-    EXPECT_NE(AArch64::AEK_INVALID, AArch64::parseArchExt(ArchExt[i]));
-}
-
-TEST(TargetParserTest, AArch64parseCPUArch) {
-  const char *CPU[] = {"cortex-a35",
-                       "cortex-a53",
-                       "cortex-a57",
-                       "cortex-a72",
-                       "cortex-a73",
-                       "cyclone",
-                       "exynos-m1",
-                       "exynos-m2",
-                       "falkor",
-                       "kryo",
-                       "vulcan"};
-
-  for (const auto &AArch64CPUName : kAArch64CPUNames)
-    EXPECT_TRUE(contains(CPU, AArch64CPUName.Name)
-                    ? (static_cast<unsigned>(AArch64::ArchKind::AK_INVALID) !=
-                       AArch64::parseCPUArch(AArch64CPUName.Name))
-                    : (static_cast<unsigned>(AArch64::ArchKind::AK_INVALID) ==
-                       AArch64::parseCPUArch(AArch64CPUName.Name)));
-}
 }
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index a8a984d..1272d2b 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -1595,15 +1595,15 @@
   // Reorder classes so that classes precede super classes.
   Classes.sort();
 
-#ifndef NDEBUG
-  // Verify that the table is now sorted
+#ifdef EXPENSIVE_CHECKS
+  // Verify that the table is sorted and operator < works transitively.
   for (auto I = Classes.begin(), E = Classes.end(); I != E; ++I) {
     for (auto J = I; J != E; ++J) {
       assert(!(*J < *I));
       assert(I == J || !J->isSubsetOf(*I));
     }
   }
-#endif // NDEBUG
+#endif
 }
 
 /// buildInstructionOperandReference - The specified operand is a reference to a
@@ -2719,6 +2719,16 @@
                       const std::unique_ptr<MatchableInfo> &b){
                      return *a < *b;});
 
+#ifdef EXPENSIVE_CHECKS
+  // Verify that the table is sorted and operator < works transitively.
+  for (auto I = Info.Matchables.begin(), E = Info.Matchables.end(); I != E;
+       ++I) {
+    for (auto J = I; J != E; ++J) {
+      assert(!(**J < **I));
+    }
+  }
+#endif
+
   DEBUG_WITH_TYPE("instruction_info", {
       for (const auto &MI : Info.Matchables)
         MI->dump();
@@ -3189,7 +3199,9 @@
 
   if (HasDeprecation) {
     OS << "    std::string Info;\n";
-    OS << "    if (MII.get(Inst.getOpcode()).getDeprecatedInfo(Inst, getSTI(), Info)) {\n";
+    OS << "    if (!getParser().getTargetParser().\n";
+    OS << "        getTargetOptions().MCNoDeprecatedWarn &&\n";
+    OS << "        MII.get(Inst.getOpcode()).getDeprecatedInfo(Inst, getSTI(), Info)) {\n";
     OS << "      SMLoc Loc = ((" << Target.getName()
        << "Operand&)*Operands[0]).getStartLoc();\n";
     OS << "      getParser().Warning(Loc, Info, None);\n";
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 8e780a2..942ae38 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -630,7 +630,7 @@
     # version.
 
     keywords_re = re.compile(
-        to_bytes("(%s)(.*)\n" % ("|".join(k for k in keywords),)))
+        to_bytes("(%s)(.*)\n" % ("|".join(re.escape(k) for k in keywords),)))
 
     f = open(source_path, 'rb')
     try:
@@ -657,7 +657,7 @@
             # Python 2, to avoid other code having to differentiate between the
             # str and unicode types.
             keyword,ln = match.groups()
-            yield (line_number, to_string(keyword[:-1].decode('utf-8')),
+            yield (line_number, to_string(keyword.decode('utf-8')),
                    to_string(ln.decode('utf-8')))
     finally:
         f.close()
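
The two TestRunner.py hunks above interact: keywords such as 'END.' contain the
regex metacharacter '.', so the alternation has to escape each keyword, and the
yielded keyword now keeps its trailing ':' or '.' so it can be used directly as
a lookup key further down. A minimal standalone sketch of the escaping issue
(plain str rather than the bytes handling used above; the sample line is
illustrative only):

    import re

    keywords = ['RUN:', 'END.']
    unescaped = re.compile("(%s)(.*)\n" % "|".join(keywords))
    escaped = re.compile("(%s)(.*)\n" % "|".join(re.escape(k) for k in keywords))

    line = "// ENDS here\n"
    # Without escaping, 'END.' matches the literal text 'ENDS' ('.' matches 'S').
    print(unescaped.search(line))  # false positive: a match object
    print(escaped.search(line))    # None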
@@ -739,10 +739,119 @@
     # convert to list before returning.
     return list(map(processLine, script))
 
-def parseIntegratedTestScript(test, require_script=True):
+
+class ParserKind(object):
+    """
+    An enumeration representing the style of an integrated test keyword or
+    command.
+
+    TAG: A keyword taking no value. Ex 'END.'
+    COMMAND: A Keyword taking a list of shell commands. Ex 'RUN:'
+    LIST: A keyword taking a comma separated list of value. Ex 'XFAIL:'
+    CUSTOM: A keyword with custom parsing semantics.
+    """
+    TAG = 0
+    COMMAND = 1
+    LIST = 2
+    CUSTOM = 3
+
+
+class IntegratedTestKeywordParser(object):
+    """A parser for LLVM/Clang style integrated test scripts.
+
+    keyword: The keyword to parse for. It must end in either '.' or ':'.
+    kind: A value of ParserKind.
+    parser: A custom parser. This value may only be specified with
+            ParserKind.CUSTOM.
+    """
+    def __init__(self, keyword, kind, parser=None, initial_value=None):
+        if not keyword.endswith('.') and not keyword.endswith(':'):
+            raise ValueError("keyword '%s' must end with either '.' or ':' "
+                             % keyword)
+        if keyword.endswith('.') and kind in \
+                [ParserKind.LIST, ParserKind.COMMAND]:
+            raise ValueError("Keyword '%s' should end in ':'" % keyword)
+
+        elif keyword.endswith(':') and kind in [ParserKind.TAG]:
+            raise ValueError("Keyword '%s' should end in '.'" % keyword)
+        if parser is not None and kind != ParserKind.CUSTOM:
+            raise ValueError("custom parsers can only be specified with "
+                             "ParserKind.CUSTOM")
+        self.keyword = keyword
+        self.kind = kind
+        self.parsed_lines = []
+        self.value = initial_value
+        self.parser = parser
+
+        if kind == ParserKind.COMMAND:
+            self.parser = self._handleCommand
+        elif kind == ParserKind.LIST:
+            self.parser = self._handleList
+        elif kind == ParserKind.TAG:
+            if not keyword.endswith('.'):
+                raise ValueError("keyword '%s' should end with '.'" % keyword)
+            self.parser = self._handleTag
+        elif kind == ParserKind.CUSTOM:
+            if parser is None:
+                raise ValueError("ParserKind.CUSTOM requires a custom parser")
+            self.parser = parser
+        else:
+            raise ValueError("Unknown kind '%s'" % kind)
+
+    def parseLine(self, line_number, line):
+        self.parsed_lines += [(line_number, line)]
+        self.value = self.parser(line_number, line, self.value)
+
+    def getValue(self):
+        return self.value
+
+    @staticmethod
+    def _handleTag(line_number, line, output):
+        """A helper for parsing TAG type keywords"""
+        return (not line.strip() or output)
+
+    @staticmethod
+    def _handleCommand(line_number, line, output):
+        """A helper for parsing COMMAND type keywords"""
+        # Trim trailing whitespace.
+        line = line.rstrip()
+        # Substitute line number expressions
+        line = re.sub('%\(line\)', str(line_number), line)
+
+        def replace_line_number(match):
+            if match.group(1) == '+':
+                return str(line_number + int(match.group(2)))
+            if match.group(1) == '-':
+                return str(line_number - int(match.group(2)))
+        line = re.sub('%\(line *([\+-]) *(\d+)\)', replace_line_number, line)
+        # Collapse lines with trailing '\\'.
+        if output and output[-1][-1] == '\\':
+            output[-1] = output[-1][:-1] + line
+        else:
+            if output is None:
+                output = []
+            output.append(line)
+        return output
+
+    @staticmethod
+    def _handleList(line_number, line, output):
+        """A parser for LIST type keywords"""
+        if output is None:
+            output = []
+        output.extend([s.strip() for s in line.split(',')])
+        return output
+
+
+def parseIntegratedTestScript(test, additional_parsers=[],
+                              require_script=True):
     """parseIntegratedTestScript - Scan an LLVM/Clang style integrated test
     script and extract the lines to 'RUN' as well as 'XFAIL' and 'REQUIRES'
-    and 'UNSUPPORTED' information. If 'require_script' is False an empty script
+    'REQUIRES-ANY' and 'UNSUPPORTED' information.
+
+    If additional parsers are specified then the test is also scanned for the
+    keywords they specify and all matches are passed to the custom parser.
+
+    If 'require_script' is False an empty script
     may be returned. This can be used for test formats where the actual script
     is optional or ignored.
     """
@@ -752,43 +861,36 @@
     requires = []
     requires_any = []
     unsupported = []
-    keywords = ['RUN:', 'XFAIL:', 'REQUIRES:', 'REQUIRES-ANY:',
-                'UNSUPPORTED:', 'END.']
+    builtin_parsers = [
+        IntegratedTestKeywordParser('RUN:', ParserKind.COMMAND,
+                                    initial_value=script),
+        IntegratedTestKeywordParser('XFAIL:', ParserKind.LIST,
+                                    initial_value=test.xfails),
+        IntegratedTestKeywordParser('REQUIRES:', ParserKind.LIST,
+                                    initial_value=requires),
+        IntegratedTestKeywordParser('REQUIRES-ANY:', ParserKind.LIST,
+                                    initial_value=requires_any),
+        IntegratedTestKeywordParser('UNSUPPORTED:', ParserKind.LIST,
+                                    initial_value=unsupported),
+        IntegratedTestKeywordParser('END.', ParserKind.TAG)
+    ]
+    keyword_parsers = {p.keyword: p for p in builtin_parsers}
+    for parser in additional_parsers:
+        if not isinstance(parser, IntegratedTestKeywordParser):
+            raise ValueError('additional parser must be an instance of '
+                             'IntegratedTestKeywordParser')
+        if parser.keyword in keyword_parsers:
+            raise ValueError("Parser for keyword '%s' already exists"
+                             % parser.keyword)
+        keyword_parsers[parser.keyword] = parser
+
     for line_number, command_type, ln in \
-            parseIntegratedTestScriptCommands(sourcepath, keywords):
-        if command_type == 'RUN':
-            # Trim trailing whitespace.
-            ln = ln.rstrip()
-
-            # Substitute line number expressions
-            ln = re.sub('%\(line\)', str(line_number), ln)
-            def replace_line_number(match):
-                if match.group(1) == '+':
-                    return str(line_number + int(match.group(2)))
-                if match.group(1) == '-':
-                    return str(line_number - int(match.group(2)))
-            ln = re.sub('%\(line *([\+-]) *(\d+)\)', replace_line_number, ln)
-
-            # Collapse lines with trailing '\\'.
-            if script and script[-1][-1] == '\\':
-                script[-1] = script[-1][:-1] + ln
-            else:
-                script.append(ln)
-        elif command_type == 'XFAIL':
-            test.xfails.extend([s.strip() for s in ln.split(',')])
-        elif command_type == 'REQUIRES':
-            requires.extend([s.strip() for s in ln.split(',')])
-        elif command_type == 'REQUIRES-ANY':
-            requires_any.extend([s.strip() for s in ln.split(',')])
-        elif command_type == 'UNSUPPORTED':
-            unsupported.extend([s.strip() for s in ln.split(',')])
-        elif command_type == 'END':
-            # END commands are only honored if the rest of the line is empty.
-            if not ln.strip():
-                break
-        else:
-            raise ValueError("unknown script command type: %r" % (
-                    command_type,))
+            parseIntegratedTestScriptCommands(sourcepath,
+                                              keyword_parsers.keys()):
+        parser = keyword_parsers[command_type]
+        parser.parseLine(line_number, ln)
+        if command_type == 'END.' and parser.getValue() is True:
+            break
 
     # Verify the script contains a run line.
     if require_script and not script:
@@ -805,26 +907,30 @@
     if missing_required_features:
         msg = ', '.join(missing_required_features)
         return lit.Test.Result(Test.UNSUPPORTED,
-                               "Test requires the following features: %s" % msg)
+                               "Test requires the following features: %s"
+                               % msg)
     requires_any_features = [f for f in requires_any
                              if f in test.config.available_features]
     if requires_any and not requires_any_features:
         msg = ' ,'.join(requires_any)
         return lit.Test.Result(Test.UNSUPPORTED,
-            "Test requires any of the following features: %s" % msg)
+                               "Test requires any of the following features: "
+                               "%s" % msg)
     unsupported_features = [f for f in unsupported
                             if f in test.config.available_features]
     if unsupported_features:
         msg = ', '.join(unsupported_features)
-        return lit.Test.Result(Test.UNSUPPORTED,
-                    "Test is unsupported with the following features: %s" % msg)
+        return lit.Test.Result(
+            Test.UNSUPPORTED,
+            "Test is unsupported with the following features: %s" % msg)
 
     unsupported_targets = [f for f in unsupported
                            if f in test.suite.config.target_triple]
     if unsupported_targets:
-      return lit.Test.Result(Test.UNSUPPORTED,
-                  "Test is unsupported with the following triple: %s" % (
-                      test.suite.config.target_triple,))
+        return lit.Test.Result(
+            Test.UNSUPPORTED,
+            "Test is unsupported with the following triple: %s" % (
+             test.suite.config.target_triple,))
 
     if test.config.limit_to_features:
         # Check that we have one of the limit_to_features features in requires.
@@ -832,11 +938,12 @@
                                    if f in requires]
         if not limit_to_features_tests:
             msg = ', '.join(test.config.limit_to_features)
-            return lit.Test.Result(Test.UNSUPPORTED,
-                 "Test requires one of the limit_to_features features %s" % msg)
-
+            return lit.Test.Result(
+                Test.UNSUPPORTED,
+                "Test requires one of the limit_to_features features %s" % msg)
     return script
 
+
 def _runShTest(test, litConfig, useExternalSh, script, tmpBase):
     # Create the output directory if it does not already exist.
     lit.util.mkdir_p(os.path.dirname(tmpBase))
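
The additional_parsers hook added above lets a test format declare its own
keywords without touching TestRunner.py. A minimal sketch of a consumer (the
format class and the MY_FLAGS: keyword are hypothetical; the API calls are the
ones introduced in this patch):

    import lit.formats
    import lit.Test
    from lit.TestRunner import IntegratedTestKeywordParser, ParserKind, \
                               parseIntegratedTestScript

    class FlagsAwareFormat(lit.formats.FileBasedTest):
        def execute(self, test, lit_config):
            flags_parser = IntegratedTestKeywordParser('MY_FLAGS:',
                                                       ParserKind.LIST)
            parsed = parseIntegratedTestScript(
                test, additional_parsers=[flags_parser], require_script=False)
            # parseIntegratedTestScript returns a lit.Test.Result for
            # unsupported/missing-feature tests; pass that straight through.
            if isinstance(parsed, lit.Test.Result):
                return parsed
            flags = flags_parser.getValue() or []
            return lit.Test.Result(lit.Test.PASS,
                                   'parsed flags: %s' % ', '.join(flags))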
diff --git a/utils/lit/tests/Inputs/testrunner-custom-parsers/lit.cfg b/utils/lit/tests/Inputs/testrunner-custom-parsers/lit.cfg
new file mode 100644
index 0000000..cf46c16
--- /dev/null
+++ b/utils/lit/tests/Inputs/testrunner-custom-parsers/lit.cfg
@@ -0,0 +1,14 @@
+import lit.formats
+import os
+import lit.Test
+
+class TestParserFormat(lit.formats.FileBasedTest):
+  def execute(self, test, lit_config):
+      return lit.Test.PASS, ''
+
+config.name = 'custom-parsers'
+config.suffixes = ['.txt']
+config.test_format = TestParserFormat()
+config.test_source_root = None
+config.test_exec_root = None
+config.target_triple = 'x86_64-unknown-unknown'
diff --git a/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt b/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt
new file mode 100644
index 0000000..ed118f3
--- /dev/null
+++ b/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt
@@ -0,0 +1,13 @@
+
+// MY_TAG.
+// foo bar baz
+// MY_RUN: baz
+// MY_LIST: one, two
+// MY_LIST: three, four
+// MY_RUN: foo \
+// MY_RUN: bar
+//
+// MY_CUSTOM: a b c
+//
+// END.
+// MY_LIST: five
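
For reference, the MY_RUN: lines above (file lines 4, 7 and 8) exercise the
COMMAND parser's trailing-backslash continuation: lines 7 and 8 collapse into a
single logical command, which is what the unit test below asserts. A minimal
sketch of that behaviour in isolation (assumes the lit package is importable):

    from lit.TestRunner import IntegratedTestKeywordParser, ParserKind

    p = IntegratedTestKeywordParser('MY_RUN:', ParserKind.COMMAND)
    p.parseLine(4, ' baz')
    p.parseLine(7, ' foo \\')   # trailing '\' requests continuation
    p.parseLine(8, ' bar')
    print(p.getValue())         # [' baz', ' foo  bar'] -- two logical commands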
diff --git a/utils/lit/tests/unit/TestRunner.py b/utils/lit/tests/unit/TestRunner.py
new file mode 100644
index 0000000..ff11834
--- /dev/null
+++ b/utils/lit/tests/unit/TestRunner.py
@@ -0,0 +1,114 @@
+# RUN: %{python} %s
+#
+# END.
+
+
+import unittest
+import platform
+import os.path
+import tempfile
+
+import lit
+from lit.TestRunner import ParserKind, IntegratedTestKeywordParser, \
+                           parseIntegratedTestScript
+
+
+class TestIntegratedTestKeywordParser(unittest.TestCase):
+    inputTestCase = None
+
+    @staticmethod
+    def load_keyword_parser_lit_tests():
+        """
+        Create and load the LIT test suite and test objects used by
+        TestIntegratedTestKeywordParser
+        """
+        # Create the global config object.
+        lit_config = lit.LitConfig.LitConfig(progname='lit',
+                                             path=[],
+                                             quiet=False,
+                                             useValgrind=False,
+                                             valgrindLeakCheck=False,
+                                             valgrindArgs=[],
+                                             noExecute=False,
+                                             debug=False,
+                                             isWindows=(
+                                               platform.system() == 'Windows'),
+                                             params={})
+        TestIntegratedTestKeywordParser.litConfig = lit_config
+        # Perform test discovery.
+        test_path = os.path.dirname(os.path.dirname(__file__))
+        inputs = [os.path.join(test_path, 'Inputs/testrunner-custom-parsers/')]
+        assert os.path.isdir(inputs[0])
+        run = lit.run.Run(lit_config,
+                          lit.discovery.find_tests_for_inputs(lit_config, inputs))
+        assert len(run.tests) == 1, "there should only be one test"
+        TestIntegratedTestKeywordParser.inputTestCase = run.tests[0]
+
+    @staticmethod
+    def make_parsers():
+        def custom_parse(line_number, line, output):
+            if output is None:
+                output = []
+            output += [part for part in line.split(' ') if part.strip()]
+            return output
+
+        return [
+            IntegratedTestKeywordParser("MY_TAG.", ParserKind.TAG),
+            IntegratedTestKeywordParser("MY_DNE_TAG.", ParserKind.TAG),
+            IntegratedTestKeywordParser("MY_LIST:", ParserKind.LIST),
+            IntegratedTestKeywordParser("MY_RUN:", ParserKind.COMMAND),
+            IntegratedTestKeywordParser("MY_CUSTOM:", ParserKind.CUSTOM,
+                                        custom_parse)
+        ]
+
+    @staticmethod
+    def get_parser(parser_list, keyword):
+        for p in parser_list:
+            if p.keyword == keyword:
+                return p
+        assert False, "parser not found"
+
+    @staticmethod
+    def parse_test(parser_list):
+        script = parseIntegratedTestScript(
+            TestIntegratedTestKeywordParser.inputTestCase,
+            additional_parsers=parser_list, require_script=False)
+        assert not isinstance(script, lit.Test.Result)
+        assert isinstance(script, list)
+        assert len(script) == 0
+
+    def test_tags(self):
+        parsers = self.make_parsers()
+        self.parse_test(parsers)
+        tag_parser = self.get_parser(parsers, 'MY_TAG.')
+        dne_tag_parser = self.get_parser(parsers, 'MY_DNE_TAG.')
+        self.assertTrue(tag_parser.getValue())
+        self.assertFalse(dne_tag_parser.getValue())
+
+    def test_lists(self):
+        parsers = self.make_parsers()
+        self.parse_test(parsers)
+        list_parser = self.get_parser(parsers, 'MY_LIST:')
+        self.assertItemsEqual(list_parser.getValue(),
+                              ['one', 'two', 'three', 'four'])
+
+    def test_commands(self):
+        parsers = self.make_parsers()
+        self.parse_test(parsers)
+        cmd_parser = self.get_parser(parsers, 'MY_RUN:')
+        value = cmd_parser.getValue()
+        self.assertEqual(len(value), 2)  # there are only two run lines
+        self.assertEqual(value[0].strip(), 'baz')
+        self.assertEqual(value[1].strip(), 'foo  bar')
+
+    def test_custom(self):
+        parsers = self.make_parsers()
+        self.parse_test(parsers)
+        custom_parser = self.get_parser(parsers, 'MY_CUSTOM:')
+        value = custom_parser.getValue()
+        self.assertItemsEqual(value, ['a', 'b', 'c'])
+
+
+if __name__ == '__main__':
+    TestIntegratedTestKeywordParser.load_keyword_parser_lit_tests()
+    unittest.main(verbosity=2)