Creating branches/google/testing and tags/google/testing/ from r317203

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/testing@317856 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/.gitignore b/.gitignore
index 8a8ae6f..224bd2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,8 +65,9 @@
 tools/avrlit
 # Sphinx build tree, if building in-source dir.
 docs/_build
-# VSCode config files.
+# VS2017 and VSCode config files.
 .vscode
+.vs
 
 #==============================================================================#
 # Files created in tree by the Go bindings.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3937aa0..0456503 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,12 +15,16 @@
   cmake_policy(SET CMP0051 OLD)
 endif()
 
+if(POLICY CMP0056)
+  cmake_policy(SET CMP0056 NEW)
+endif()
+
 if(POLICY CMP0057)
   cmake_policy(SET CMP0057 NEW)
 endif()
 
 if(NOT DEFINED LLVM_VERSION_MAJOR)
-  set(LLVM_VERSION_MAJOR 5)
+  set(LLVM_VERSION_MAJOR 6)
 endif()
 if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 0)
@@ -44,6 +48,13 @@
     "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}")
 endif()
 
+if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQUAL ""))
+  message(WARNING "Visual Studio generators use the x86 host compiler by "
+                  "default, even for 64-bit targets. This can result in linker "
+                  "instability and out of memory errors. To use the 64-bit "
+                  "host compiler, pass -Thost=x64 on the CMake command line.")
+endif()
+
 project(LLVM
   ${cmake_3_0_PROJ_VERSION}
   ${cmake_3_0_LANGUAGES}
@@ -87,7 +98,7 @@
         set(LIBTOOL_NO_WARNING_FLAG "-no_warning_for_no_symbols")
       endif()
     endif()
-    
+
     foreach(lang ${languages})
       set(CMAKE_${lang}_CREATE_STATIC_LIBRARY
         "${CMAKE_LIBTOOL} -static ${LIBTOOL_NO_WARNING_FLAG} -o <TARGET> \
@@ -169,11 +180,6 @@
   endif()
 endif()
 
-option(LLVM_BUILD_GLOBAL_ISEL "Experimental: Build GlobalISel" ON)
-if(LLVM_BUILD_GLOBAL_ISEL)
-  add_definitions(-DLLVM_BUILD_GLOBAL_ISEL)
-endif()
-
 option(LLVM_ENABLE_DAGISEL_COV "Debug: Prints tablegen patterns that were used for selecting" OFF)
 
 # Add path for custom modules
@@ -199,11 +205,7 @@
 include(VersionFromVCS)
 
 option(LLVM_APPEND_VC_REV
-  "Append the version control system revision id to LLVM version" OFF)
-
-if( LLVM_APPEND_VC_REV )
-  add_version_info_from_vcs(PACKAGE_VERSION)
-endif()
+  "Embed the version control system revision id in LLVM" ON)
 
 set(PACKAGE_NAME LLVM)
 set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
@@ -239,11 +241,11 @@
 include(CPack)
 
 # Sanity check our source directory to make sure that we are not trying to
-# generate an in-tree build (unless on MSVC_IDE, where it is ok), and to make
+# generate an in-source build (unless on MSVC_IDE, where it is ok), and to make
 # sure that we don't have any stray generated files lying around in the tree
 # (which would end up getting picked up by header search, instead of the correct
 # versions).
-if( CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND NOT MSVC_IDE )
+if( CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR AND NOT MSVC_IDE )
   message(FATAL_ERROR "In-source builds are not allowed.
 CMake would overwrite the makefiles distributed with LLVM.
 Please create a directory and run cmake from there, passing the path
@@ -281,6 +283,10 @@
 set(LLVM_TOOLS_INSTALL_DIR "bin" CACHE STRING "Path for binary subdirectory (defaults to 'bin')")
 mark_as_advanced(LLVM_TOOLS_INSTALL_DIR)
 
+set(LLVM_UTILS_INSTALL_DIR "${LLVM_TOOLS_INSTALL_DIR}" CACHE STRING
+    "Path to install LLVM utilities (enabled by LLVM_INSTALL_UTILS=ON) (defaults to LLVM_TOOLS_INSTALL_DIR)")
+mark_as_advanced(LLVM_UTILS_INSTALL_DIR)
+
 # They are used as destination of target generators.
 set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
 set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
@@ -303,6 +309,7 @@
 set(LLVM_EXAMPLES_BINARY_DIR ${LLVM_BINARY_DIR}/examples)
 set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
 
+# List of all targets to be built by default:
 set(LLVM_ALL_TARGETS
   AArch64
   AMDGPU
@@ -314,7 +321,6 @@
   MSP430
   NVPTX
   PowerPC
-  RISCV
   Sparc
   SystemZ
   X86
@@ -352,6 +358,8 @@
 
 option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON)
 
+option(LLVM_ENABLE_LIBXML2 "Use libxml2 if available." ON)
+
 option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON)
 
 option(LLVM_ENABLE_THREADS "Use threads if available." ON)
@@ -367,8 +375,6 @@
    ${LLVM_EXPERIMENTAL_TARGETS_TO_BUILD})
 list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD)
 
-include(AddLLVMDefinitions)
-
 option(LLVM_ENABLE_PIC "Build Position-Independent Code" ON)
 option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." ON)
 option(LLVM_ENABLE_MODULES "Compile with C++ modules enabled." OFF)
@@ -380,17 +386,24 @@
   option(LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY "Compile with -fmodules-local-submodule-visibility." ON)
 endif()
 option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF)
+option(LLVM_ENABLE_CXX1Z "Compile with C++1z enabled." OFF)
 option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF)
 option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF)
 option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON)
 option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
 
+option(LLVM_ENABLE_DUMP "Enable dump functions in release builds" OFF)
+
 if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
   option(LLVM_ENABLE_ASSERTIONS "Enable assertions" OFF)
 else()
   option(LLVM_ENABLE_ASSERTIONS "Enable assertions" ON)
 endif()
 
+if( LLVM_ENABLE_ASSERTIONS )
+  set(LLVM_ENABLE_DUMP ON)
+endif()
+
 option(LLVM_ENABLE_EXPENSIVE_CHECKS "Enable expensive checks" OFF)
 
 set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING
@@ -426,6 +439,8 @@
 
 set(LLVM_USE_SANITIZER "" CACHE STRING
   "Define the sanitizer used to build binaries and tests.")
+set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH
+  "Path to fuzzing library for linking with fuzz targets")
 
 option(LLVM_USE_SPLIT_DWARF
   "Use -gsplit-dwarf when compiling llvm." OFF)
@@ -485,6 +500,10 @@
 option(LLVM_BUILD_UTILS
   "Build LLVM utility binaries. If OFF, just generate build targets." ON)
 
+option(LLVM_INCLUDE_RUNTIMES "Generate build targets for the LLVM runtimes." ON)
+option(LLVM_BUILD_RUNTIMES
+  "Build the LLVM runtimes. If OFF, just generate build targets." ON)
+
 option(LLVM_BUILD_RUNTIME
   "Build the LLVM runtime libraries." ON)
 option(LLVM_BUILD_EXAMPLES
@@ -510,6 +529,9 @@
 option (LLVM_BUILD_EXTERNAL_COMPILER_RT
   "Build compiler-rt as an external project." OFF)
 
+option (LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO
+  "Show target and host info when tools are invoked with --version." ON)
+
 # You can configure which libraries from LLVM you want to include in the
 # shared library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited
 # list of LLVM components. All component names handled by llvm-config are valid.
@@ -525,6 +547,8 @@
 endif()
 option(LLVM_BUILD_LLVM_DYLIB "Build libllvm dynamic library" ${LLVM_BUILD_LLVM_DYLIB_default})
 
+option(LLVM_DYLIB_SYMBOL_VERSIONING "Version the symbols exported from LLVM shared libraries (LLVM_N.M)" OFF)
+
 option(LLVM_OPTIMIZED_TABLEGEN "Force TableGen to be built with optimization" OFF)
 if(CMAKE_CROSSCOMPILING OR (LLVM_OPTIMIZED_TABLEGEN AND (LLVM_ENABLE_ASSERTIONS OR CMAKE_CONFIGURATION_TYPES)))
   set(LLVM_USE_HOST_TOOLS ON)
@@ -536,7 +560,8 @@
   set(LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION FALSE CACHE INTERNAL "For Visual Studio 2013, manually copy natvis files to Documents\\Visual Studio 2013\\Visualizers" FORCE)
 endif()
 
-if (LLVM_BUILD_INSTRUMENTED OR LLVM_BUILD_INSTRUMENTED_COVERAGE)
+if (LLVM_BUILD_INSTRUMENTED OR LLVM_BUILD_INSTRUMENTED_COVERAGE OR
+    LLVM_ENABLE_IR_PGO)
   if(NOT LLVM_PROFILE_MERGE_POOL_SIZE)
     # A pool size of 1-2 is probably sufficient on a SSD. 3-4 should be fine
     # for spining disks. Anything higher may only help on slower mediums.
@@ -544,10 +569,9 @@
   endif()
   if(NOT LLVM_PROFILE_FILE_PATTERN)
     if(NOT LLVM_PROFILE_DATA_DIR)
-      file(TO_NATIVE_PATH "${LLVM_BINARY_DIR}/profiles/%${LLVM_PROFILE_MERGE_POOL_SIZE}m.profraw" LLVM_PROFILE_FILE_PATTERN)
-    else()
-      file(TO_NATIVE_PATH "${LLVM_PROFILE_DATA_DIR}/%${LLVM_PROFILE_MERGE_POOL_SIZE}m.profraw" LLVM_PROFILE_FILE_PATTERN)
+      file(TO_NATIVE_PATH "${LLVM_BINARY_DIR}/profiles" LLVM_PROFILE_DATA_DIR)
     endif()
+    file(TO_NATIVE_PATH "${LLVM_PROFILE_DATA_DIR}/%${LLVM_PROFILE_MERGE_POOL_SIZE}m.profraw" LLVM_PROFILE_FILE_PATTERN)
   endif()
 endif()
 
@@ -555,6 +579,10 @@
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
 endif()
 
+# Override the default target with an environment variable named by LLVM_TARGET_TRIPLE_ENV.
+set(LLVM_TARGET_TRIPLE_ENV CACHE STRING "The name of environment variable to override default target. Disabled by blank.")
+mark_as_advanced(LLVM_TARGET_TRIPLE_ENV)
+
 # All options referred to from HandleLLVMOptions have to be specified
 # BEFORE this include, otherwise options will not be correctly set on
 # first cmake run
@@ -620,7 +648,7 @@
 
 message(STATUS "Constructing LLVMBuild project information")
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${LLVMBUILDTOOL}
+  COMMAND ${PYTHON_EXECUTABLE} -B ${LLVMBUILDTOOL}
             --native-target "${LLVM_NATIVE_ARCH}"
             --enable-targets "${LLVM_TARGETS_TO_BUILD}"
             --enable-optional-components "${LLVMOPTIONALCOMPONENTS}"
@@ -773,19 +801,20 @@
   include(CrossCompile)
 endif(LLVM_USE_HOST_TOOLS)
 if(LLVM_TARGET_IS_CROSSCOMPILE_HOST)
-# Dummy use to avoid CMake Wraning: Manually-specified variables were not used
+# Dummy use to avoid CMake Warning: Manually-specified variables were not used
 # (this is a variable that CrossCompile sets on recursive invocations)
 endif()
 
 if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
   # On FreeBSD, /usr/local/* is not used by default. In order to build LLVM
   # with libxml2, iconv.h, etc., we must add /usr/local paths.
-  include_directories("/usr/local/include")
+  include_directories(SYSTEM "/usr/local/include")
   link_directories("/usr/local/lib")
 endif(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
 
 if( ${CMAKE_SYSTEM_NAME} MATCHES SunOS )
-   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include llvm/Support/Solaris.h")
+   # special hack for Solaris to handle crazy system sys/regset.h
+   include_directories("${LLVM_MAIN_INCLUDE_DIR}/llvm/Support/Solaris")
 endif( ${CMAKE_SYSTEM_NAME} MATCHES SunOS )
 
 # Make sure we don't get -rdynamic in every binary. For those that need it,
@@ -819,15 +848,6 @@
 
 add_subdirectory(utils/TableGen)
 
-# Force target to be built as soon as possible. Clang modules builds depend
-# header-wise on it as they ship all headers from the umbrella folders. Building
-# an entire module might include header, which depends on intrinsics_gen. This
-# should be right after LLVMSupport and LLVMTableGen otherwise we introduce a
-# circular dependence.
-if (LLVM_ENABLE_MODULES)
-  list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen)
-endif(LLVM_ENABLE_MODULES)
-
 add_subdirectory(include/llvm)
 
 add_subdirectory(lib)
@@ -837,9 +857,7 @@
   add_subdirectory(utils/PerfectShuffle)
   add_subdirectory(utils/count)
   add_subdirectory(utils/not)
-  add_subdirectory(utils/llvm-lit)
   add_subdirectory(utils/yaml-bench)
-  add_subdirectory(utils/unittest)
 else()
   if ( LLVM_INCLUDE_TESTS )
     message(FATAL_ERROR "Including tests when not building utils will not work.
@@ -864,7 +882,9 @@
   add_subdirectory(tools)
 endif()
 
-add_subdirectory(runtimes)
+if( LLVM_INCLUDE_RUNTIMES )
+  add_subdirectory(runtimes)
+endif()
 
 if( LLVM_INCLUDE_EXAMPLES )
   add_subdirectory(examples)
@@ -879,8 +899,13 @@
       NO_INSTALL
       ALWAYS_CLEAN)
   endif()
+  add_subdirectory(utils/lit)
   add_subdirectory(test)
   add_subdirectory(unittests)
+  if( LLVM_INCLUDE_UTILS )
+    add_subdirectory(utils/unittest)
+  endif()
+
   if (WIN32)
     # This utility is used to prevent crashing tests from calling Dr. Watson on
     # Windows.
@@ -918,6 +943,11 @@
 
 add_subdirectory(cmake/modules)
 
+# Do this last so that all lit targets have already been created.
+if (LLVM_INCLUDE_UTILS)
+  add_subdirectory(utils/llvm-lit)
+endif()
+
 if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
   install(DIRECTORY include/llvm include/llvm-c
     DESTINATION include
@@ -971,13 +1001,13 @@
     if(TARGET ${target})
       add_dependencies(distribution ${target})
     else()
-      message(FATAL_ERROR "Specified distribution component '${target}' doesn't have a target")
+      message(SEND_ERROR "Specified distribution component '${target}' doesn't have a target")
     endif()
 
     if(TARGET install-${target})
       add_dependencies(install-distribution install-${target})
     else()
-      message(FATAL_ERROR "Specified distribution component '${target}' doesn't have an install target")
+      message(SEND_ERROR "Specified distribution component '${target}' doesn't have an install target")
     endif()
   endforeach()
 endif()
@@ -986,3 +1016,4 @@
 if (MSVC)
   include(InstallRequiredSystemLibraries)
 endif()
+
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index ec4561d..8bc1c5d 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -41,9 +41,13 @@
 D: OpenMP runtime library
 
 N: Greg Clayton
-E: gclayton@apple.com
+E: clayborg@gmail.com
 D: LLDB
 
+N: Pete Couperus
+E: petecoup@synopsys.com
+D: ARC backend (lib/Target/ARC/*)
+
 N: Sanjoy Das
 E: sanjoy@playingwithpointers.com
 D: IndVar Simplify, Scalar Evolution
@@ -61,7 +65,7 @@
 D: Loop Strength Reduction, Register allocators
 
 N: Simon Dardis
-E: simon.dardis@imgtec.com
+E: simon.dardis@mips.com
 D: MIPS Backend (lib/Target/Mips/*)
 
 N: Duncan P. N. Exon Smith
@@ -70,7 +74,7 @@
 
 N: Hal Finkel
 E: hfinkel@anl.gov
-D: BBVectorize, the loop reroller, alias analysis and the PowerPC target
+D: The loop reroller, alias analysis and the PowerPC target
 
 N: Dan Gohman
 E: sunfish@mozilla.com
@@ -195,6 +199,7 @@
 
 N: Craig Topper
 E: craig.topper@gmail.com
+E: craig.topper@intel.com
 D: X86 Backend
 
 N: Ulrich Weigand
diff --git a/CREDITS.TXT b/CREDITS.TXT
index 15d822a..bd92388 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -43,6 +43,10 @@
 E: neil@daikokuya.co.uk
 D: APFloat implementation.
 
+N: Alex Bradbury
+E: asb@lowrisc.org
+D: RISC-V backend
+
 N: Misha Brukman
 E: brukman+llvm@uiuc.edu
 W: http://misha.brukman.net
@@ -220,7 +224,7 @@
 D: llvm-config script
 
 N: Anton Korobeynikov
-E: asl@math.spbu.ru
+E: anton at korobeynikov dot info
 D: Mingw32 fixes, cross-compiling support, stdcall/fastcall calling conv.
 D: x86/linux PIC codegen, aliases, regparm/visibility attributes
 D: Switch lowering refactoring
@@ -265,7 +269,7 @@
 N: Sylvestre Ledru
 E: sylvestre@debian.org
 W: http://sylvestre.ledru.info/
-W: http://llvm.org/apt/
+W: http://apt.llvm.org/
 D: Debian and Ubuntu packaging
 D: Continuous integration with jenkins
 
@@ -318,11 +322,12 @@
 D: Dumping of Win64 EH structures
 
 N: Takumi Nakamura
+I: chapuni
 E: geek4civic@gmail.com
 E: chapuni@hf.rim.or.jp
-D: Cygwin and MinGW support.
-D: Win32 tweaks.
-S: Yokohama, Japan
+D: Maintaining the Git monorepo
+W: https://github.com/llvm-project/
+S: Ebina, Japan
 
 N: Edward O'Callaghan
 E: eocallaghan@auroraux.org
diff --git a/README.txt b/README.txt
index a8c3ee9..b0980b3 100644
--- a/README.txt
+++ b/README.txt
@@ -16,3 +16,4 @@
 If you are writing a package for LLVM, see docs/Packaging.rst for our
 suggestions.
 
+
diff --git a/RELEASE_TESTERS.TXT b/RELEASE_TESTERS.TXT
index 7bfa88c..0505a4a 100644
--- a/RELEASE_TESTERS.TXT
+++ b/RELEASE_TESTERS.TXT
@@ -41,17 +41,12 @@
 T: x86
 O: Windows
 
-N: Renato Golin
-E: renato.golin@linaro.org
-T: ARM
-O: Linux
-
 N: Diana Picus
 E: diana.picus@linaro.org
-T: AArch64
+T: ARM, AArch64
 O: Linux
 
 N: Simon Dardis
-E: simon.dardis@imgtec.com
+E: simon.dardis@mips.com
 T: MIPS
 O: Linux
diff --git a/bindings/go/llvm/DIBuilderBindings.cpp b/bindings/go/llvm/DIBuilderBindings.cpp
index 53e223d..ea53694 100644
--- a/bindings/go/llvm/DIBuilderBindings.cpp
+++ b/bindings/go/llvm/DIBuilderBindings.cpp
@@ -19,8 +19,6 @@
 
 using namespace llvm;
 
-DEFINE_SIMPLE_CONVERSION_FUNCTIONS(DIBuilder, LLVMDIBuilderRef)
-
 LLVMDIBuilderRef LLVMNewDIBuilder(LLVMModuleRef mref) {
   Module *m = unwrap(mref);
   return wrap(new DIBuilder(*m));
@@ -31,25 +29,6 @@
   delete d;
 }
 
-void LLVMDIBuilderFinalize(LLVMDIBuilderRef dref) { unwrap(dref)->finalize(); }
-
-LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef Dref,
-                                               unsigned Lang, const char *File,
-                                               const char *Dir,
-                                               const char *Producer,
-                                               int Optimized, const char *Flags,
-                                               unsigned RuntimeVersion) {
-  DIBuilder *D = unwrap(Dref);
-  return wrap(D->createCompileUnit(Lang, D->createFile(File, Dir), Producer,
-                                   Optimized, Flags, RuntimeVersion));
-}
-
-LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Dref, const char *File,
-                                        const char *Dir) {
-  DIBuilder *D = unwrap(Dref);
-  return wrap(D->createFile(File, Dir));
-}
-
 LLVMMetadataRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef Dref,
                                                 LLVMMetadataRef Scope,
                                                 LLVMMetadataRef File,
@@ -239,7 +218,7 @@
 }
 
 LLVMValueRef LLVMDIBuilderInsertValueAtEnd(LLVMDIBuilderRef Dref,
-                                           LLVMValueRef Val, uint64_t Offset,
+                                           LLVMValueRef Val,
                                            LLVMMetadataRef VarInfo,
                                            LLVMMetadataRef Expr,
                                            LLVMBasicBlockRef Block) {
@@ -249,7 +228,7 @@
 
   DIBuilder *D = unwrap(Dref);
   Instruction *Instr = D->insertDbgValueIntrinsic(
-      unwrap(Val), Offset, unwrap<DILocalVariable>(VarInfo),
-      unwrap<DIExpression>(Expr), /* DebugLoc */ nullptr, unwrap(Block));
+      unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
+      /* DebugLoc */ nullptr, unwrap(Block));
   return wrap(Instr);
 }
diff --git a/bindings/go/llvm/DIBuilderBindings.h b/bindings/go/llvm/DIBuilderBindings.h
index dee8821..cc5d2c1 100644
--- a/bindings/go/llvm/DIBuilderBindings.h
+++ b/bindings/go/llvm/DIBuilderBindings.h
@@ -16,6 +16,7 @@
 
 #include "IRBindings.h"
 #include "llvm-c/Core.h"
+#include "llvm-c/DebugInfo.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,16 +31,6 @@
 LLVMDIBuilderRef LLVMNewDIBuilder(LLVMModuleRef m);
 
 void LLVMDIBuilderDestroy(LLVMDIBuilderRef d);
-void LLVMDIBuilderFinalize(LLVMDIBuilderRef d);
-
-LLVMMetadataRef
-LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef D, unsigned Language,
-                               const char *File, const char *Dir,
-                               const char *Producer, int Optimized,
-                               const char *Flags, unsigned RuntimeVersion);
-
-LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef D, const char *File,
-                                        const char *Dir);
 
 LLVMMetadataRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef D,
                                                 LLVMMetadataRef Scope,
@@ -132,7 +123,6 @@
                                              LLVMBasicBlockRef Block);
 
 LLVMValueRef LLVMDIBuilderInsertValueAtEnd(LLVMDIBuilderRef D, LLVMValueRef Val,
-                                           uint64_t Offset,
                                            LLVMMetadataRef VarInfo,
                                            LLVMMetadataRef Expr,
                                            LLVMBasicBlockRef Block);
diff --git a/bindings/go/llvm/IRBindings.cpp b/bindings/go/llvm/IRBindings.cpp
index 20cc050..4bfa1bb 100644
--- a/bindings/go/llvm/IRBindings.cpp
+++ b/bindings/go/llvm/IRBindings.cpp
@@ -14,6 +14,7 @@
 #include "IRBindings.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
@@ -71,6 +72,18 @@
                     InlinedAt ? unwrap<MDNode>(InlinedAt) : nullptr));
 }
 
+LLVMDebugLocMetadata LLVMGetCurrentDebugLocation2(LLVMBuilderRef Bref) {
+  const auto& Loc = unwrap(Bref)->getCurrentDebugLocation();
+  // No location has been set on the builder yet: report an empty result
+  // rather than querying it.
+  if (!Loc)
+    return LLVMDebugLocMetadata{0, 0, nullptr, nullptr};
+  // Return the inlined-at location itself (null when there is none), matching
+  // what LLVMSetCurrentDebugLocation2 stores, not that location's own parent.
+  return LLVMDebugLocMetadata{Loc.getLine(), Loc.getCol(),
+                              wrap(Loc.getScope()), wrap(Loc.getInlinedAt())};
+}
+
 void LLVMSetSubprogram(LLVMValueRef Func, LLVMMetadataRef SP) {
   unwrap<Function>(Func)->setSubprogram(unwrap<DISubprogram>(SP));
 }
diff --git a/bindings/go/llvm/IRBindings.h b/bindings/go/llvm/IRBindings.h
index 2114771..25a00b1 100644
--- a/bindings/go/llvm/IRBindings.h
+++ b/bindings/go/llvm/IRBindings.h
@@ -26,7 +26,12 @@
 extern "C" {
 #endif
 
-typedef struct LLVMOpaqueMetadata *LLVMMetadataRef;
+struct LLVMDebugLocMetadata {
+  unsigned Line;             /* source line */
+  unsigned Col;              /* source column */
+  LLVMMetadataRef Scope;     /* scope metadata */
+  LLVMMetadataRef InlinedAt; /* inlined-at metadata, or NULL */
+};
 
 LLVMMetadataRef LLVMConstantAsMetadata(LLVMValueRef Val);
 
@@ -46,21 +51,13 @@
                                   unsigned Col, LLVMMetadataRef Scope,
                                   LLVMMetadataRef InlinedAt);
 
+struct LLVMDebugLocMetadata LLVMGetCurrentDebugLocation2(LLVMBuilderRef Bref);
+
 void LLVMSetSubprogram(LLVMValueRef Fn, LLVMMetadataRef SP);
 
 #ifdef __cplusplus
 }
 
-namespace llvm {
-
-DEFINE_ISA_CONVERSION_FUNCTIONS(Metadata, LLVMMetadataRef)
-
-inline Metadata **unwrap(LLVMMetadataRef *Vals) {
-  return reinterpret_cast<Metadata**>(Vals);
-}
-
-}
-
 #endif
 
 #endif
diff --git a/bindings/go/llvm/dibuilder.go b/bindings/go/llvm/dibuilder.go
index 56a0a8a..475fa68 100644
--- a/bindings/go/llvm/dibuilder.go
+++ b/bindings/go/llvm/dibuilder.go
@@ -132,12 +132,17 @@
 	defer C.free(unsafe.Pointer(flags))
 	result := C.LLVMDIBuilderCreateCompileUnit(
 		d.ref,
-		C.unsigned(cu.Language),
-		file, dir,
-		producer,
-		boolToCInt(cu.Optimized),
-		flags,
+		C.LLVMDWARFSourceLanguage(cu.Language),
+		C.LLVMDIBuilderCreateFile(d.ref, file, C.size_t(len(cu.File)), dir, C.size_t(len(cu.Dir))),
+		producer, C.size_t(len(cu.Producer)),
+		C.LLVMBool(boolToCInt(cu.Optimized)),
+		flags, C.size_t(len(cu.Flags)),
 		C.unsigned(cu.RuntimeVersion),
+		/*SplitName=*/ nil, 0,
+		C.LLVMDWARFEmissionFull,
+		/*DWOId=*/ 0,
+		/*SplitDebugInlining*/ C.LLVMBool(boolToCInt(true)),
+		/*DebugInfoForProfiling*/ C.LLVMBool(boolToCInt(false)),
 	)
 	return Metadata{C: result}
 }
@@ -148,7 +153,9 @@
 	defer C.free(unsafe.Pointer(cfilename))
 	cdir := C.CString(dir)
 	defer C.free(unsafe.Pointer(cdir))
-	result := C.LLVMDIBuilderCreateFile(d.ref, cfilename, cdir)
+	result := C.LLVMDIBuilderCreateFile(d.ref,
+		cfilename, C.size_t(len(filename)),
+		cdir, C.size_t(len(dir)))
 	return Metadata{C: result}
 }
 
@@ -533,8 +540,8 @@
 
 // InsertValueAtEnd inserts a call to llvm.dbg.value at the end of the
 // specified basic block for the given value and associated debug metadata.
-func (d *DIBuilder) InsertValueAtEnd(v Value, diVarInfo, expr Metadata, offset uint64, bb BasicBlock) Value {
-	result := C.LLVMDIBuilderInsertValueAtEnd(d.ref, v.C, C.uint64_t(offset), diVarInfo.C, expr.C, bb.C)
+func (d *DIBuilder) InsertValueAtEnd(v Value, diVarInfo, expr Metadata, bb BasicBlock) Value {
+	result := C.LLVMDIBuilderInsertValueAtEnd(d.ref, v.C, diVarInfo.C, expr.C, bb.C)
 	return Value{C: result}
 }
 
diff --git a/bindings/go/llvm/ir.go b/bindings/go/llvm/ir.go
index b263c07..2220970 100644
--- a/bindings/go/llvm/ir.go
+++ b/bindings/go/llvm/ir.go
@@ -611,6 +611,19 @@
 }
 
 // Operations on array, pointer, and vector types (sequence types)
+
+// Subtypes returns the types directly contained in t, as reported by
+// LLVMGetSubtypes. The result is empty when t contains no other types.
+func (t Type) Subtypes() (ret []Type) {
+	ret = make([]Type, C.LLVMGetNumContainedTypes(t.C))
+	if len(ret) == 0 {
+		// Nothing to fetch, and &ret[0] below would panic on an empty slice.
+		return
+	}
+	C.LLVMGetSubtypes(t.C, llvmTypeRefPtr(&ret[0]))
+	return
+}
+
 func ArrayType(elementType Type, elementCount int) (t Type) {
 	t.C = C.LLVMArrayType(elementType.C, C.unsigned(elementCount))
 	return
@@ -1226,9 +1232,23 @@
 func (b Builder) Dispose() { C.LLVMDisposeBuilder(b.C) }
 
 // Metadata
+type DebugLoc struct {
+	Line, Col uint     // source line and column
+	Scope     Metadata // scope of the location
+	InlinedAt Metadata // inlined-at location, if any
+}
 func (b Builder) SetCurrentDebugLocation(line, col uint, scope, inlinedAt Metadata) {
 	C.LLVMSetCurrentDebugLocation2(b.C, C.unsigned(line), C.unsigned(col), scope.C, inlinedAt.C)
 }
+// GetCurrentDebugLocation returns the builder's current debug location. Call it only after a location has been set with SetCurrentDebugLocation().
+func (b Builder) GetCurrentDebugLocation() (loc DebugLoc) {
+	md := C.LLVMGetCurrentDebugLocation2(b.C) // C struct returned by value
+	loc.Line = uint(md.Line)
+	loc.Col = uint(md.Col)
+	loc.Scope = Metadata{C: md.Scope}
+	loc.InlinedAt = Metadata{C: md.InlinedAt}
+	return
+}
 func (b Builder) SetInstDebugLocation(v Value) { C.LLVMSetInstDebugLocation(b.C, v.C) }
 func (b Builder) InsertDeclare(module Module, storage Value, md Value) Value {
 	f := module.NamedFunction("llvm.dbg.declare")
diff --git a/bindings/go/llvm/ir_test.go b/bindings/go/llvm/ir_test.go
index 13e1139..325ee48 100644
--- a/bindings/go/llvm/ir_test.go
+++ b/bindings/go/llvm/ir_test.go
@@ -95,3 +95,68 @@
 		testAttribute(t, name)
 	}
 }
+
+// TestDebugLoc sets a debug location on a builder and checks that the same
+// line, column and scope are read back.
+func TestDebugLoc(t *testing.T) {
+	mod := NewModule("")
+	defer mod.Dispose()
+
+	builder := mod.Context().NewBuilder()
+	defer builder.Dispose()
+
+	dib := NewDIBuilder(mod)
+	defer dib.Destroy()
+
+	// Build a minimal function scope to attach the location to.
+	file := dib.CreateFile("dummy_file", "dummy_dir")
+	voidInfo := dib.CreateBasicType(DIBasicType{Name: "void"})
+	typeInfo := dib.CreateSubroutineType(DISubroutineType{file, []Metadata{voidInfo}})
+	scope := dib.CreateFunction(file, DIFunction{
+		Name:         "foo",
+		LinkageName:  "foo",
+		Line:         10,
+		ScopeLine:    10,
+		Type:         typeInfo,
+		File:         file,
+		IsDefinition: true,
+	})
+
+	builder.SetCurrentDebugLocation(10, 20, scope, Metadata{})
+	got := builder.GetCurrentDebugLocation()
+	if got.Line != 10 {
+		t.Errorf("Got line %d, though wanted 10", got.Line)
+	}
+	if got.Col != 20 {
+		t.Errorf("Got column %d, though wanted 20", got.Col)
+	}
+	if got.Scope.C != scope.C {
+		t.Errorf("Got metadata %v as scope, though wanted %v", got.Scope.C, scope.C)
+	}
+}
+
+func TestSubtypes(t *testing.T) {
+	cont := NewContext()
+	defer cont.Dispose()
+
+	int_pointer := PointerType(cont.Int32Type(), 0)
+	int_inner := int_pointer.Subtypes()
+	if len(int_inner) != 1 {
+		t.Fatalf("Got size %d, though wanted 1", len(int_inner)) // fatal: indexed below
+	}
+	if int_inner[0] != cont.Int32Type() {
+		t.Errorf("Expected int32 type")
+	}
+
+	st_type := cont.StructType([]Type{cont.Int32Type(), cont.Int8Type()}, false)
+	st_inner := st_type.Subtypes()
+	if len(st_inner) != 2 {
+		t.Fatalf("Got size %d, though wanted 2", len(st_inner)) // fatal: indexed below
+	}
+	if st_inner[0] != cont.Int32Type() {
+		t.Errorf("Expected first struct field to be int32")
+	}
+	if st_inner[1] != cont.Int8Type() {
+		t.Errorf("Expected second struct field to be int8")
+	}
+}
diff --git a/bindings/ocaml/llvm/llvm.ml b/bindings/ocaml/llvm/llvm.ml
index 399fd2d..59f0f17 100644
--- a/bindings/ocaml/llvm/llvm.ml
+++ b/bindings/ocaml/llvm/llvm.ml
@@ -20,6 +20,10 @@
 type llmemorybuffer
 type llmdkind
 
+exception FeatureDisabled of string
+
+let () = Callback.register_exception "Llvm.FeatureDisabled" (FeatureDisabled "")
+
 module TypeKind = struct
   type t =
   | Void
@@ -459,6 +463,8 @@
 external is_opaque : lltype -> bool = "llvm_is_opaque"
 
 (*--... Operations on pointer, vector, and array types .....................--*)
+
+external subtypes : lltype -> lltype array = "llvm_subtypes"
 external array_type : lltype -> int -> lltype = "llvm_array_type"
 external pointer_type : lltype -> lltype = "llvm_pointer_type"
 external qualified_pointer_type : lltype -> int -> lltype
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index 4068126..3387c1e 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -371,6 +371,8 @@
 
 (** {6 Exceptions} *)
 
+exception FeatureDisabled of string
+
 exception IoError of string
 
 
@@ -658,6 +660,9 @@
 
 (** {7 Operations on pointer, vector, and array types} *)
 
+(** [subtypes ty] returns [ty]'s subtypes *)
+val subtypes : lltype -> lltype array
+
 (** [array_type ty n] returns the array type containing [n] elements of type
     [ty]. See the method [llvm::ArrayType::get]. *)
 val array_type : lltype -> int -> lltype
diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index af04ea2..137b17f 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -336,7 +336,12 @@
 
 /* lltype -> unit */
 CAMLprim value llvm_dump_type(LLVMTypeRef Val) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   LLVMDumpType(Val);
+#else
+  caml_raise_with_arg(*caml_named_value("Llvm.FeatureDisabled"),
+      caml_copy_string("dump"));
+#endif
   return Val_unit;
 }
 
@@ -506,6 +511,20 @@
 
 /*--... Operations on array, pointer, and vector types .....................--*/
 
+/* lltype -> lltype array */
+CAMLprim value llvm_subtypes(LLVMTypeRef Ty) {
+  CAMLparam0();
+  CAMLlocal1(Arr);
+  unsigned Count = LLVMGetNumContainedTypes(Ty);
+
+  /* One field per contained type, in a block with tag 0. */
+  Arr = caml_alloc(Count, 0);
+  /* LLVM writes the contained types directly into the block's fields. */
+  LLVMGetSubtypes(Ty, (LLVMTypeRef *) Arr);
+
+  CAMLreturn(Arr);
+}
+
 /* lltype -> int -> lltype */
 CAMLprim LLVMTypeRef llvm_array_type(LLVMTypeRef ElementTy, value Count) {
   return LLVMArrayType(ElementTy, Int_val(Count));
diff --git a/bindings/ocaml/target/target_ocaml.c b/bindings/ocaml/target/target_ocaml.c
index b63bef6..8872f42 100644
--- a/bindings/ocaml/target/target_ocaml.c
+++ b/bindings/ocaml/target/target_ocaml.c
@@ -77,7 +77,7 @@
 
 /* Llvm.llcontext -> DataLayout.t -> Llvm.lltype */
 CAMLprim LLVMTypeRef llvm_datalayout_intptr_type(LLVMContextRef C, value DL) {
-  return LLVMIntPtrTypeInContext(C, DataLayout_val(DL));;
+  return LLVMIntPtrTypeInContext(C, DataLayout_val(DL));
 }
 
 /* int -> DataLayout.t -> int */
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
old mode 100755
new mode 100644
index 34c81fa..a1b4846
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -8,6 +8,7 @@
 include(CheckLibraryExists)
 include(CheckSymbolExists)
 include(CheckFunctionExists)
+include(CheckCCompilerFlag)
 include(CheckCXXSourceCompiles)
 include(TestBigEndian)
 
@@ -46,7 +47,6 @@
 check_include_file(dirent.h HAVE_DIRENT_H)
 check_include_file(dlfcn.h HAVE_DLFCN_H)
 check_include_file(errno.h HAVE_ERRNO_H)
-check_include_file(execinfo.h HAVE_EXECINFO_H)
 check_include_file(fcntl.h HAVE_FCNTL_H)
 check_include_file(inttypes.h HAVE_INTTYPES_H)
 check_include_file(link.h HAVE_LINK_H)
@@ -156,6 +156,18 @@
   else()
     set(HAVE_TERMINFO 0)
   endif()
+
+  find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2)
+  set(LLVM_LIBXML2_ENABLED 0)
+  set(LIBXML2_FOUND 0)
+  if((LLVM_ENABLE_LIBXML2) AND ((CMAKE_SYSTEM_NAME MATCHES "Linux") AND (ICONV_LIBRARY_PATH) OR APPLE))
+    find_package(LibXml2)
+    if (LIBXML2_FOUND)
+      set(LLVM_LIBXML2_ENABLED 1)
+      include_directories(${LIBXML2_INCLUDE_DIR})
+      set(LIBXML2_LIBS "xml2")
+    endif()
+  endif()
 endif()
 
 check_library_exists(xar xar_open "" HAVE_LIBXAR)
@@ -165,7 +177,17 @@
 
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
-check_symbol_exists(backtrace "execinfo.h" HAVE_BACKTRACE)
+find_package(Backtrace)
+set(HAVE_BACKTRACE ${Backtrace_FOUND})
+set(BACKTRACE_HEADER ${Backtrace_HEADER})
+
+# Prevent check_symbol_exists from using API that is not supported for a given
+# deployment target.
+check_c_compiler_flag("-Werror=unguarded-availability-new" "C_SUPPORTS_WERROR_UNGUARDED_AVAILABILITY_NEW")
+if(C_SUPPORTS_WERROR_UNGUARDED_AVAILABILITY_NEW)
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror=unguarded-availability-new")
+endif()
+
 check_symbol_exists(_Unwind_Backtrace "unwind.h" HAVE__UNWIND_BACKTRACE)
 check_symbol_exists(getpagesize unistd.h HAVE_GETPAGESIZE)
 check_symbol_exists(sysconf unistd.h HAVE_SYSCONF)
@@ -244,9 +266,12 @@
 
 check_symbol_exists(__GLIBC__ stdio.h LLVM_USING_GLIBC)
 if( LLVM_USING_GLIBC )
-  add_llvm_definitions( -D_GNU_SOURCE )
+  add_definitions( -D_GNU_SOURCE )
+  list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE")
 endif()
 # This check requires _GNU_SOURCE
+check_symbol_exists(sched_getaffinity sched.h HAVE_SCHED_GETAFFINITY)
+check_symbol_exists(CPU_COUNT sched.h HAVE_CPU_COUNT)
 if(HAVE_LIBPTHREAD)
   check_library_exists(pthread pthread_getname_np "" HAVE_PTHREAD_GETNAME_NP)
   check_library_exists(pthread pthread_setname_np "" HAVE_PTHREAD_SETNAME_NP)
@@ -529,16 +554,6 @@
   message(STATUS "Doxygen disabled.")
 endif()
 
-if (LLVM_ENABLE_SPHINX)
-  message(STATUS "Sphinx enabled.")
-  find_package(Sphinx REQUIRED)
-  if (LLVM_BUILD_DOCS)
-    add_custom_target(sphinx ALL)
-  endif()
-else()
-  message(STATUS "Sphinx disabled.")
-endif()
-
 set(LLVM_BINDINGS "")
 if(WIN32)
   message(STATUS "Go bindings disabled.")
diff --git a/cmake/config.guess b/cmake/config.guess
index 8bf4226..ccb30f4 100644
--- a/cmake/config.guess
+++ b/cmake/config.guess
@@ -206,10 +206,6 @@
 	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
 	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
 	exit ;;
-    *:Bitrig:*:*)
-	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
-	echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE}
-	exit ;;
     *:ekkoBSD:*:*)
 	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
 	exit ;;
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 7f7608c..62e264e 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -81,8 +81,9 @@
     # Gold and BFD ld require a version script rather than a plain list.
     set(native_export_file "${target_name}.exports")
     # FIXME: Don't write the "local:" line on OpenBSD.
+    # Name the version node LLVM_<major>.<minor> in the export file so that LLVM symbols are versioned.
     add_custom_command(OUTPUT ${native_export_file}
-      COMMAND echo "{" > ${native_export_file}
+      COMMAND echo "LLVM_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} {" > ${native_export_file}
       COMMAND grep -q "[[:alnum:]]" ${export_file} && echo "  global:" >> ${native_export_file} || :
       COMMAND sed -e "s/$/;/" -e "s/^/    /" < ${export_file} >> ${native_export_file}
       COMMAND echo "  local: *;" >> ${native_export_file}
@@ -90,7 +91,7 @@
       DEPENDS ${export_file}
       VERBATIM
       COMMENT "Creating export file for ${target_name}")
-    if (${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
+    if (${LLVM_LINKER_IS_SOLARISLD})
       set_property(TARGET ${target_name} APPEND_STRING PROPERTY
                    LINK_FLAGS "  -Wl,-M,${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}")
     else()
@@ -147,13 +148,33 @@
 endfunction(add_llvm_symbol_exports)
 
 if(NOT WIN32 AND NOT APPLE)
+  # Detect what linker we have here
+  if( LLVM_USE_LINKER )
+    set(command ${CMAKE_C_COMPILER} -fuse-ld=${LLVM_USE_LINKER} -Wl,--version)
+  else()
+    set(command ${CMAKE_C_COMPILER} -Wl,--version)
+  endif()
   execute_process(
-    COMMAND ${CMAKE_C_COMPILER} -Wl,--version
+    COMMAND ${command}
     OUTPUT_VARIABLE stdout
-    ERROR_QUIET
+    ERROR_VARIABLE stderr
     )
+  set(LLVM_LINKER_DETECTED ON)
   if("${stdout}" MATCHES "GNU gold")
     set(LLVM_LINKER_IS_GOLD ON)
+    message(STATUS "Linker detection: GNU Gold")
+  elseif("${stdout}" MATCHES "^LLD")
+    set(LLVM_LINKER_IS_LLD ON)
+    message(STATUS "Linker detection: LLD")
+  elseif("${stdout}" MATCHES "GNU ld")
+    set(LLVM_LINKER_IS_GNULD ON)
+    message(STATUS "Linker detection: GNU ld")
+  elseif("${stderr}" MATCHES "Solaris Link Editors")
+    set(LLVM_LINKER_IS_SOLARISLD ON)
+    message(STATUS "Linker detection: Solaris ld")
+  else()
+    set(LLVM_LINKER_DETECTED OFF)
+    message(STATUS "Linker detection: unknown")
   endif()
 endif()
 
@@ -247,14 +268,14 @@
 #
 function(add_windows_version_resource_file OUT_VAR)
   set(sources ${ARGN})
-  if (MSVC)
+  if (MSVC AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
     set(resource_file ${LLVM_SOURCE_DIR}/resources/windows_version_resource.rc)
     if(EXISTS ${resource_file})
       set(sources ${sources} ${resource_file})
       source_group("Resource Files" ${resource_file})
       set(windows_resource_file ${resource_file} PARENT_SCOPE)
     endif()
-  endif(MSVC)
+  endif(MSVC AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
 
   set(${OUT_VAR} ${sources} PARENT_SCOPE)
 endfunction(add_windows_version_resource_file)
@@ -671,7 +692,7 @@
     # it forces Xcode to properly link the static library.
     list(APPEND ALL_FILES "${LLVM_MAIN_SRC_DIR}/cmake/dummy.cpp")
   endif()
-  
+
   if( EXCLUDE_FROM_ALL )
     add_executable(${name} EXCLUDE_FROM_ALL ${ALL_FILES})
   else()
@@ -864,7 +885,7 @@
   set_target_properties(${name} PROPERTIES FOLDER "Utils")
   if( LLVM_INSTALL_UTILS AND LLVM_BUILD_UTILS )
     install (TARGETS ${name}
-      RUNTIME DESTINATION bin
+      RUNTIME DESTINATION ${LLVM_UTILS_INSTALL_DIR}
       COMPONENT ${name})
     if (NOT CMAKE_CONFIGURATION_TYPES)
       add_custom_target(install-${name}
@@ -876,6 +897,23 @@
   endif()
 endmacro(add_llvm_utility name)
 
+macro(add_llvm_fuzzer name)
+  cmake_parse_arguments(ARG "" "DUMMY_MAIN" "" ${ARGN})
+  if( LLVM_LIB_FUZZING_ENGINE )
+    set(LLVM_OPTIONAL_SOURCES ${ARG_DUMMY_MAIN})
+    add_llvm_executable(${name} ${ARG_UNPARSED_ARGUMENTS})
+    target_link_libraries(${name} ${LLVM_LIB_FUZZING_ENGINE})
+    set_target_properties(${name} PROPERTIES FOLDER "Fuzzers")
+  elseif( LLVM_USE_SANITIZE_COVERAGE )
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer")
+    set(LLVM_OPTIONAL_SOURCES ${ARG_DUMMY_MAIN})
+    add_llvm_executable(${name} ${ARG_UNPARSED_ARGUMENTS})
+    set_target_properties(${name} PROPERTIES FOLDER "Fuzzers")
+  elseif( ARG_DUMMY_MAIN )
+    add_llvm_executable(${name} ${ARG_DUMMY_MAIN} ${ARG_UNPARSED_ARGUMENTS})
+    set_target_properties(${name} PROPERTIES FOLDER "Fuzzers")
+  endif()
+endmacro()
 
 macro(add_llvm_target target_name)
   include_directories(BEFORE
@@ -1004,6 +1042,13 @@
     set(EXCLUDE_FROM_ALL ON)
   endif()
 
+  # Our current version of gtest does not properly recognize C++11 support
+  # with MSVC, so it falls back to tr1 / experimental classes.  Since LLVM
+  # itself requires C++11, we can safely force it on unconditionally so that
+  # we don't have to fight with the buggy gtest check.  
+  add_definitions(-DGTEST_LANG_CXX11=1)
+  add_definitions(-DGTEST_HAS_TR1_TUPLE=0)
+
   include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
   include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock/include)
   if (NOT LLVM_ENABLE_THREADS)
@@ -1084,7 +1129,17 @@
 # variables needed for the 'lit.site.cfg' files. This function bundles the
 # common variables that any Lit instance is likely to need, and custom
 # variables can be passed in.
-function(configure_lit_site_cfg input output)
+function(configure_lit_site_cfg site_in site_out)
+  cmake_parse_arguments(ARG "" "" "MAIN_CONFIG;OUTPUT_MAPPING" ${ARGN})
+
+  if ("${ARG_MAIN_CONFIG}" STREQUAL "")
+    get_filename_component(INPUT_DIR ${site_in} DIRECTORY)
+    set(ARG_MAIN_CONFIG "${INPUT_DIR}/lit.cfg")
+  endif()
+  if ("${ARG_OUTPUT_MAPPING}" STREQUAL "")
+    set(ARG_OUTPUT_MAPPING "${site_out}")
+  endif()
+
   foreach(c ${LLVM_TARGETS_TO_BUILD})
     set(TARGETS_BUILT "${TARGETS_BUILT} ${c}")
   endforeach(c)
@@ -1102,11 +1157,11 @@
   # They below might not be the build tree but provided binary tree.
   set(LLVM_SOURCE_DIR ${LLVM_MAIN_SRC_DIR})
   set(LLVM_BINARY_DIR ${LLVM_BINARY_DIR})
-  string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} LLVM_TOOLS_DIR ${LLVM_TOOLS_BINARY_DIR})
-  string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} LLVM_LIBS_DIR  ${LLVM_LIBRARY_DIR})
+  string(REPLACE "${CMAKE_CFG_INTDIR}" "${LLVM_BUILD_MODE}" LLVM_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}")
+  string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} LLVM_LIBS_DIR  "${LLVM_LIBRARY_DIR}")
 
   # SHLIBDIR points the build tree.
-  string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} SHLIBDIR "${LLVM_SHLIB_OUTPUT_INTDIR}")
+  string(REPLACE "${CMAKE_CFG_INTDIR}" "${LLVM_BUILD_MODE}" SHLIBDIR "${LLVM_SHLIB_OUTPUT_INTDIR}")
 
   set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
   # FIXME: "ENABLE_SHARED" doesn't make sense, since it is used just for
@@ -1130,9 +1185,89 @@
   set(HOST_CXX "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}")
   set(HOST_LDFLAGS "${CMAKE_EXE_LINKER_FLAGS}")
 
-  set(LIT_SITE_CFG_IN_HEADER  "## Autogenerated from ${input}\n## Do not edit!")
+  set(LIT_SITE_CFG_IN_HEADER  "## Autogenerated from ${site_in}\n## Do not edit!")
 
-  configure_file(${input} ${output} @ONLY)
+  # Override config_target_triple (and the env)
+  if(LLVM_TARGET_TRIPLE_ENV)
+    # This is expanded into the heading.
+    string(CONCAT LIT_SITE_CFG_IN_HEADER "${LIT_SITE_CFG_IN_HEADER}\n\n"
+      "import os\n"
+      "target_env = \"${LLVM_TARGET_TRIPLE_ENV}\"\n"
+      "config.target_triple = config.environment[target_env] = os.environ.get(target_env, \"${TARGET_TRIPLE}\")\n"
+      )
+
+    # This is expanded to: config.target_triple = ""+config.target_triple+""
+    set(TARGET_TRIPLE "\"+config.target_triple+\"")
+  endif()
+
+  string(CONCAT LIT_SITE_CFG_IN_FOOTER
+     "import lit.llvm\n"
+     "lit.llvm.initialize(lit_config, config)\n")
+
+  configure_file(${site_in} ${site_out} @ONLY)
+  if (EXISTS "${ARG_MAIN_CONFIG}")
+    set(PYTHON_STATEMENT "map_config('${ARG_MAIN_CONFIG}', '${site_out}')")
+    get_property(LLVM_LIT_CONFIG_MAP GLOBAL PROPERTY LLVM_LIT_CONFIG_MAP)
+    set(LLVM_LIT_CONFIG_MAP "${LLVM_LIT_CONFIG_MAP}\n${PYTHON_STATEMENT}")
+    set_property(GLOBAL PROPERTY LLVM_LIT_CONFIG_MAP ${LLVM_LIT_CONFIG_MAP})
+  endif()
+endfunction()
+
+function(dump_all_cmake_variables)
+  get_cmake_property(_variableNames VARIABLES)
+  foreach (_variableName ${_variableNames})
+    message(STATUS "${_variableName}=${${_variableName}}")
+  endforeach()
+endfunction()
+
+function(get_llvm_lit_path base_dir file_name)
+  cmake_parse_arguments(ARG "ALLOW_EXTERNAL" "" "" ${ARGN})
+  # Sets <base_dir> and <file_name> in the caller's scope; with ALLOW_EXTERNAL an existing LLVM_EXTERNAL_LIT path wins.
+  if (ARG_ALLOW_EXTERNAL)
+    set(LLVM_DEFAULT_EXTERNAL_LIT "${LLVM_EXTERNAL_LIT}")
+    set (LLVM_EXTERNAL_LIT "" CACHE STRING "Command used to spawn lit")
+    if ("${LLVM_EXTERNAL_LIT}" STREQUAL "")
+      set(LLVM_EXTERNAL_LIT "${LLVM_DEFAULT_EXTERNAL_LIT}")
+    endif()
+
+    if (NOT "${LLVM_EXTERNAL_LIT}" STREQUAL "")
+      if (EXISTS ${LLVM_EXTERNAL_LIT})
+        get_filename_component(LIT_FILE_NAME ${LLVM_EXTERNAL_LIT} NAME)
+        get_filename_component(LIT_BASE_DIR ${LLVM_EXTERNAL_LIT} DIRECTORY)
+        set(${file_name} ${LIT_FILE_NAME} PARENT_SCOPE)
+        set(${base_dir} ${LIT_BASE_DIR} PARENT_SCOPE)
+        return()
+      else()
+        message(WARNING "LLVM_EXTERNAL_LIT set to ${LLVM_EXTERNAL_LIT}, but the path does not exist.")
+      endif()
+    endif()
+  endif()
+
+  set(lit_file_name "llvm-lit")
+  if (WIN32 AND NOT CYGWIN)
+    # llvm-lit needs the .py suffix for multiprocess to find a main module.
+    set(lit_file_name "${lit_file_name}.py")
+  endif ()
+  set(${file_name} ${lit_file_name} PARENT_SCOPE)
+  # Report any base dir recorded by an earlier call; it is recomputed and overwritten below.
+  get_property(LLVM_LIT_BASE_DIR GLOBAL PROPERTY LLVM_LIT_BASE_DIR)
+  if (NOT "${LLVM_LIT_BASE_DIR}" STREQUAL "")
+    set(${base_dir} ${LLVM_LIT_BASE_DIR} PARENT_SCOPE)
+  endif()
+
+  # Allow individual projects to provide an override
+  if (NOT "${LLVM_LIT_OUTPUT_DIR}" STREQUAL "")
+    set(LLVM_LIT_BASE_DIR ${LLVM_LIT_OUTPUT_DIR})
+  elseif(NOT "${LLVM_RUNTIME_OUTPUT_INTDIR}" STREQUAL "")
+    set(LLVM_LIT_BASE_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
+  else()
+    set(LLVM_LIT_BASE_DIR "")
+  endif()
+
+  # Cache this so we don't have to do it again and have subsequent calls
+  # potentially disagree on the value.
+  set_property(GLOBAL PROPERTY LLVM_LIT_BASE_DIR ${LLVM_LIT_BASE_DIR})
+  set(${base_dir} ${LLVM_LIT_BASE_DIR} PARENT_SCOPE)
+endfunction()
 
 # A raw function to create a lit target. This is used to implement the testuite
@@ -1144,17 +1279,16 @@
   if (NOT CMAKE_CFG_INTDIR STREQUAL ".")
     list(APPEND LIT_ARGS --param build_mode=${CMAKE_CFG_INTDIR})
   endif ()
-  if (EXISTS ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py)
-    # reset cache after erraneous r283029
-    # TODO: remove this once all buildbots run
-    if (LIT_COMMAND STREQUAL "${PYTHON_EXECUTABLE} ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py")
-      unset(LIT_COMMAND CACHE)
-    endif()
-    set (LIT_COMMAND "${PYTHON_EXECUTABLE};${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py"
-         CACHE STRING "Command used to spawn llvm-lit")
-  else()
-    find_program(LIT_COMMAND NAMES llvm-lit lit.py lit)
-  endif ()
+
+  # Get the path to the lit to *run* tests with.  This can be overridden by
+  # the user by specifying -DLLVM_EXTERNAL_LIT=<path-to-lit.py>
+  get_llvm_lit_path(
+    lit_base_dir
+    lit_file_name
+    ALLOW_EXTERNAL
+    )
+
+  set(LIT_COMMAND "${PYTHON_EXECUTABLE};${lit_base_dir}/${lit_file_name}")
   list(APPEND LIT_COMMAND ${LIT_ARGS})
   foreach(param ${ARG_PARAMS})
     list(APPEND LIT_COMMAND --param ${param})
@@ -1314,7 +1448,7 @@
   # magic. First we grab one of the types, and a type-specific path. Then from
   # the type-specific path we find the last occurrence of the type in the path,
   # and replace it with CMAKE_CFG_INTDIR. This allows the build step to be type
-  # agnostic again. 
+  # agnostic again.
   if(NOT ARG_OUTPUT_DIR)
     # If you're not overriding the OUTPUT_DIR, we can make the link relative in
     # the same directory.
@@ -1464,3 +1598,36 @@
   set(sandbox_command "sandbox-exec -p '(version 1) (allow default) ${deny_attributes_gen} ${deny_intrinsics_gen}'")
   set_target_properties(${name} PROPERTIES RULE_LAUNCH_COMPILE ${sandbox_command})
 endfunction()
+
+# Figure out if we can track VC revisions.
+function(find_first_existing_file out_var)
+  foreach(file ${ARGN})
+    if(EXISTS "${file}")
+      set(${out_var} "${file}" PARENT_SCOPE)
+      return()
+    endif()
+  endforeach()
+endfunction()
+
+macro(find_first_existing_vc_file out_var path)
+    find_program(git_executable NAMES git git.exe git.cmd)
+    # Run from a subdirectory to force git to print an absolute path.
+    execute_process(COMMAND ${git_executable} rev-parse --git-dir
+      WORKING_DIRECTORY ${path}/cmake
+      RESULT_VARIABLE git_result
+      OUTPUT_VARIABLE git_dir
+      ERROR_QUIET)
+    if(git_result EQUAL 0)
+      string(STRIP "${git_dir}" git_dir)
+      set(${out_var} "${git_dir}/logs/HEAD")
+      # some branchless cases (e.g. 'repo') may not yet have .git/logs/HEAD
+      if (NOT EXISTS "${git_dir}/logs/HEAD")
+        file(WRITE "${git_dir}/logs/HEAD" "")
+      endif()
+    else()
+      find_first_existing_file(${out_var}
+        "${path}/.svn/wc.db"   # SVN 1.7
+        "${path}/.svn/entries" # SVN 1.6
+      )
+    endif()
+endmacro()
diff --git a/cmake/modules/AddOCaml.cmake b/cmake/modules/AddOCaml.cmake
index 1b805c0..1d8094c 100644
--- a/cmake/modules/AddOCaml.cmake
+++ b/cmake/modules/AddOCaml.cmake
@@ -87,6 +87,11 @@
   foreach( include_dir ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} )
     set(c_flags "${c_flags} -I${include_dir}")
   endforeach()
+  # include -D/-UNDEBUG to match dump function visibility
+  # regex from HandleLLVMOptions.cmake
+  string(REGEX MATCH "(^| )[/-][UD] *NDEBUG($| )" flag_matches
+         "${CMAKE_C_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_C_FLAGS}")
+  set(c_flags "${c_flags} ${flag_matches}")
 
   foreach( ocaml_file ${ARG_OCAML} )
     list(APPEND sources "${ocaml_file}.mli" "${ocaml_file}.ml")
@@ -199,7 +204,7 @@
           PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
                       GROUP_READ GROUP_EXECUTE
                       WORLD_READ WORLD_EXECUTE
-          DESTINATION "${LLVM_OCAML_INSTALL_PATH}/llvm")
+          DESTINATION "${LLVM_OCAML_INSTALL_PATH}/stublibs")
 
   foreach( install_file ${install_files} ${install_shlibs} )
     get_filename_component(filename "${install_file}" NAME)
diff --git a/cmake/modules/AddSphinxTarget.cmake b/cmake/modules/AddSphinxTarget.cmake
index 3456b53..4540c5c 100644
--- a/cmake/modules/AddSphinxTarget.cmake
+++ b/cmake/modules/AddSphinxTarget.cmake
@@ -1,3 +1,16 @@
+
+# Create sphinx target
+if (LLVM_ENABLE_SPHINX)
+  message(STATUS "Sphinx enabled.")
+  find_package(Sphinx REQUIRED)
+  if (LLVM_BUILD_DOCS AND NOT TARGET sphinx)
+    add_custom_target(sphinx ALL)
+  endif()
+else()
+  message(STATUS "Sphinx disabled.")
+endif()
+
+
 # Handy function for creating the different Sphinx targets.
 #
 # ``builder`` should be one of the supported builders used by
@@ -48,10 +61,15 @@
     # Handle installation
     if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
       if (builder STREQUAL man)
+        if (CMAKE_INSTALL_MANDIR)
+          set(INSTALL_MANDIR ${CMAKE_INSTALL_MANDIR}/)
+        else()
+          set(INSTALL_MANDIR share/man/)
+        endif()
         # FIXME: We might not ship all the tools that these man pages describe
         install(DIRECTORY "${SPHINX_BUILD_DIR}/" # Slash indicates contents of
                 COMPONENT "${project}-sphinx-man"
-                DESTINATION share/man/man1)
+                DESTINATION ${INSTALL_MANDIR}man1)
 
       elseif (builder STREQUAL html)
         string(TOUPPER "${project}" project_upper)
diff --git a/cmake/modules/CheckLinkerFlag.cmake b/cmake/modules/CheckLinkerFlag.cmake
index e96d35e..fe9d01a 100644
--- a/cmake/modules/CheckLinkerFlag.cmake
+++ b/cmake/modules/CheckLinkerFlag.cmake
@@ -1,8 +1,6 @@
 include(CheckCXXCompilerFlag)
 
 function(check_linker_flag flag out_var)
-  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
-  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${flag}")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}")
   check_cxx_compiler_flag("" ${out_var})
-  set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
 endfunction()
diff --git a/cmake/modules/GetSVN.cmake b/cmake/modules/GetSVN.cmake
index d512bd2..f729395 100644
--- a/cmake/modules/GetSVN.cmake
+++ b/cmake/modules/GetSVN.cmake
@@ -1,17 +1,15 @@
 # CMake project that writes Subversion revision information to a header.
 #
 # Input variables:
-#   FIRST_SOURCE_DIR  - First source directory
-#   FIRST_NAME        - The macro prefix for the first repository's info
-#   SECOND_SOURCE_DIR - Second source directory (opt)
-#   SECOND_NAME       - The macro prefix for the second repository's info (opt)
-#   HEADER_FILE       - The header file to write
+#   SOURCE_DIRS - A list of source directories.
+#   NAMES       - A list of macro prefixes for each of the source directories.
+#   HEADER_FILE - The header file to write
 #
-# The output header will contain macros FIRST_REPOSITORY and FIRST_REVISION,
-# and SECOND_REPOSITORY and SECOND_REVISION if requested, where "FIRST" and
-# "SECOND" are substituted with the names specified in the input variables.
+# The output header will contain macros <NAME>_REPOSITORY and <NAME>_REVISION,
+# where "<NAME>" is substituted with the names specified in the input
+# variables, for each of the SOURCE_DIRS given.
 
-# Chop off cmake/modules/GetSVN.cmake 
+# Chop off cmake/modules/GetSVN.cmake
 get_filename_component(LLVM_DIR "${CMAKE_SCRIPT_MODE_FILE}" PATH)
 get_filename_component(LLVM_DIR "${LLVM_DIR}" PATH)
 get_filename_component(LLVM_DIR "${LLVM_DIR}" PATH)
@@ -86,7 +84,7 @@
 function(get_source_info path revision repository)
   if (EXISTS "${path}/.svn")
     get_source_info_svn("${path}" revision repository)
-  elseif (EXISTS "${path}/.git/svn")
+  elseif (EXISTS "${path}/.git/svn/refs")
     get_source_info_git_svn("${path}" revision repository)
   elseif (EXISTS "${path}/.git")
     get_source_info_git("${path}" revision repository)
@@ -103,9 +101,37 @@
     "#define ${name}_REPOSITORY \"${repository}\"\n")
 endfunction()
 
-append_info(${FIRST_NAME} "${FIRST_SOURCE_DIR}")
-if(DEFINED SECOND_SOURCE_DIR)
-  append_info(${SECOND_NAME} "${SECOND_SOURCE_DIR}")
+function(validate_inputs source_dirs names)
+  list(LENGTH source_dirs source_dirs_length)
+  list(LENGTH names names_length)
+  if (NOT source_dirs_length EQUAL names_length)
+    message(FATAL_ERROR
+            "GetSVN.cmake takes two arguments: a list of source directories, "
+            "and a list of names. Expected two lists must be of equal length, "
+            "but got ${source_dirs_length} source directories and "
+            "${names_length} names.")
+  endif()
+endfunction()
+
+if (DEFINED SOURCE_DIRS AND DEFINED NAMES)
+  validate_inputs("${SOURCE_DIRS}" "${NAMES}")
+
+  list(LENGTH SOURCE_DIRS source_dirs_length)
+  math(EXPR source_dirs_max_index ${source_dirs_length}-1)
+  foreach(index RANGE ${source_dirs_max_index})
+    list(GET SOURCE_DIRS ${index} source_dir)
+    list(GET NAMES ${index} name)
+    append_info(${name} ${source_dir})
+  endforeach()
+endif()
+
+# Allow -DFIRST_SOURCE_DIR arguments until Clang migrates to the new
+# -DSOURCE_DIRS argument.
+if(DEFINED FIRST_SOURCE_DIR)
+  append_info(${FIRST_NAME} "${FIRST_SOURCE_DIR}")
+  if(DEFINED SECOND_SOURCE_DIR)
+    append_info(${SECOND_NAME} "${SECOND_SOURCE_DIR}")
+  endif()
 endif()
 
 # Copy the file only if it has changed.
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index d776285..03b9664 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -8,7 +8,6 @@
 
 include(CheckCompilerVersion)
 include(HandleLLVMStdlib)
-include(AddLLVMDefinitions)
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
 
@@ -18,6 +17,9 @@
   set(LINKER_IS_LLD_LINK FALSE)
 endif()
 
+set(LLVM_ENABLE_LTO OFF CACHE STRING "Build LLVM with LTO. May be specified as Thin or Full to use a particular kind of LTO")
+string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
+
 # Ninja Job Pool support
 # The following only works with the Ninja generator in CMake >= 3.0.
 set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
@@ -33,16 +35,19 @@
 
 set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING
   "Define the maximum number of concurrent link jobs.")
-if(LLVM_PARALLEL_LINK_JOBS)
-  if(NOT CMAKE_MAKE_PROGRAM MATCHES "ninja")
-    message(WARNING "Job pooling is only available with Ninja generators.")
-  else()
+if(CMAKE_MAKE_PROGRAM MATCHES "ninja")
+  if(NOT LLVM_PARALLEL_LINK_JOBS AND uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
+    message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 2.")
+    set(LLVM_PARALLEL_LINK_JOBS "2")
+  endif()
+  if(LLVM_PARALLEL_LINK_JOBS)
     set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${LLVM_PARALLEL_LINK_JOBS})
     set(CMAKE_JOB_POOL_LINK link_job_pool)
   endif()
+elseif(LLVM_PARALLEL_LINK_JOBS)
+  message(WARNING "Job pooling is only available with Ninja generators.")
 endif()
 
-
 if (LINKER_IS_LLD_LINK)
   # Pass /MANIFEST:NO so that CMake doesn't run mt.exe on our binaries.  Adding
   # manifests with mt.exe breaks LLD's symbol tables and takes as much time as
@@ -96,6 +101,10 @@
   message(FATAL_ERROR "Unknown value for LLVM_ABI_BREAKING_CHECKS: \"${LLVM_ABI_BREAKING_CHECKS}\"!")
 endif()
 
+if( LLVM_REVERSE_ITERATION )
+  set( LLVM_ENABLE_REVERSE_ITERATION 1 )
+endif()
+
 if(WIN32)
   set(LLVM_HAVE_LINK_VERSION_SCRIPT 0)
   if(CYGWIN)
@@ -185,10 +194,13 @@
 endif()
 
 if( LLVM_USE_LINKER )
-  check_cxx_compiler_flag("-fuse-ld=${LLVM_USE_LINKER}" CXX_SUPPORTS_CUSTOM_LINKER)
+  set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -fuse-ld=${LLVM_USE_LINKER}")
+  check_cxx_source_compiles("int main() { return 0; }" CXX_SUPPORTS_CUSTOM_LINKER)
   if ( NOT CXX_SUPPORTS_CUSTOM_LINKER )
 	  message(FATAL_ERROR "Host compiler does not support '-fuse-ld=${LLVM_USE_LINKER}'")
   endif()
+  set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
   append("-fuse-ld=${LLVM_USE_LINKER}"
     CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
 endif()
@@ -223,6 +235,17 @@
   endif( LLVM_BUILD_32_BITS )
 endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
 
+# If building on a GNU specific 32-bit system, make sure off_t is 64 bits
+# so that off_t can store offsets > 2GB.
+# Android until version N (API 24) doesn't support it.
+if (ANDROID AND (ANDROID_NATIVE_API_LEVEL LESS 24))
+  set(LLVM_FORCE_SMALLFILE_FOR_ANDROID TRUE)
+endif()
+if( CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT LLVM_FORCE_SMALLFILE_FOR_ANDROID)
+  add_definitions( -D_LARGEFILE_SOURCE )
+  add_definitions( -D_FILE_OFFSET_BITS=64 )
+endif()
+
 if( XCODE )
   # For Xcode enable several build settings that correspond to
   # many warnings that are on by default in Clang but are
@@ -253,10 +276,10 @@
     "Number of parallel compiler jobs. 0 means use all processors. Default is 0.")
   if( NOT LLVM_COMPILER_JOBS STREQUAL "1" )
     if( LLVM_COMPILER_JOBS STREQUAL "0" )
-      add_llvm_definitions( /MP )
+      add_definitions( /MP )
     else()
       message(STATUS "Number of parallel compiler jobs set to " ${LLVM_COMPILER_JOBS})
-      add_llvm_definitions( /MP${LLVM_COMPILER_JOBS} )
+      add_definitions( /MP${LLVM_COMPILER_JOBS} )
     endif()
   else()
     message(STATUS "Parallel compilation disabled")
@@ -285,17 +308,17 @@
   if( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.0 )
     # For MSVC 2013, disable iterator null pointer checking in debug mode,
     # especially so std::equal(nullptr, nullptr, nullptr) will not assert.
-    add_llvm_definitions("-D_DEBUG_POINTER_IMPL=")
+    add_definitions("-D_DEBUG_POINTER_IMPL=")
   endif()
-  
+
   include(ChooseMSVCCRT)
 
   if( MSVC11 )
-    add_llvm_definitions(-D_VARIADIC_MAX=10)
+    add_definitions(-D_VARIADIC_MAX=10)
   endif()
-  
+
   # Add definitions that make MSVC much less annoying.
-  add_llvm_definitions(
+  add_definitions(
     # For some reason MS wants to deprecate a bunch of standard functions...
     -D_CRT_SECURE_NO_DEPRECATE
     -D_CRT_SECURE_NO_WARNINGS
@@ -306,94 +329,15 @@
     )
 
   # Tell MSVC to use the Unicode version of the Win32 APIs instead of ANSI.
-  add_llvm_definitions(
+  add_definitions(
     -DUNICODE
     -D_UNICODE
   )
 
-  set(msvc_warning_flags
-    # Disabled warnings.
-    -wd4141 # Suppress ''modifier' : used more than once' (because of __forceinline combined with inline)
-    -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned'
-    -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored'
-    -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
-    -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used'
-    -wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
-    -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
-    -wd4345 # Suppress 'behavior change: an object of POD type constructed with an initializer of the form () will be default-initialized'
-    -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized'
-    -wd4355 # Suppress ''this' : used in base member initializer list'
-    -wd4456 # Suppress 'declaration of 'var' hides local variable'
-    -wd4457 # Suppress 'declaration of 'var' hides function parameter'
-    -wd4458 # Suppress 'declaration of 'var' hides class member'
-    -wd4459 # Suppress 'declaration of 'var' hides global declaration'
-    -wd4503 # Suppress ''identifier' : decorated name length exceeded, name was truncated'
-    -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
-    -wd4722 # Suppress 'function' : destructor never returns, potential memory leak
-    -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
-    -wd4100 # Suppress 'unreferenced formal parameter'
-    -wd4127 # Suppress 'conditional expression is constant'
-    -wd4512 # Suppress 'assignment operator could not be generated'
-    -wd4505 # Suppress 'unreferenced local function has been removed'
-    -wd4610 # Suppress '<class> can never be instantiated'
-    -wd4510 # Suppress 'default constructor could not be generated'
-    -wd4702 # Suppress 'unreachable code'
-    -wd4245 # Suppress 'signed/unsigned mismatch'
-    -wd4706 # Suppress 'assignment within conditional expression'
-    -wd4310 # Suppress 'cast truncates constant value'
-    -wd4701 # Suppress 'potentially uninitialized local variable'
-    -wd4703 # Suppress 'potentially uninitialized local pointer variable'
-    -wd4389 # Suppress 'signed/unsigned mismatch'
-    -wd4611 # Suppress 'interaction between '_setjmp' and C++ object destruction is non-portable'
-    -wd4805 # Suppress 'unsafe mix of type <type> and type <type> in operation'
-    -wd4204 # Suppress 'nonstandard extension used : non-constant aggregate initializer'
-    -wd4577 # Suppress 'noexcept used with no exception handling mode specified; termination on exception is not guaranteed'
-    -wd4091 # Suppress 'typedef: ignored on left of '' when no variable is declared'
-        # C4592 is disabled because of false positives in Visual Studio 2015
-        # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2.
-    -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation)
-    -wd4319 # Suppress ''operator' : zero extending 'type' to 'type' of greater size'
-
-	# Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't
-	# support the 'aligned' attribute in the way that clang sources requires (for
-	# any code that uses the LLVM_ALIGNAS macro), so this is must be disabled to
-	# avoid unwanted alignment warnings.
-	# When we switch to requiring a version of MSVC that supports the 'alignas'
-	# specifier (MSVC 2015?) this warning can be re-enabled.
-    -wd4324 # Suppress 'structure was padded due to __declspec(align())'
-
-    # Promoted warnings.
-    -w14062 # Promote 'enumerator in switch of enum is not handled' to level 1 warning.
-
-    # Promoted warnings to errors.
-    -we4238 # Promote 'nonstandard extension used : class rvalue used as lvalue' to error.
-    )
-
-  # Enable warnings
-  if (LLVM_ENABLE_WARNINGS)
-    # Put /W4 in front of all the -we flags. cl.exe doesn't care, but for
-    # clang-cl having /W4 after the -we flags will re-enable the warnings
-    # disabled by -we.
-    set(msvc_warning_flags "/W4 ${msvc_warning_flags}")
-    # CMake appends /W3 by default, and having /W3 followed by /W4 will result in 
-    # cl : Command line warning D9025 : overriding '/W3' with '/W4'.  Since this is
-    # a command line warning and not a compiler warning, it cannot be suppressed except
-    # by fixing the command line.
-    string(REGEX REPLACE " /W[0-4]" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    string(REGEX REPLACE " /W[0-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-
-    if (LLVM_ENABLE_PEDANTIC)
-      # No MSVC equivalent available
-    endif (LLVM_ENABLE_PEDANTIC)
-  endif (LLVM_ENABLE_WARNINGS)
   if (LLVM_ENABLE_WERROR)
-    append("/WX" msvc_warning_flags)
+    append("/WX" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
   endif (LLVM_ENABLE_WERROR)
 
-  foreach(flag ${msvc_warning_flags})
-    append("${flag}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-  endforeach(flag)
-
   append("/Zc:inline" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
 
   # /Zc:strictStrings is incompatible with VS12's (Visual Studio 2013's)
@@ -433,7 +377,7 @@
 
       string(FIND "${upper_exe_flags} ${upper_module_flags} ${upper_shared_flags}"
         "/INCREMENTAL" linker_flag_idx)
-      
+
       if (${linker_flag_idx} GREATER -1)
         message(WARNING "/Brepro not compatible with /INCREMENTAL linking - builds will be non-deterministic")
       else()
@@ -443,68 +387,16 @@
   endif()
 
 elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
-  if (LLVM_ENABLE_WARNINGS)
-    append("-Wall -W -Wno-unused-parameter -Wwrite-strings" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    append("-Wcast-qual" CMAKE_CXX_FLAGS)
-
-    # Turn off missing field initializer warnings for gcc to avoid noise from
-    # false positives with empty {}. Turn them on otherwise (they're off by
-    # default for clang).
-    check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
-    if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
-      if (CMAKE_COMPILER_IS_GNUCXX)
-        append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-      else()
-        append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-      endif()
-    endif()
-
-    append_if(LLVM_ENABLE_PEDANTIC "-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    append_if(LLVM_ENABLE_PEDANTIC "-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
-    append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
-    append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
-
-    # Check if -Wnon-virtual-dtor warns even though the class is marked final.
-    # If it does, don't add it. So it won't be added on clang 3.4 and older.
-    # This also catches cases when -Wnon-virtual-dtor isn't supported by
-    # the compiler at all.  This flag is not activated for gcc since it will
-    # incorrectly identify a protected non-virtual base when there is a friend
-    # declaration.
-    if (NOT CMAKE_COMPILER_IS_GNUCXX)
-      set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11 -Werror=non-virtual-dtor")
-      CHECK_CXX_SOURCE_COMPILES("class base {public: virtual void anchor();protected: ~base();};
-                                 class derived final : public base { public: ~derived();};
-                                 int main() { return 0; }"
-                                CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR)
-      set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
-      append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR
-                "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
-    endif()
-
-    # Enable -Wdelete-non-virtual-dtor if available.
-    add_flag_if_supported("-Wdelete-non-virtual-dtor" DELETE_NON_VIRTUAL_DTOR_FLAG)
-
-    # Check if -Wcomment is OK with an // comment ending with '\' if the next
-    # line is also a // comment.
-    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment")
-    CHECK_C_SOURCE_COMPILES("// \\\\\\n//\\nint main() {return 0;}"
-                            C_WCOMMENT_ALLOWS_LINE_WRAP)
-    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
-    if (NOT C_WCOMMENT_ALLOWS_LINE_WRAP)
-      append("-Wno-comment" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    endif()
-
-    # Enable -Wstring-conversion to catch misuse of string literals.
-    add_flag_if_supported("-Wstring-conversion" STRING_CONVERSION_FLAG)
-  endif (LLVM_ENABLE_WARNINGS)
   append_if(LLVM_ENABLE_WERROR "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  append_if(LLVM_ENABLE_WERROR "-Wno-error" CMAKE_REQUIRED_FLAGS)
   add_flag_if_supported("-Werror=date-time" WERROR_DATE_TIME)
+  add_flag_if_supported("-Werror=unguarded-availability-new" WERROR_UNGUARDED_AVAILABILITY_NEW)
   if (LLVM_ENABLE_CXX1Y)
     check_cxx_compiler_flag("-std=c++1y" CXX_SUPPORTS_CXX1Y)
     append_if(CXX_SUPPORTS_CXX1Y "-std=c++1y" CMAKE_CXX_FLAGS)
+  elseif(LLVM_ENABLE_CXX1Z)
+    check_cxx_compiler_flag("-std=c++1z" CXX_SUPPORTS_CXX1Z)
+    append_if(CXX_SUPPORTS_CXX1Z "-std=c++1z" CMAKE_CXX_FLAGS)
   else()
     check_cxx_compiler_flag("-std=c++11" CXX_SUPPORTS_CXX11)
     if (CXX_SUPPORTS_CXX11)
@@ -553,6 +445,155 @@
   endif(LLVM_ENABLE_MODULES)
 endif( MSVC )
 
+if (MSVC AND NOT CLANG_CL)
+  set(msvc_warning_flags
+    # Disabled warnings.
+    -wd4141 # Suppress ''modifier' : used more than once' (because of __forceinline combined with inline)
+    -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned'
+    -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored'
+    -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
+    -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used'
+    -wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
+    -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
+    -wd4345 # Suppress 'behavior change: an object of POD type constructed with an initializer of the form () will be default-initialized'
+    -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized'
+    -wd4355 # Suppress ''this' : used in base member initializer list'
+    -wd4456 # Suppress 'declaration of 'var' hides local variable'
+    -wd4457 # Suppress 'declaration of 'var' hides function parameter'
+    -wd4458 # Suppress 'declaration of 'var' hides class member'
+    -wd4459 # Suppress 'declaration of 'var' hides global declaration'
+    -wd4503 # Suppress ''identifier' : decorated name length exceeded, name was truncated'
+    -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
+    -wd4722 # Suppress 'function' : destructor never returns, potential memory leak
+    -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
+    -wd4100 # Suppress 'unreferenced formal parameter'
+    -wd4127 # Suppress 'conditional expression is constant'
+    -wd4512 # Suppress 'assignment operator could not be generated'
+    -wd4505 # Suppress 'unreferenced local function has been removed'
+    -wd4610 # Suppress '<class> can never be instantiated'
+    -wd4510 # Suppress 'default constructor could not be generated'
+    -wd4702 # Suppress 'unreachable code'
+    -wd4245 # Suppress 'signed/unsigned mismatch'
+    -wd4706 # Suppress 'assignment within conditional expression'
+    -wd4310 # Suppress 'cast truncates constant value'
+    -wd4701 # Suppress 'potentially uninitialized local variable'
+    -wd4703 # Suppress 'potentially uninitialized local pointer variable'
+    -wd4389 # Suppress 'signed/unsigned mismatch'
+    -wd4611 # Suppress 'interaction between '_setjmp' and C++ object destruction is non-portable'
+    -wd4805 # Suppress 'unsafe mix of type <type> and type <type> in operation'
+    -wd4204 # Suppress 'nonstandard extension used : non-constant aggregate initializer'
+    -wd4577 # Suppress 'noexcept used with no exception handling mode specified; termination on exception is not guaranteed'
+    -wd4091 # Suppress 'typedef: ignored on left of '' when no variable is declared'
+        # C4592 is disabled because of false positives in Visual Studio 2015
+        # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2.
+    -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation)
+    -wd4319 # Suppress ''operator' : zero extending 'type' to 'type' of greater size'
+
+    # Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't
+    # support the 'aligned' attribute in the way that clang sources require (for
+    # any code that uses the LLVM_ALIGNAS macro), so this must be disabled to
+    # avoid unwanted alignment warnings.
+    # When we switch to requiring a version of MSVC that supports the 'alignas'
+    # specifier (MSVC 2015?) this warning can be re-enabled.
+    -wd4324 # Suppress 'structure was padded due to __declspec(align())'
+
+    # Promoted warnings.
+    -w14062 # Promote 'enumerator in switch of enum is not handled' to level 1 warning.
+
+    # Promoted warnings to errors.
+    -we4238 # Promote 'nonstandard extension used : class rvalue used as lvalue' to error.
+    )
+
+  # Enable warnings
+  if (LLVM_ENABLE_WARNINGS)
+    # Put /W4 in front of all the -we flags. cl.exe doesn't care, but for
+    # clang-cl having /W4 after the -we flags will re-enable the warnings
+    # disabled by -we.
+    set(msvc_warning_flags "/W4 ${msvc_warning_flags}")
+    # CMake appends /W3 by default, and having /W3 followed by /W4 will result in
+    # cl : Command line warning D9025 : overriding '/W3' with '/W4'.  Since this is
+    # a command line warning and not a compiler warning, it cannot be suppressed except
+    # by fixing the command line.
+    string(REGEX REPLACE " /W[0-4]" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+    string(REGEX REPLACE " /W[0-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+    if (LLVM_ENABLE_PEDANTIC)
+      # No MSVC equivalent available
+    endif (LLVM_ENABLE_PEDANTIC)
+  endif (LLVM_ENABLE_WARNINGS)
+
+  foreach(flag ${msvc_warning_flags})
+    append("${flag}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  endforeach(flag)
+endif (MSVC AND NOT CLANG_CL)
+
+if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
+  append("-Wall -W -Wno-unused-parameter -Wwrite-strings" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  append("-Wcast-qual" CMAKE_CXX_FLAGS)
+
+  # Turn off missing field initializer warnings for gcc to avoid noise from
+  # false positives with empty {}. Turn them on otherwise (they're off by
+  # default for clang).
+  check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
+  if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
+    if (CMAKE_COMPILER_IS_GNUCXX)
+      append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    else()
+      append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    endif()
+  endif()
+
+  if (LLVM_ENABLE_PEDANTIC AND LLVM_COMPILER_IS_GCC_COMPATIBLE)
+    append("-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    append("-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  endif()
+
+  add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
+  append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
+  append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
+
+  # Check if -Wnon-virtual-dtor warns even though the class is marked final.
+  # If it does, don't add it. So it won't be added on clang 3.4 and older.
+  # This also catches cases when -Wnon-virtual-dtor isn't supported by
+  # the compiler at all.  This flag is not activated for gcc since it will
+  # incorrectly identify a protected non-virtual base when there is a friend
+  # declaration. Don't activate this in general on Windows as this warning has
+  # too many false positives on COM-style classes, which are destroyed with
+  # Release() (PR32286).
+  if (NOT CMAKE_COMPILER_IS_GNUCXX AND NOT WIN32)
+    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11 -Werror=non-virtual-dtor")
+    CHECK_CXX_SOURCE_COMPILES("class base {public: virtual void anchor();protected: ~base();};
+                               class derived final : public base { public: ~derived();};
+                               int main() { return 0; }"
+                              CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR)
+    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+    append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR
+              "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
+  endif()
+
+  # Enable -Wdelete-non-virtual-dtor if available.
+  add_flag_if_supported("-Wdelete-non-virtual-dtor" DELETE_NON_VIRTUAL_DTOR_FLAG)
+
+  # Check if -Wcomment is OK with an // comment ending with '\' if the next
+  # line is also a // comment.
+  set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment")
+  CHECK_C_SOURCE_COMPILES("// \\\\\\n//\\nint main() {return 0;}"
+                          C_WCOMMENT_ALLOWS_LINE_WRAP)
+  set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+  if (NOT C_WCOMMENT_ALLOWS_LINE_WRAP)
+    append("-Wno-comment" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  endif()
+
+  # Enable -Wstring-conversion to catch misuse of string literals.
+  add_flag_if_supported("-Wstring-conversion" STRING_CONVERSION_FLAG)
+endif (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
+
+if (LLVM_COMPILER_IS_GCC_COMPATIBLE AND NOT LLVM_ENABLE_WARNINGS)
+  append("-w" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+endif()
+
 macro(append_common_sanitizer_flags)
   if (NOT MSVC)
     # Append -fno-omit-frame-pointer and turn on debug info to get better
@@ -610,6 +651,9 @@
       append_common_sanitizer_flags()
       append("-fsanitize=address,undefined -fno-sanitize=vptr,function -fno-sanitize-recover=all"
               CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    elseif (LLVM_USE_SANITIZER STREQUAL "Leaks")
+      append_common_sanitizer_flags()
+      append("-fsanitize=leak" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
     else()
       message(FATAL_ERROR "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}")
     endif()
@@ -628,7 +672,7 @@
                           FSANITIZE_USE_AFTER_SCOPE_FLAG)
   endif()
   if (LLVM_USE_SANITIZE_COVERAGE)
-    append("-fsanitize-coverage=trace-pc-guard,indirect-calls,trace-cmp" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    append("-fsanitize=fuzzer-no-link" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
   endif()
 endif()
 
@@ -637,9 +681,9 @@
   add_definitions("-gsplit-dwarf")
 endif()
 
-add_llvm_definitions( -D__STDC_CONSTANT_MACROS )
-add_llvm_definitions( -D__STDC_FORMAT_MACROS )
-add_llvm_definitions( -D__STDC_LIMIT_MACROS )
+add_definitions( -D__STDC_CONSTANT_MACROS )
+add_definitions( -D__STDC_FORMAT_MACROS )
+add_definitions( -D__STDC_LIMIT_MACROS )
 
 # clang doesn't print colored diagnostics when invoked from Ninja
 if (UNIX AND
@@ -651,8 +695,8 @@
 # lld doesn't print colored diagnostics when invoked from Ninja
 if (UNIX AND CMAKE_GENERATOR STREQUAL "Ninja")
   include(CheckLinkerFlag)
-  check_linker_flag("-Wl,-color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS)
-  append_if(LINKER_SUPPORTS_COLOR_DIAGNOSTICS "-Wl,-color-diagnostics"
+  check_linker_flag("-Wl,--color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS)
+  append_if(LINKER_SUPPORTS_COLOR_DIAGNOSTICS "-Wl,--color-diagnostics"
     CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
 endif()
 
@@ -689,15 +733,29 @@
   message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON")
 endif()
 
-option(LLVM_BUILD_INSTRUMENTED "Build LLVM and tools with PGO instrumentation (experimental)" Off)
-mark_as_advanced(LLVM_BUILD_INSTRUMENTED)
-append_if(LLVM_BUILD_INSTRUMENTED "-fprofile-instr-generate='${LLVM_PROFILE_FILE_PATTERN}'"
-  CMAKE_CXX_FLAGS
-  CMAKE_C_FLAGS
-  CMAKE_EXE_LINKER_FLAGS
-  CMAKE_SHARED_LINKER_FLAGS)
+option(LLVM_ENABLE_IR_PGO "Build LLVM and tools with IR PGO instrumentation (experimental)" Off)
+mark_as_advanced(LLVM_ENABLE_IR_PGO)
 
-option(LLVM_BUILD_INSTRUMENTED_COVERAGE "Build LLVM and tools with Code Coverage instrumentation (experimental)" Off)
+option(LLVM_BUILD_INSTRUMENTED "Build LLVM and tools with PGO instrumentation" Off)
+mark_as_advanced(LLVM_BUILD_INSTRUMENTED)
+
+if (LLVM_BUILD_INSTRUMENTED)
+  if (LLVM_ENABLE_IR_PGO)
+    append("-fprofile-generate='${LLVM_PROFILE_DATA_DIR}'"
+      CMAKE_CXX_FLAGS
+      CMAKE_C_FLAGS
+      CMAKE_EXE_LINKER_FLAGS
+      CMAKE_SHARED_LINKER_FLAGS)
+  else()
+    append("-fprofile-instr-generate='${LLVM_PROFILE_FILE_PATTERN}'"
+      CMAKE_CXX_FLAGS
+      CMAKE_C_FLAGS
+      CMAKE_EXE_LINKER_FLAGS
+      CMAKE_SHARED_LINKER_FLAGS)
+  endif()
+endif()
+
+option(LLVM_BUILD_INSTRUMENTED_COVERAGE "Build LLVM and tools with Code Coverage instrumentation" Off)
 mark_as_advanced(LLVM_BUILD_INSTRUMENTED_COVERAGE)
 append_if(LLVM_BUILD_INSTRUMENTED_COVERAGE "-fprofile-instr-generate='${LLVM_PROFILE_FILE_PATTERN}' -fcoverage-mapping"
   CMAKE_CXX_FLAGS
@@ -705,8 +763,6 @@
   CMAKE_EXE_LINKER_FLAGS
   CMAKE_SHARED_LINKER_FLAGS)
 
-set(LLVM_ENABLE_LTO OFF CACHE STRING "Build LLVM with LTO. May be specified as Thin or Full to use a particular kind of LTO")
-string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
 if(LLVM_ENABLE_LTO AND LLVM_ON_WIN32 AND NOT LINKER_IS_LLD_LINK)
   message(FATAL_ERROR "When compiling for Windows, LLVM_ENABLE_LTO requires using lld as the linker (point CMAKE_LINKER at lld-link.exe)")
 endif()
@@ -715,11 +771,20 @@
   if(NOT LINKER_IS_LLD_LINK)
     append("-flto=thin" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
   endif()
-  # On darwin, enable the lto cache. This improves initial build time a little
-  # since we re-link a lot of the same objects, and significantly improves
-  # incremental build time.
-  append_if(APPLE "-Wl,-cache_path_lto,${PROJECT_BINARY_DIR}/lto.cache"
-            CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  # If the linker supports it, enable the lto cache. This improves initial build
+  # time a little since we re-link a lot of the same objects, and significantly
+  # improves incremental build time.
+  # FIXME: We should move all this logic into the clang driver.
+  if(APPLE)
+    append("-Wl,-cache_path_lto,${PROJECT_BINARY_DIR}/lto.cache"
+           CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  elseif(UNIX AND LLVM_USE_LINKER STREQUAL "lld")
+    append("-Wl,--thinlto-cache-dir=${PROJECT_BINARY_DIR}/lto.cache"
+           CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  elseif(LLVM_USE_LINKER STREQUAL "gold")
+    append("-Wl,--plugin-opt,cache-dir=${PROJECT_BINARY_DIR}/lto.cache"
+           CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  endif()
 elseif(uppercase_LLVM_ENABLE_LTO STREQUAL "FULL")
   append("-flto=full" CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
   if(NOT LINKER_IS_LLD_LINK)
@@ -748,7 +813,7 @@
 # Plugin support
 # FIXME: Make this configurable.
 if(WIN32 OR CYGWIN)
-  if(BUILD_SHARED_LIBS)
+  if(BUILD_SHARED_LIBS OR LLVM_BUILD_LLVM_DYLIB)
     set(LLVM_ENABLE_PLUGINS ON)
   else()
     set(LLVM_ENABLE_PLUGINS OFF)
@@ -756,3 +821,16 @@
 else()
   set(LLVM_ENABLE_PLUGINS ON)
 endif()
+
+# Export the top-level directory's COMPILE_DEFINITIONS as -D flags in LLVM_DEFINITIONS.
+function(get_compile_definitions)
+  get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
+  set(result "")    # Reset: a caller's 'result' is visible here and must not leak in.
+  set(separator "") # No separator before the first flag.
+  foreach(definition ${top_dir_definitions})
+    string(APPEND result "${separator}-D${definition}")
+    set(separator " ")
+  endforeach()
+  set(LLVM_DEFINITIONS "${result}" PARENT_SCOPE)
+endfunction()
+get_compile_definitions()
diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index 5233015..2b9ab23 100644
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake
@@ -175,18 +175,15 @@
           message(FATAL_ERROR "Target ${c} is not in the set of libraries.")
         endif()
       endif()
-      if( TARGET LLVM${c}AsmPrinter )
-        list(APPEND expanded_components "LLVM${c}AsmPrinter")
-      endif()
       if( TARGET LLVM${c}AsmParser )
         list(APPEND expanded_components "LLVM${c}AsmParser")
       endif()
+      if( TARGET LLVM${c}AsmPrinter )
+        list(APPEND expanded_components "LLVM${c}AsmPrinter")
+      endif()
       if( TARGET LLVM${c}Desc )
         list(APPEND expanded_components "LLVM${c}Desc")
       endif()
-      if( TARGET LLVM${c}Info )
-        list(APPEND expanded_components "LLVM${c}Info")
-      endif()
       if( TARGET LLVM${c}Disassembler )
         list(APPEND expanded_components "LLVM${c}Disassembler")
       endif()
diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index 7a8eb36..169fc99 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -79,5 +79,12 @@
   @llvm_config_include_buildtree_only_exports@
 endif()
 
+# By creating intrinsics_gen here, subprojects that depend on LLVM's
+# tablegen-generated headers can always depend on this target whether building
+# in-tree with LLVM or not.
+if(NOT TARGET intrinsics_gen)
+  add_custom_target(intrinsics_gen)
+endif()
+
 set_property(GLOBAL PROPERTY LLVM_TARGETS_CONFIGURED On)
 include(${LLVM_CMAKE_DIR}/LLVM-Config.cmake)
diff --git a/cmake/modules/LLVMExternalProjectUtils.cmake b/cmake/modules/LLVMExternalProjectUtils.cmake
index d457389..c356083 100644
--- a/cmake/modules/LLVMExternalProjectUtils.cmake
+++ b/cmake/modules/LLVMExternalProjectUtils.cmake
@@ -149,6 +149,7 @@
                -DPACKAGE_VERSION=${PACKAGE_VERSION}
                -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
                -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
+               -DCMAKE_EXPORT_COMPILE_COMMANDS=1
                ${ARG_CMAKE_ARGS}
                ${PASSTHROUGH_VARIABLES}
     INSTALL_COMMAND ""
@@ -195,8 +196,16 @@
 
   # Add top-level targets
   foreach(target ${ARG_EXTRA_TARGETS})
+    string(REPLACE ":" ";" target_list ${target})
+    list(GET target_list 0 target)
+    list(LENGTH target_list target_list_len)
+    if(${target_list_len} GREATER 1)
+      list(GET target_list 1 target_name)
+    else()
+      set(target_name "${target}")
+    endif()
     llvm_ExternalProject_BuildCmd(build_runtime_cmd ${target} ${BINARY_DIR})
-    add_custom_target(${target}
+    add_custom_target(${target_name}
       COMMAND ${build_runtime_cmd}
       DEPENDS ${name}-configure
       WORKING_DIRECTORY ${BINARY_DIR}
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index ae1921b..3b4838d 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -68,17 +68,29 @@
 
 
 function(llvm_check_source_file_list)
-  set(listed ${ARGN})
-  file(GLOB globbed *.c *.cpp)
+  cmake_parse_arguments(ARG "" "SOURCE_DIR" "" ${ARGN})
+  set(listed ${ARG_UNPARSED_ARGUMENTS})
+  if(ARG_SOURCE_DIR)
+    file(GLOB globbed
+         RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+         "${ARG_SOURCE_DIR}/*.c" "${ARG_SOURCE_DIR}/*.cpp")
+  else()
+    file(GLOB globbed *.c *.cpp)
+  endif()
   foreach(g ${globbed})
     get_filename_component(fn ${g} NAME)
+    if(ARG_SOURCE_DIR)
+      set(entry "${g}")
+    else()
+      set(entry "${fn}")
+    endif()
 
     # Don't reject hidden files. Some editors create backups in the
     # same directory as the file.
     if (NOT "${fn}" MATCHES "^\\.")
-      list(FIND LLVM_OPTIONAL_SOURCES ${fn} idx)
+      list(FIND LLVM_OPTIONAL_SOURCES ${entry} idx)
       if( idx LESS 0 )
-        list(FIND listed ${fn} idx)
+        list(FIND listed ${entry} idx)
         if( idx LESS 0 )
           message(SEND_ERROR "Found unknown source file ${g}
 Please update ${CMAKE_CURRENT_LIST_FILE}\n")
diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index da0858e..7f17f70 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake
@@ -14,8 +14,31 @@
     message(FATAL_ERROR "${project}_TABLEGEN_EXE not set")
   endif()
 
-  file(GLOB local_tds "*.td")
-  file(GLOB_RECURSE global_tds "${LLVM_MAIN_INCLUDE_DIR}/llvm/*.td")
+  # Use depfile instead of globbing arbitrary *.td(s)
+  # DEPFILE is available for Ninja Generator with CMake>=3.7.
+  if(CMAKE_GENERATOR STREQUAL "Ninja" AND NOT CMAKE_VERSION VERSION_LESS 3.7)
+    # Make output path relative to build.ninja, assuming located on
+    # ${CMAKE_BINARY_DIR}.
+    # CMake emits build targets as relative paths but Ninja doesn't identify
+    # absolute path (in *.d) as relative path (in build.ninja)
+    # Note that tblgen is executed on ${CMAKE_BINARY_DIR} as working directory.
+    file(RELATIVE_PATH ofn_rel
+      ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${ofn})
+    set(additional_cmdline
+      -o ${ofn_rel}.tmp
+      -d ${ofn_rel}.d
+      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+      DEPFILE ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.d
+      )
+    set(local_tds)
+    set(global_tds)
+  else()
+    file(GLOB local_tds "*.td")
+    file(GLOB_RECURSE global_tds "${LLVM_MAIN_INCLUDE_DIR}/llvm/*.td")
+    set(additional_cmdline
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp
+      )
+  endif()
 
   if (IS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
     set(LLVM_TARGET_DEFINITIONS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
@@ -30,16 +53,26 @@
     endif()
   endif()
 
+  # We need both _TABLEGEN_TARGET and _TABLEGEN_EXE in the  DEPENDS list
+  # (both the target and the file) to have .inc files rebuilt on
+  # a tablegen change, as cmake does not propagate file-level dependencies
+  # of custom targets. See the following ticket for more information:
+  # https://cmake.org/Bug/view.php?id=15858
+  # The dependency on both, the target and the file, produces the same
+  # dependency twice in the result file when
+  # ("${${project}_TABLEGEN_TARGET}" STREQUAL "${${project}_TABLEGEN_EXE}")
+  # but lets us have smaller and cleaner code here.
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp
     # Generate tablegen output in a temporary file.
     COMMAND ${${project}_TABLEGEN_EXE} ${ARGN} -I ${CMAKE_CURRENT_SOURCE_DIR}
-    ${LLVM_TABLEGEN_FLAGS} 
+    ${LLVM_TABLEGEN_FLAGS}
     ${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
-    -o ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp
+    ${additional_cmdline}
     # The file in LLVM_TARGET_DEFINITIONS may be not in the current
     # directory and local_tds may not contain it, so we must
     # explicitly list it here:
-    DEPENDS ${${project}_TABLEGEN_TARGET} ${local_tds} ${global_tds}
+    DEPENDS ${${project}_TABLEGEN_TARGET} ${${project}_TABLEGEN_EXE}
+      ${local_tds} ${global_tds}
     ${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
     COMMENT "Building ${ofn}..."
     )
@@ -77,7 +110,7 @@
   set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} ${target} PARENT_SCOPE)
 endfunction()
 
-if(LLVM_USE_HOST_TOOLS)
+if(LLVM_USE_HOST_TOOLS AND NOT TARGET NATIVE_LIB_LLVMTABLEGEN)
   llvm_ExternalProject_BuildCmd(tblgen_build_cmd LLVMSupport
     ${LLVM_NATIVE_BUILD}
     CONFIGURATION Release)
@@ -88,13 +121,14 @@
       COMMENT "Building libLLVMTableGen for native TableGen..."
       USES_TERMINAL)
   add_custom_target(NATIVE_LIB_LLVMTABLEGEN DEPENDS LIB_LLVMTABLEGEN)
-endif(LLVM_USE_HOST_TOOLS)
+endif()
 
 macro(add_tablegen target project)
   set(${target}_OLD_LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS})
   set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} TableGen)
 
-  if(NOT XCODE)
+  # CMake-3.9 doesn't let compilation units depend on their dependent libraries.
+  if(NOT (CMAKE_GENERATOR STREQUAL "Ninja" AND NOT CMAKE_VERSION VERSION_LESS 3.9) AND NOT XCODE)
     # FIXME: It leaks to user, callee of add_tablegen.
     set(LLVM_ENABLE_OBJLIB ON)
   endif()
diff --git a/cmake/modules/VersionFromVCS.cmake b/cmake/modules/VersionFromVCS.cmake
index e925409..552fe77 100644
--- a/cmake/modules/VersionFromVCS.cmake
+++ b/cmake/modules/VersionFromVCS.cmake
@@ -25,57 +25,67 @@
         set(LLVM_REPOSITORY ${Project_WC_URL} PARENT_SCOPE)
       endif()
     endif()
-  elseif( EXISTS ${SOURCE_DIR}/.git )
-    set(result "${result}git")
-    # Try to get a ref-id
+  else()
     find_program(git_executable NAMES git git.exe git.cmd)
 
     if( git_executable )
-      if( EXISTS ${SOURCE_DIR}/.git/svn )
-        # Get the repository URL
-        execute_process(COMMAND
-          ${git_executable} svn info
-          WORKING_DIRECTORY ${SOURCE_DIR}
-          TIMEOUT 5
-          RESULT_VARIABLE git_result
-          OUTPUT_VARIABLE git_output)
-        if( git_result EQUAL 0 )
-          string(REGEX MATCH "URL: ([^ \n]*)" svn_url ${git_output})
-          if(svn_url)
-            set(LLVM_REPOSITORY ${CMAKE_MATCH_1} PARENT_SCOPE)
+      # Run from a subdirectory to force git to print an absolute path.
+      execute_process(COMMAND ${git_executable} rev-parse --git-dir
+        WORKING_DIRECTORY ${SOURCE_DIR}/cmake
+        RESULT_VARIABLE git_result
+        OUTPUT_VARIABLE git_dir
+        ERROR_QUIET)
+      if(git_result EQUAL 0)
+        # Try to get a ref-id
+        string(STRIP "${git_dir}" git_dir)
+        set(result "${result}git")
+        if( EXISTS ${git_dir}/svn )
+          # Get the repository URL
+          execute_process(COMMAND
+            ${git_executable} svn info
+            WORKING_DIRECTORY ${SOURCE_DIR}
+            TIMEOUT 5
+            RESULT_VARIABLE git_result
+            OUTPUT_VARIABLE git_output
+            ERROR_QUIET)
+          if( git_result EQUAL 0 )
+            string(REGEX MATCH "URL: ([^ \n]*)" svn_url ${git_output})
+            if(svn_url)
+              set(LLVM_REPOSITORY ${CMAKE_MATCH_1} PARENT_SCOPE)
+            endif()
+          endif()
+
+          # Get the svn revision number for this git commit; use it only if one was actually printed.
+          execute_process(COMMAND ${git_executable} svn find-rev HEAD
+            WORKING_DIRECTORY ${SOURCE_DIR}
+            TIMEOUT 5
+            RESULT_VARIABLE git_result
+            OUTPUT_VARIABLE git_head_svn_rev_number
+            OUTPUT_STRIP_TRAILING_WHITESPACE)
+          if( git_result EQUAL 0 AND NOT git_head_svn_rev_number STREQUAL "" )
+            set(SVN_REVISION ${git_head_svn_rev_number} PARENT_SCOPE)
+            set(git_svn_rev "-svn-${git_head_svn_rev_number}")
+          else()
+            set(git_svn_rev "")
+          endif()
         endif()
 
-        # Get the svn revision number for this git commit if one exists.
-        execute_process(COMMAND ${git_executable} svn find-rev HEAD
+        # Get the git ref id
+        execute_process(COMMAND
+          ${git_executable} rev-parse --short HEAD
           WORKING_DIRECTORY ${SOURCE_DIR}
           TIMEOUT 5
           RESULT_VARIABLE git_result
-          OUTPUT_VARIABLE git_head_svn_rev_number
+          OUTPUT_VARIABLE git_ref_id
           OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if( git_result EQUAL 0 AND git_output)
-          set(SVN_REVISION ${git_head_svn_rev_number} PARENT_SCOPE)
-          set(git_svn_rev "-svn-${git_head_svn_rev_number}")
+
+        if( git_result EQUAL 0 )
+          set(GIT_COMMIT ${git_ref_id} PARENT_SCOPE)
+          set(result "${result}${git_svn_rev}-${git_ref_id}")
         else()
-          set(git_svn_rev "")
+          set(result "${result}${git_svn_rev}")
         endif()
       endif()
-
-      # Get the git ref id
-      execute_process(COMMAND
-        ${git_executable} rev-parse --short HEAD
-        WORKING_DIRECTORY ${SOURCE_DIR}
-        TIMEOUT 5
-        RESULT_VARIABLE git_result
-        OUTPUT_VARIABLE git_ref_id
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-      if( git_result EQUAL 0 )
-        set(GIT_COMMIT ${git_ref_id} PARENT_SCOPE)
-        set(result "${result}${git_svn_rev}-${git_ref_id}")
-      else()
-        set(result "${result}${git_svn_rev}")
-      endif()
     endif()
   endif()
   set(${VERS} ${result} PARENT_SCOPE)
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index 5ff0f20..7062d75 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -1,110 +1,3746 @@
-==============================
-User Guide for AMDGPU Back-end
-==============================
+=============================
+User Guide for AMDGPU Backend
+=============================
+
+.. contents::
+   :local:
 
 Introduction
 ============
 
-The AMDGPU back-end provides ISA code generation for AMD GPUs, starting with
-the R600 family up until the current Volcanic Islands (GCN Gen 3).
+The AMDGPU backend provides ISA code generation for AMD GPUs, starting with the
+R600 family up until the current GCN families. It lives in the
+``lib/Target/AMDGPU`` directory.
 
-Refer to `AMDGPU section in Architecture & Platform Information for Compiler Writers <CompilerWriterInfo.html#amdgpu>`_
-for additional documentation.
+LLVM
+====
 
-Conventions
-===========
+.. _amdgpu-target-triples:
+
+Target Triples
+--------------
+
+Use the ``clang -target <Architecture>-<Vendor>-<OS>-<Environment>`` option to
+specify the target triple:
+
+  .. table:: AMDGPU Target Triples
+     :name: amdgpu-target-triples-table
+
+     ============ ======== ========= ===========
+     Architecture Vendor   OS        Environment
+     ============ ======== ========= ===========
+     r600         amd      <empty>   <empty>
+     amdgcn       amd      <empty>   <empty>
+     amdgcn       amd      amdhsa    <empty>
+     amdgcn       amd      amdhsa    opencl
+     amdgcn       amd      amdhsa    amdgizcl
+     amdgcn       amd      amdhsa    amdgiz
+     amdgcn       amd      amdhsa    hcc
+     ============ ======== ========= ===========
+
+``r600-amd--``
+  Supports AMD GPUs HD2XXX-HD6XXX for graphics and compute shaders executed on
+  the MESA runtime.
+
+``amdgcn-amd--``
+  Supports AMD GCN GPUs GFX6 onwards for graphics and compute shaders executed on
+  the MESA runtime.
+
+``amdgcn-amd-amdhsa-``
+  Supports AMD GCN GPUs GFX6 onwards for compute kernels executed on HSA [HSA]_
+  compatible runtimes such as AMD's ROCm [AMD-ROCm]_.
+
+``amdgcn-amd-amdhsa-opencl``
+  Supports AMD GCN GPUs GFX6 onwards for OpenCL compute kernels executed on HSA
+  [HSA]_ compatible runtimes such as AMD's ROCm [AMD-ROCm]_. See
+  :ref:`amdgpu-opencl`.
+
+``amdgcn-amd-amdhsa-amdgizcl``
+  Same as ``amdgcn-amd-amdhsa-opencl`` except a different address space mapping
+  is used (see :ref:`amdgpu-address-spaces`).
+
+``amdgcn-amd-amdhsa-amdgiz``
+  Same as ``amdgcn-amd-amdhsa-`` except a different address space mapping is
+  used (see :ref:`amdgpu-address-spaces`).
+
+``amdgcn-amd-amdhsa-hcc``
+  Supports AMD GCN GPUs GFX6 onwards for AMD HC language compute kernels
+  executed on HSA [HSA]_ compatible runtimes such as AMD's ROCm [AMD-ROCm]_. See
+  :ref:`amdgpu-hcc`.
+
+.. _amdgpu-processors:
+
+Processors
+----------
+
+Use the ``clang -mcpu <Processor>`` option to specify the AMD GPU processor. The
+names from both the *Processor* and *Alternative Processor* columns can be used.
+
+  .. table:: AMDGPU Processors
+     :name: amdgpu-processors-table
+
+     ========== =========== ============ ===== ======= ==================
+     Processor  Alternative Target       dGPU/ Runtime Example
+                Processor   Triple       APU   Support Products
+                            Architecture
+     ========== =========== ============ ===== ======= ==================
+     **Radeon HD 2000/3000 Series (R600)** [AMD-RADEON-HD-2000-3000]_
+     --------------------------------------------------------------------
+     r600                   r600         dGPU
+     r630                   r600         dGPU
+     rs880                  r600         dGPU
+     rv670                  r600         dGPU
+     **Radeon HD 4000 Series (R700)** [AMD-RADEON-HD-4000]_
+     --------------------------------------------------------------------
+     rv710                  r600         dGPU
+     rv730                  r600         dGPU
+     rv770                  r600         dGPU
+     **Radeon HD 5000 Series (Evergreen)** [AMD-RADEON-HD-5000]_
+     --------------------------------------------------------------------
+     cedar                  r600         dGPU
+     redwood                r600         dGPU
+     sumo                   r600         dGPU
+     juniper                r600         dGPU
+     cypress                r600         dGPU
+     **Radeon HD 6000 Series (Northern Islands)** [AMD-RADEON-HD-6000]_
+     --------------------------------------------------------------------
+     barts                  r600         dGPU
+     turks                  r600         dGPU
+     caicos                 r600         dGPU
+     cayman                 r600         dGPU
+     **GCN GFX6 (Southern Islands (SI))** [AMD-GCN-GFX6]_
+     --------------------------------------------------------------------
+     gfx600     - tahiti    amdgcn       dGPU
+     gfx601     - pitcairn  amdgcn       dGPU
+                - verde
+                - oland
+                - hainan
+     **GCN GFX7 (Sea Islands (CI))** [AMD-GCN-GFX7]_
+     --------------------------------------------------------------------
+     gfx700     - bonaire   amdgcn       dGPU          - Radeon HD 7790
+                                                       - Radeon HD 8770
+                                                       - R7 260
+                                                       - R7 260X
+     \          - kaveri    amdgcn       APU           - A6-7000
+                                                       - A6 Pro-7050B
+                                                       - A8-7100
+                                                       - A8 Pro-7150B
+                                                       - A10-7300
+                                                       - A10 Pro-7350B
+                                                       - FX-7500
+                                                       - A8-7200P
+                                                       - A10-7400P
+                                                       - FX-7600P
+     gfx701     - hawaii    amdgcn       dGPU  ROCm    - FirePro W8100
+                                                       - FirePro W9100
+                                                       - FirePro S9150
+                                                       - FirePro S9170
+     gfx702                 amdgcn       dGPU  ROCm    - Radeon R9 290
+                                                       - Radeon R9 290x
+                                                       - Radeon R390
+                                                       - Radeon R390x
+     gfx703     - kabini    amdgcn       APU           - E1-2100
+                - mullins                              - E1-2200
+                                                       - E1-2500
+                                                       - E2-3000
+                                                       - E2-3800
+                                                       - A4-5000
+                                                       - A4-5100
+                                                       - A6-5200
+                                                       - A4 Pro-3340B
+     **GCN GFX8 (Volcanic Islands (VI))** [AMD-GCN-GFX8]_
+     --------------------------------------------------------------------
+     gfx800     - iceland   amdgcn       dGPU          - FirePro S7150
+                                                       - FirePro S7100
+                                                       - FirePro W7100
+                                                       - Radeon R285
+                                                       - Radeon R9 380
+                                                       - Radeon R9 385
+                                                       - Mobile FirePro
+                                                         M7170
+     gfx801     - carrizo   amdgcn       APU           - A6-8500P
+                                                       - Pro A6-8500B
+                                                       - A8-8600P
+                                                       - Pro A8-8600B
+                                                       - FX-8800P
+                                                       - Pro A12-8800B
+     \                      amdgcn       APU   ROCm    - A10-8700P
+                                                       - Pro A10-8700B
+                                                       - A10-8780P
+     \                      amdgcn       APU           - A10-9600P
+                                                       - A10-9630P
+                                                       - A12-9700P
+                                                       - A12-9730P
+                                                       - FX-9800P
+                                                       - FX-9830P
+     \                      amdgcn       APU           - E2-9010
+                                                       - A6-9210
+                                                       - A9-9410
+     gfx802     - tonga     amdgcn       dGPU  ROCm    Same as gfx800
+     gfx803     - fiji      amdgcn       dGPU  ROCm    - Radeon R9 Nano
+                                                       - Radeon R9 Fury
+                                                       - Radeon R9 FuryX
+                                                       - Radeon Pro Duo
+                                                       - FirePro S9300x2
+                                                       - Radeon Instinct MI8
+     \          - polaris10 amdgcn       dGPU  ROCm    - Radeon RX 470
+                                                       - Radeon RX 480
+                                                       - Radeon Instinct MI6
+     \          - polaris11 amdgcn       dGPU  ROCm    - Radeon RX 460
+     gfx804                 amdgcn       dGPU          Same as gfx803
+     gfx810     - stoney    amdgcn       APU
+     **GCN GFX9** [AMD-GCN-GFX9]_
+     --------------------------------------------------------------------
+     gfx900                 amdgcn       dGPU          - Radeon Vega
+                                                         Frontier Edition
+                                                       - Radeon RX Vega 56
+                                                       - Radeon RX Vega 64
+                                                       - Radeon RX Vega 64
+                                                         Liquid
+                                                       - Radeon Instinct MI25
+     gfx901                 amdgcn       dGPU  ROCm    Same as gfx900
+                                                       except XNACK is
+                                                       enabled
+     gfx902                 amdgcn       APU           *TBA*
+
+                                                       .. TODO
+                                                          Add product
+                                                          names.
+     gfx903                 amdgcn       APU           Same as gfx902
+                                                       except XNACK is
+                                                       enabled
+     ========== =========== ============ ===== ======= ==================
+
+.. _amdgpu-address-spaces:
 
 Address Spaces
 --------------
 
-The AMDGPU back-end uses the following address space mapping:
+The AMDGPU backend uses the following address space mappings.
 
-   ================== =================== ==============
-   LLVM Address Space DWARF Address Space Memory Space
-   ================== =================== ==============
-   0                  1                   Private
-   1                  N/A                 Global
-   2                  N/A                 Constant
-   3                  2                   Local
-   4                  N/A                 Generic (Flat)
-   5                  N/A                 Region
-   ================== =================== ==============
+The memory space names used in the table, aside from the region memory space,
+are from the OpenCL standard.
 
-The terminology in the table, aside from the region memory space, is from the
-OpenCL standard.
+LLVM Address Space number is used throughout LLVM (for example, in LLVM IR).
 
-LLVM Address Space is used throughout LLVM (for example, in LLVM IR). DWARF
-Address Space is emitted in DWARF, and is used by tools, such as debugger,
-profiler and others.
+  .. table:: Address Space Mapping
+     :name: amdgpu-address-space-mapping-table
+
+     ================== ================= ================= ================= =================
+     LLVM Address Space Memory Space
+     ------------------ -----------------------------------------------------------------------
+     \                  Current Default   amdgiz/amdgizcl   hcc               Future Default
+     ================== ================= ================= ================= =================
+     0                  Private (Scratch) Generic (Flat)    Generic (Flat)    Generic (Flat)
+     1                  Global            Global            Global            Global
+     2                  Constant          Constant          Constant          Region (GDS)
+     3                  Local (group/LDS) Local (group/LDS) Local (group/LDS) Local (group/LDS)
+     4                  Generic (Flat)    Region (GDS)      Region (GDS)      Constant
+     5                  Region (GDS)      Private (Scratch) Private (Scratch) Private (Scratch)
+     ================== ================= ================= ================= =================
+
+Current Default
+  This is the current default address space mapping used for all languages
+  except hcc. This will shortly be deprecated.
+
+amdgiz/amdgizcl
+  This is the current address space mapping used when ``amdgiz`` or ``amdgizcl``
+  is specified as the target triple environment value.
+
+hcc
+  This is the current address space mapping used when ``hcc`` is specified as
+  the target triple environment value. This will shortly be deprecated.
+
+Future Default
+  This will shortly be the only address space mapping for all languages using
+  the AMDGPU backend.
+
+.. _amdgpu-memory-scopes:
+
+Memory Scopes
+-------------
+
+This section provides LLVM memory synchronization scopes supported by the AMDGPU
+backend memory model when the target triple OS is ``amdhsa`` (see
+:ref:`amdgpu-amdhsa-memory-model` and :ref:`amdgpu-target-triples`).
+
+The memory model supported is based on the HSA memory model [HSA]_ which is
+based in turn on HRF-indirect with scope inclusion [HRF]_. The happens-before
+relation is transitive over the synchronizes-with relation independent of scope,
+and synchronizes-with allows the memory scope instances to be inclusive (see
+table :ref:`amdgpu-amdhsa-llvm-sync-scopes-amdhsa-table`).
+
+This is different to the OpenCL [OpenCL]_ memory model which does not have scope
+inclusion and requires the memory scopes to exactly match. However, this
+is conservatively correct for OpenCL.
+
+  .. table:: AMDHSA LLVM Sync Scopes for AMDHSA
+     :name: amdgpu-amdhsa-llvm-sync-scopes-amdhsa-table
+
+     ================ ==========================================================
+     LLVM Sync Scope  Description
+     ================ ==========================================================
+     *none*           The default: ``system``.
+
+                      Synchronizes with, and participates in modification and
+                      seq_cst total orderings with, other operations (except
+                      image operations) for all address spaces (except private,
+                      or generic that accesses private) provided the other
+                      operation's sync scope is:
+
+                      - ``system``.
+                      - ``agent`` and executed by a thread on the same agent.
+                      - ``workgroup`` and executed by a thread in the same
+                        workgroup.
+                      - ``wavefront`` and executed by a thread in the same
+                        wavefront.
+
+     ``agent``        Synchronizes with, and participates in modification and
+                      seq_cst total orderings with, other operations (except
+                      image operations) for all address spaces (except private,
+                      or generic that accesses private) provided the other
+                      operation's sync scope is:
+
+                      - ``system`` or ``agent`` and executed by a thread on the
+                        same agent.
+                      - ``workgroup`` and executed by a thread in the same
+                        workgroup.
+                      - ``wavefront`` and executed by a thread in the same
+                        wavefront.
+
+     ``workgroup``    Synchronizes with, and participates in modification and
+                      seq_cst total orderings with, other operations (except
+                      image operations) for all address spaces (except private,
+                      or generic that accesses private) provided the other
+                      operation's sync scope is:
+
+                      - ``system``, ``agent`` or ``workgroup`` and executed by a
+                        thread in the same workgroup.
+                      - ``wavefront`` and executed by a thread in the same
+                        wavefront.
+
+     ``wavefront``    Synchronizes with, and participates in modification and
+                      seq_cst total orderings with, other operations (except
+                      image operations) for all address spaces (except private,
+                      or generic that accesses private) provided the other
+                      operation's sync scope is:
+
+                      - ``system``, ``agent``, ``workgroup`` or ``wavefront``
+                        and executed by a thread in the same wavefront.
+
+     ``singlethread`` Only synchronizes with, and participates in modification
+                      and seq_cst total orderings with, other operations (except
+                      image operations) running in the same thread for all
+                      address spaces (for example, in signal handlers).
+     ================ ==========================================================
+
+AMDGPU Intrinsics
+-----------------
+
+The AMDGPU backend implements the following intrinsics.
+
+*This section is WIP.*
+
+.. TODO
+   List AMDGPU intrinsics
+
+Code Object
+===========
+
+The AMDGPU backend generates a standard ELF [ELF]_ relocatable code object that
+can be linked by ``lld`` to produce a standard ELF shared code object which can
+be loaded and executed on an AMDGPU target.
+
+Header
+------
+
+The AMDGPU backend uses the following ELF header:
+
+  .. table:: AMDGPU ELF Header
+     :name: amdgpu-elf-header-table
+
+     ========================== ===============================
+     Field                      Value
+     ========================== ===============================
+     ``e_ident[EI_CLASS]``      ``ELFCLASS64``
+     ``e_ident[EI_DATA]``       ``ELFDATA2LSB``
+     ``e_ident[EI_OSABI]``      ``ELFOSABI_AMDGPU_HSA``,
+                                ``ELFOSABI_AMDGPU_PAL`` or
+                                ``ELFOSABI_AMDGPU_MESA3D``
+     ``e_ident[EI_ABIVERSION]`` ``ELFABIVERSION_AMDGPU_HSA``,
+                                ``ELFABIVERSION_AMDGPU_PAL`` or
+                                ``ELFABIVERSION_AMDGPU_MESA3D``
+     ``e_type``                 ``ET_REL`` or ``ET_DYN``
+     ``e_machine``              ``EM_AMDGPU``
+     ``e_entry``                0
+     ``e_flags``                0
+     ========================== ===============================
+
+..
+
+  .. table:: AMDGPU ELF Header Enumeration Values
+     :name: amdgpu-elf-header-enumeration-values-table
+
+     =============================== =====
+     Name                            Value
+     =============================== =====
+     ``EM_AMDGPU``                   224
+     ``ELFOSABI_AMDGPU_HSA``         64
+     ``ELFOSABI_AMDGPU_PAL``         65
+     ``ELFOSABI_AMDGPU_MESA3D``      66
+     ``ELFABIVERSION_AMDGPU_HSA``    1
+     ``ELFABIVERSION_AMDGPU_PAL``    0
+     ``ELFABIVERSION_AMDGPU_MESA3D`` 0
+     =============================== =====
+
+``e_ident[EI_CLASS]``
+  The ELF class is always ``ELFCLASS64``. The AMDGPU backend only supports 64
+  bit applications.
+
+``e_ident[EI_DATA]``
+  All AMDGPU targets use ``ELFDATA2LSB`` for little-endian byte ordering.
+
+``e_ident[EI_OSABI]``
+  One of the following AMD GPU architecture specific OS ABIs:
+
+  * ``ELFOSABI_AMDGPU_HSA`` is used to specify that the code object conforms to
+    the AMD HSA runtime ABI [HSA]_.
+
+  * ``ELFOSABI_AMDGPU_PAL`` is used to specify that the code object conforms to
+    the AMD PAL runtime ABI.
+
+  * ``ELFOSABI_AMDGPU_MESA3D`` is used to specify that the code object conforms
+    to the AMD MESA runtime ABI.
+
+``e_ident[EI_ABIVERSION]``
+  The ABI version of the AMD GPU architecture specific OS ABI to which the code
+  object conforms:
+
+  * ``ELFABIVERSION_AMDGPU_HSA`` is used to specify the version of AMD HSA
+    runtime ABI.
+
+  * ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL
+    runtime ABI.
+
+  * ``ELFABIVERSION_AMDGPU_MESA3D`` is used to specify the version of AMD MESA
+    runtime ABI.
+
+``e_type``
+  Can be one of the following values:
+
+
+  ``ET_REL``
+    The type produced by the AMD GPU backend compiler as it is a relocatable
+    code object.
+
+  ``ET_DYN``
+    The type produced by the linker as it is a shared code object.
+
+  The AMD HSA runtime loader requires an ``ET_DYN`` code object.
+
+``e_machine``
+  The value ``EM_AMDGPU`` is used for the machine for all members of the AMD GPU
+  architecture family. The specific member is specified in the
+  ``NT_AMD_AMDGPU_ISA`` entry in the ``.note`` section (see
+  :ref:`amdgpu-note-records`).
+
+``e_entry``
+  The entry point is 0 as the entry points for individual kernels must be
+  selected in order to invoke them through AQL packets.
+
+``e_flags``
+  The value is 0 as no flags are used.
+
+Sections
+--------
+
+An AMDGPU target ELF code object has the standard ELF sections which include:
+
+  .. table:: AMDGPU ELF Sections
+     :name: amdgpu-elf-sections-table
+
+     ================== ================ =================================
+     Name               Type             Attributes
+     ================== ================ =================================
+     ``.bss``           ``SHT_NOBITS``   ``SHF_ALLOC`` + ``SHF_WRITE``
+     ``.data``          ``SHT_PROGBITS`` ``SHF_ALLOC`` + ``SHF_WRITE``
+     ``.debug_``\ *\**  ``SHT_PROGBITS`` *none*
+     ``.dynamic``       ``SHT_DYNAMIC``  ``SHF_ALLOC``
+     ``.dynstr``        ``SHT_PROGBITS`` ``SHF_ALLOC``
+     ``.dynsym``        ``SHT_PROGBITS`` ``SHF_ALLOC``
+     ``.got``           ``SHT_PROGBITS`` ``SHF_ALLOC`` + ``SHF_WRITE``
+     ``.hash``          ``SHT_HASH``     ``SHF_ALLOC``
+     ``.note``          ``SHT_NOTE``     *none*
+     ``.rela``\ *name*  ``SHT_RELA``     *none*
+     ``.rela.dyn``      ``SHT_RELA``     *none*
+     ``.rodata``        ``SHT_PROGBITS`` ``SHF_ALLOC``
+     ``.shstrtab``      ``SHT_STRTAB``   *none*
+     ``.strtab``        ``SHT_STRTAB``   *none*
+     ``.symtab``        ``SHT_SYMTAB``   *none*
+     ``.text``          ``SHT_PROGBITS`` ``SHF_ALLOC`` + ``SHF_EXECINSTR``
+     ================== ================ =================================
+
+These sections have their standard meanings (see [ELF]_) and are only generated
+if needed.
+
+``.debug``\ *\**
+  The standard DWARF sections. See :ref:`amdgpu-dwarf` for information on the
+  DWARF produced by the AMDGPU backend.
+
+``.dynamic``, ``.dynstr``, ``.dynsym``, ``.hash``
+  The standard sections used by a dynamic loader.
+
+``.note``
+  See :ref:`amdgpu-note-records` for the note records supported by the AMDGPU
+  backend.
+
+``.rela``\ *name*, ``.rela.dyn``
+  For relocatable code objects, *name* is the name of the section to which the
+  relocation records apply. For example, ``.rela.text`` is the section name for
+  relocation records associated with the ``.text`` section.
+
+  For linked shared code objects, ``.rela.dyn`` contains all the relocation
+  records from each of the relocatable code object's ``.rela``\ *name* sections.
+
+  See :ref:`amdgpu-relocation-records` for the relocation records supported by
+  the AMDGPU backend.
+
+``.text``
+  The executable machine code for the kernels and functions they call. Generated
+  as position independent code. See :ref:`amdgpu-code-conventions` for
+  information on conventions used in the ISA generation.
+
+.. _amdgpu-note-records:
+
+Note Records
+------------
+
+As required by ``ELFCLASS64``, minimal zero byte padding must be generated after
+the ``name`` field to ensure the ``desc`` field is 4 byte aligned. In addition,
+minimal zero byte padding must be generated to ensure the ``desc`` field size is
+a multiple of 4 bytes. The ``sh_addralign`` field of the ``.note`` section must
+be at least 4 to indicate at least 4 byte alignment.
+
+The AMDGPU backend code object uses the following ELF note records in the
+``.note`` section. The *Description* column specifies the layout of the note
+record's ``desc`` field. All fields are consecutive bytes. Note records with
+variable size strings have a corresponding ``*_size`` field that specifies the
+number of bytes, including the terminating null character, in the string. The
+string(s) come immediately after the preceding fields.
+
+Additional note records can be present.
+
+  .. table:: AMDGPU ELF Note Records
+     :name: amdgpu-elf-note-records-table
+
+     ===== ============================== ======================================
+     Name  Type                           Description
+     ===== ============================== ======================================
+     "AMD" ``NT_AMD_AMDGPU_HSA_METADATA`` <metadata null terminated string>
+     "AMD" ``NT_AMD_AMDGPU_ISA``          <isa name null terminated string>
+     ===== ============================== ======================================
+
+..
+
+  .. table:: AMDGPU ELF Note Record Enumeration Values
+     :name: amdgpu-elf-note-record-enumeration-values-table
+
+     ============================== =====
+     Name                           Value
+     ============================== =====
+     *reserved*                       0-9
+     ``NT_AMD_AMDGPU_HSA_METADATA``    10
+     ``NT_AMD_AMDGPU_ISA``             11
+     ============================== =====
+
+``NT_AMD_AMDGPU_ISA``
+  Specifies the instruction set architecture used by the machine code contained
+  in the code object.
+
+  This note record is required for code objects containing machine code for
+  processors matching the ``amdgcn`` architecture in table
+  :ref:`amdgpu-processors-table`.
+
+  The null terminated string has the following syntax:
+
+    *architecture*\ ``-``\ *vendor*\ ``-``\ *os*\ ``-``\ *environment*\ ``-``\ *processor*
+
+  where:
+
+    *architecture*
+      The architecture from table :ref:`amdgpu-target-triples-table`.
+
+      This is always ``amdgcn`` when the target triple OS is ``amdhsa`` (see
+      :ref:`amdgpu-target-triples`).
+
+    *vendor*
+      The vendor from table :ref:`amdgpu-target-triples-table`.
+
+      For the AMDGPU backend this is always ``amd``.
+
+    *os*
+      The OS from table :ref:`amdgpu-target-triples-table`.
+
+    *environment*
+      An environment from table :ref:`amdgpu-target-triples-table`, or blank if
+      the environment has no effect on the execution of the code object.
+
+      For the AMDGPU backend this is currently always blank.
+    *processor*
+      The processor from table :ref:`amdgpu-processors-table`.
+
+  For example:
+
+    ``amdgcn-amd-amdhsa--gfx901``
+
+``NT_AMD_AMDGPU_HSA_METADATA``
+  Specifies extensible metadata associated with the code objects executed on HSA
+  [HSA]_ compatible runtimes such as AMD's ROCm [AMD-ROCm]_. It is required when
+  the target triple OS is ``amdhsa`` (see :ref:`amdgpu-target-triples`). See
+  :ref:`amdgpu-amdhsa-hsa-code-object-metadata` for the syntax of the code
+  object metadata string.
+
+.. _amdgpu-symbols:
+
+Symbols
+-------
+
+Symbols include the following:
+
+  .. table:: AMDGPU ELF Symbols
+     :name: amdgpu-elf-symbols-table
+
+     ===================== ============== ============= ==================
+     Name                  Type           Section       Description
+     ===================== ============== ============= ==================
+     *link-name*           ``STT_OBJECT`` - ``.data``   Global variable
+                                          - ``.rodata``
+                                          - ``.bss``
+     *link-name*\ ``@kd``  ``STT_OBJECT`` - ``.rodata`` Kernel descriptor
+     *link-name*           ``STT_FUNC``   - ``.text``   Kernel entry point
+     ===================== ============== ============= ==================
+
+Global variable
+  Global variables both used and defined by the compilation unit.
+
+  If the symbol is defined in the compilation unit then it is allocated in the
+  appropriate section according to whether it has initialized data or is readonly.
+
+  If the symbol is external then its section is ``STN_UNDEF`` and the loader
+  will resolve relocations using the definition provided by another code object
+  or explicitly defined by the runtime.
+
+  All global symbols, whether defined in the compilation unit or external, are
+  accessed by the machine code indirectly through a GOT table entry. This
+  allows them to be preemptable. The GOT table is only supported when the target
+  triple OS is ``amdhsa`` (see :ref:`amdgpu-target-triples`).
+
+  .. TODO
+     Add description of linked shared object symbols. Seems undefined symbols
+     are marked as STT_NOTYPE.
+
+Kernel descriptor
+  Every HSA kernel has an associated kernel descriptor. It is the address of the
+  kernel descriptor that is used in the AQL dispatch packet used to invoke the
+  kernel, not the kernel entry point. The layout of the HSA kernel descriptor is
+  defined in :ref:`amdgpu-amdhsa-kernel-descriptor`.
+
+Kernel entry point
+  Every HSA kernel also has a symbol for its machine code entry point.
+
+.. _amdgpu-relocation-records:
+
+Relocation Records
+------------------
+
+The AMDGPU backend generates ``Elf64_Rela`` relocation records. Supported
+relocatable fields are:
+
+``word32``
+  This specifies a 32-bit field occupying 4 bytes with arbitrary byte
+  alignment. These values use the same byte order as other word values in the
+  AMD GPU architecture.
+
+``word64``
+  This specifies a 64-bit field occupying 8 bytes with arbitrary byte
+  alignment. These values use the same byte order as other word values in the
+  AMD GPU architecture.
+
+The following notations are used for specifying relocation calculations:
+
+**A**
+  Represents the addend used to compute the value of the relocatable field.
+
+**G**
+  Represents the offset into the global offset table at which the relocation
+  entry's symbol will reside during execution.
+
+**GOT**
+  Represents the address of the global offset table.
+
+**P**
+  Represents the place (section offset for ``ET_REL`` or address for ``ET_DYN``)
+  of the storage unit being relocated (computed using ``r_offset``).
+
+**S**
+  Represents the value of the symbol whose index resides in the relocation
+  entry. Relocations not using this must specify a symbol index of ``STN_UNDEF``.
+
+**B**
+  Represents the base address of a loaded executable or shared object which is
+  the difference between the ELF address and the actual load address. Relocations
+  using this are only valid in executable or shared objects.
+
+The following relocation types are supported:
+
+  .. table:: AMDGPU ELF Relocation Records
+     :name: amdgpu-elf-relocation-records-table
+
+     ==========================  =====  ==========  ==============================
+     Relocation Type             Value  Field       Calculation
+     ==========================  =====  ==========  ==============================
+     ``R_AMDGPU_NONE``           0      *none*      *none*
+     ``R_AMDGPU_ABS32_LO``       1      ``word32``  (S + A) & 0xFFFFFFFF
+     ``R_AMDGPU_ABS32_HI``       2      ``word32``  (S + A) >> 32
+     ``R_AMDGPU_ABS64``          3      ``word64``  S + A
+     ``R_AMDGPU_REL32``          4      ``word32``  S + A - P
+     ``R_AMDGPU_REL64``          5      ``word64``  S + A - P
+     ``R_AMDGPU_ABS32``          6      ``word32``  S + A
+     ``R_AMDGPU_GOTPCREL``       7      ``word32``  G + GOT + A - P
+     ``R_AMDGPU_GOTPCREL32_LO``  8      ``word32``  (G + GOT + A - P) & 0xFFFFFFFF
+     ``R_AMDGPU_GOTPCREL32_HI``  9      ``word32``  (G + GOT + A - P) >> 32
+     ``R_AMDGPU_REL32_LO``       10     ``word32``  (S + A - P) & 0xFFFFFFFF
+     ``R_AMDGPU_REL32_HI``       11     ``word32``  (S + A - P) >> 32
+     *reserved*                  12
+     ``R_AMDGPU_RELATIVE64``     13     ``word64``  B + A
+     ==========================  =====  ==========  ==============================
+
+.. _amdgpu-dwarf:
+
+DWARF
+-----
+
+Standard DWARF [DWARF]_ Version 2 sections can be generated. These contain
+information that maps the code object executable code and data to the source
+language constructs. It can be used by tools such as debuggers and profilers.
+
+Address Space Mapping
+~~~~~~~~~~~~~~~~~~~~~
+
+The following address space mapping is used:
+
+  .. table:: AMDGPU DWARF Address Space Mapping
+     :name: amdgpu-dwarf-address-space-mapping-table
+
+     =================== =================
+     DWARF Address Space Memory Space
+     =================== =================
+     1                   Private (Scratch)
+     2                   Local (group/LDS)
+     *omitted*           Global
+     *omitted*           Constant
+     *omitted*           Generic (Flat)
+     *not supported*     Region (GDS)
+     =================== =================
+
+See :ref:`amdgpu-address-spaces` for information on the memory space terminology
+used in the table.
+
+An ``address_class`` attribute is generated on pointer type DIEs to specify the
+DWARF address space of the value of the pointer when it is in the *private* or
+*local* address space. Otherwise the attribute is omitted.
+
+An ``XDEREF`` operation is generated in location list expressions for variables
+that are allocated in the *private* or *local* address space. Otherwise no
+``XDEREF`` operation is generated.
+
+Register Mapping
+~~~~~~~~~~~~~~~~
+
+*This section is WIP.*
+
+.. TODO
+   Define DWARF register enumeration.
+
+   If want to present a wavefront state then should expose vector registers as
+   64 wide (rather than per work-item view that LLVM uses). Either as separate
+   registers, or a 64x4 byte single register. In either case use a new LANE op
+   (akin to XDEREF) to select the current lane usage in a location
+   expression. This would also allow scalar register spilling to vector register
+   lanes to be expressed (currently no debug information is being generated for
+   spilling). If choose a wide single register approach then use LANE in
+   conjunction with PIECE operation to select the dword part of the register for
+   the current lane. If the separate register approach then use LANE to select
+   the register.
+
+Source Text
+~~~~~~~~~~~
+
+*This section is WIP.*
+
+.. TODO
+   DWARF extension to include runtime generated source text.
+
+.. _amdgpu-code-conventions:
+
+Code Conventions
+================
+
+This section provides code conventions used for each supported target triple OS
+(see :ref:`amdgpu-target-triples`).
+
+AMDHSA
+------
+
+This section provides code conventions used when the target triple OS is
+``amdhsa`` (see :ref:`amdgpu-target-triples`).
+
+.. _amdgpu-amdhsa-hsa-code-object-metadata:
+
+Code Object Metadata
+~~~~~~~~~~~~~~~~~~~~
+
+The code object metadata specifies extensible metadata associated with the code
+objects executed on HSA [HSA]_ compatible runtimes such as AMD's ROCm
+[AMD-ROCm]_. It is specified by the ``NT_AMD_AMDGPU_HSA_METADATA`` note record
+(see :ref:`amdgpu-note-records`) and is required when the target triple OS is
+``amdhsa`` (see :ref:`amdgpu-target-triples`). It must contain the minimum
+information necessary to support the ROCm kernel queries. For example, the
+segment sizes needed in a dispatch packet. In addition, a high level language
+runtime may require other information to be included. For example, the AMD
+OpenCL runtime records kernel argument information.
+
+The metadata is specified as a YAML formatted string (see [YAML]_ and
+:doc:`YamlIO`).
+
+.. TODO
+   Is the string null terminated? It probably should not be if YAML allows it
+   to contain null characters, otherwise it should be.
+
+The metadata is represented as a single YAML document comprised of the mapping
+defined in table :ref:`amdgpu-amdhsa-code-object-metadata-mapping-table` and
+referenced tables.
+
+For boolean values, the string values of ``false`` and ``true`` are used for
+false and true respectively.
+
+Additional information can be added to the mappings. To avoid conflicts, any
+non-AMD key names should be prefixed by "*vendor-name*.".
+
+  .. table:: AMDHSA Code Object Metadata Mapping
+     :name: amdgpu-amdhsa-code-object-metadata-mapping-table
+
+     ========== ============== ========= =======================================
+     String Key Value Type     Required? Description
+     ========== ============== ========= =======================================
+     "Version"  sequence of    Required  - The first integer is the major
+                2 integers                 version. Currently 1.
+                                         - The second integer is the minor
+                                           version. Currently 0.
+     "Printf"   sequence of              Each string is encoded information
+                strings                  about a printf function call. The
+                                         encoded information is organized as
+                                         fields separated by colon (':'):
+
+                                         ``ID:N:S[0]:S[1]:...:S[N-1]:FormatString``
+
+                                         where:
+
+                                         ``ID``
+                                           A 32 bit integer as a unique id for
+                                           each printf function call
+
+                                         ``N``
+                                           A 32 bit integer equal to the number
+                                           of arguments of printf function call
+                                           minus 1
+
+                                         ``S[i]`` (where i = 0, 1, ... , N-1)
+                                           32 bit integers for the size in bytes
+                                           of the i-th FormatString argument of
+                                           the printf function call
+
+                                         FormatString
+                                           The format string passed to the
+                                           printf function call.
+     "Kernels"  sequence of    Required  Sequence of the mappings for each
+                mapping                  kernel in the code object. See
+                                         :ref:`amdgpu-amdhsa-code-object-kernel-metadata-mapping-table`
+                                         for the definition of the mapping.
+     ========== ============== ========= =======================================
+
+..
+
+  .. table:: AMDHSA Code Object Kernel Metadata Mapping
+     :name: amdgpu-amdhsa-code-object-kernel-metadata-mapping-table
+
+     ================= ============== ========= ================================
+     String Key        Value Type     Required? Description
+     ================= ============== ========= ================================
+     "Name"            string         Required  Source name of the kernel.
+     "SymbolName"      string         Required  Name of the kernel
+                                                descriptor ELF symbol.
+     "Language"        string                   Source language of the kernel.
+                                                Values include:
+
+                                                - "OpenCL C"
+                                                - "OpenCL C++"
+                                                - "HCC"
+                                                - "OpenMP"
+
+     "LanguageVersion" sequence of              - The first integer is the major
+                       2 integers                 version.
+                                                - The second integer is the
+                                                  minor version.
+     "Attrs"           mapping                  Mapping of kernel attributes.
+                                                See
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-attribute-metadata-mapping-table`
+                                                for the mapping definition.
+     "Args"            sequence of              Sequence of mappings of the
+                       mapping                  kernel arguments. See
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-mapping-table`
+                                                for the definition of the mapping.
+     "CodeProps"       mapping                  Mapping of properties related to
+                                                the kernel code. See
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-code-properties-metadata-mapping-table`
+                                                for the mapping definition.
+     "DebugProps"      mapping                  Mapping of properties related to
+                                                the kernel debugging. See
+                                                :ref:`amdgpu-amdhsa-code-object-kernel-debug-properties-metadata-mapping-table`
+                                                for the mapping definition.
+     ================= ============== ========= ================================
+
+..
+
+  .. table:: AMDHSA Code Object Kernel Attribute Metadata Mapping
+     :name: amdgpu-amdhsa-code-object-kernel-attribute-metadata-mapping-table
+
+     =================== ============== ========= ==============================
+     String Key          Value Type     Required? Description
+     =================== ============== ========= ==============================
+     "ReqdWorkGroupSize" sequence of              The dispatch work-group size
+                         3 integers               X, Y, Z must correspond to the
+                                                  specified values.
+
+                                                  Corresponds to the OpenCL
+                                                  ``reqd_work_group_size``
+                                                  attribute.
+     "WorkGroupSizeHint" sequence of              The dispatch work-group size
+                         3 integers               X, Y, Z is likely to be the
+                                                  specified values.
+
+                                                  Corresponds to the OpenCL
+                                                  ``work_group_size_hint``
+                                                  attribute.
+     "VecTypeHint"       string                   The name of a scalar or vector
+                                                  type.
+
+                                                  Corresponds to the OpenCL
+                                                  ``vec_type_hint`` attribute.
+
+     "RuntimeHandle"     string                   The external symbol name
+                                                  associated with a kernel.
+                                                  OpenCL runtime allocates a
+                                                  global buffer for the symbol
+                                                  and saves the kernel's address
+                                                  to it, which is used for
+                                                  device side enqueueing. Only
+                                                  available for device side
+                                                  enqueued kernels.
+     =================== ============== ========= ==============================
+
+..
+
+  .. table:: AMDHSA Code Object Kernel Argument Metadata Mapping
+     :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-mapping-table
+
+     ================= ============== ========= ================================
+     String Key        Value Type     Required? Description
+     ================= ============== ========= ================================
+     "Name"            string                   Kernel argument name.
+     "TypeName"        string                   Kernel argument type name.
+     "Size"            integer        Required  Kernel argument size in bytes.
+     "Align"           integer        Required  Kernel argument alignment in
+                                                bytes. Must be a power of two.
+     "ValueKind"       string         Required  Kernel argument kind that
+                                                specifies how to set up the
+                                                corresponding argument.
+                                                Values include:
+
+                                                "ByValue"
+                                                  The argument is copied
+                                                  directly into the kernarg.
+
+                                                "GlobalBuffer"
+                                                  A global address space pointer
+                                                  to the buffer data is passed
+                                                  in the kernarg.
+
+                                                "DynamicSharedPointer"
+                                                  A group address space pointer
+                                                  to dynamically allocated LDS
+                                                  is passed in the kernarg.
+
+                                                "Sampler"
+                                                  A global address space
+                                                  pointer to a S# is passed in
+                                                  the kernarg.
+
+                                                "Image"
+                                                  A global address space
+                                                  pointer to a T# is passed in
+                                                  the kernarg.
+
+                                                "Pipe"
+                                                  A global address space pointer
+                                                  to an OpenCL pipe is passed in
+                                                  the kernarg.
+
+                                                "Queue"
+                                                  A global address space pointer
+                                                  to an OpenCL device enqueue
+                                                  queue is passed in the
+                                                  kernarg.
+
+                                                "HiddenGlobalOffsetX"
+                                                  The OpenCL grid dispatch
+                                                  global offset for the X
+                                                  dimension is passed in the
+                                                  kernarg.
+
+                                                "HiddenGlobalOffsetY"
+                                                  The OpenCL grid dispatch
+                                                  global offset for the Y
+                                                  dimension is passed in the
+                                                  kernarg.
+
+                                                "HiddenGlobalOffsetZ"
+                                                  The OpenCL grid dispatch
+                                                  global offset for the Z
+                                                  dimension is passed in the
+                                                  kernarg.
+
+                                                "HiddenNone"
+                                                  An argument that is not used
+                                                  by the kernel. Space needs to
+                                                  be left for it, but it does
+                                                  not need to be set up.
+
+                                                "HiddenPrintfBuffer"
+                                                  A global address space pointer
+                                                  to the runtime printf buffer
+                                                  is passed in kernarg.
+
+                                                "HiddenDefaultQueue"
+                                                  A global address space pointer
+                                                  to the OpenCL device enqueue
+                                                  queue that should be used by
+                                                  the kernel by default is
+                                                  passed in the kernarg.
+
+                                                "HiddenCompletionAction"
+                                                  A global address space pointer
+                                                  to help link enqueued kernels into
+                                                  the ancestor tree for determining
+                                                  when the parent kernel has finished.
+
+     "ValueType"       string         Required  Kernel argument value type. Only
+                                                present if "ValueKind" is
+                                                "ByValue". For vector data
+                                                types, the value is for the
+                                                element type. Values include:
+
+                                                - "Struct"
+                                                - "I8"
+                                                - "U8"
+                                                - "I16"
+                                                - "U16"
+                                                - "F16"
+                                                - "I32"
+                                                - "U32"
+                                                - "F32"
+                                                - "I64"
+                                                - "U64"
+                                                - "F64"
+
+                                                .. TODO
+                                                   How can it be determined if a
+                                                   vector type, and what size
+                                                   vector?
+     "PointeeAlign"    integer                  Alignment in bytes of pointee
+                                                type for pointer type kernel
+                                                argument. Must be a power
+                                                of 2. Only present if
+                                                "ValueKind" is
+                                                "DynamicSharedPointer".
+     "AddrSpaceQual"   string                   Kernel argument address space
+                                                qualifier. Only present if
+                                                "ValueKind" is "GlobalBuffer" or
+                                                "DynamicSharedPointer". Values
+                                                are:
+
+                                                - "Private"
+                                                - "Global"
+                                                - "Constant"
+                                                - "Local"
+                                                - "Generic"
+                                                - "Region"
+
+                                                .. TODO
+                                                   Is GlobalBuffer only Global
+                                                   or Constant? Is
+                                                   DynamicSharedPointer always
+                                                   Local? Can HCC allow Generic?
+                                                   How can Private or Region
+                                                   ever happen?
+     "AccQual"         string                   Kernel argument access
+                                                qualifier. Only present if
+                                                "ValueKind" is "Image" or
+                                                "Pipe". Values
+                                                are:
+
+                                                - "ReadOnly"
+                                                - "WriteOnly"
+                                                - "ReadWrite"
+
+                                                .. TODO
+                                                   Does this apply to
+                                                   GlobalBuffer?
+     "ActualAccQual"   string                   The actual memory accesses
+                                                performed by the kernel on the
+                                                kernel argument. Only present if
+                                                "ValueKind" is "GlobalBuffer",
+                                                "Image", or "Pipe". This may be
+                                                more restrictive than indicated
+                                                by "AccQual" to reflect what the
+                                                kernel actually does. If not
+                                                present then the runtime must
+                                                assume what is implied by
+                                                "AccQual" and "IsConst". Values
+                                                are:
+
+                                                - "ReadOnly"
+                                                - "WriteOnly"
+                                                - "ReadWrite"
+
+     "IsConst"         boolean                  Indicates if the kernel argument
+                                                is const qualified. Only present
+                                                if "ValueKind" is
+                                                "GlobalBuffer".
+
+     "IsRestrict"      boolean                  Indicates if the kernel argument
+                                                is restrict qualified. Only
+                                                present if "ValueKind" is
+                                                "GlobalBuffer".
+
+     "IsVolatile"      boolean                  Indicates if the kernel argument
+                                                is volatile qualified. Only
+                                                present if "ValueKind" is
+                                                "GlobalBuffer".
+
+     "IsPipe"          boolean                  Indicates if the kernel argument
+                                                is pipe qualified. Only present
+                                                if "ValueKind" is "Pipe".
+
+                                                .. TODO
+                                                   Can GlobalBuffer be pipe
+                                                   qualified?
+     ================= ============== ========= ================================
+
+..
+
+  .. table:: AMDHSA Code Object Kernel Code Properties Metadata Mapping
+     :name: amdgpu-amdhsa-code-object-kernel-code-properties-metadata-mapping-table
+
+     ============================ ============== ========= =====================
+     String Key                   Value Type     Required? Description
+     ============================ ============== ========= =====================
+     "KernargSegmentSize"         integer        Required  The size in bytes of
+                                                           the kernarg segment
+                                                           that holds the values
+                                                           of the arguments to
+                                                           the kernel.
+     "GroupSegmentFixedSize"      integer        Required  The amount of group
+                                                           segment memory
+                                                           required by a
+                                                           work-group in
+                                                           bytes. This does not
+                                                           include any
+                                                           dynamically allocated
+                                                           group segment memory
+                                                           that may be added
+                                                           when the kernel is
+                                                           dispatched.
+     "PrivateSegmentFixedSize"    integer        Required  The amount of fixed
+                                                           private address space
+                                                           memory required for a
+                                                           work-item in
+                                                           bytes. If
+                                                           IsDynamicCallStack
+                                                           is 1 then additional
+                                                           space must be added
+                                                           to this value for the
+                                                           call stack.
+     "KernargSegmentAlign"        integer        Required  The maximum byte
+                                                           alignment of
+                                                           arguments in the
+                                                           kernarg segment. Must
+                                                           be a power of 2.
+     "WavefrontSize"              integer        Required  Wavefront size. Must
+                                                           be a power of 2.
+     "NumSGPRs"                   integer                  Number of scalar
+                                                           registers used by a
+                                                           wavefront for
+                                                           GFX6-GFX9. This
+                                                           includes the special
+                                                           SGPRs for VCC, Flat
+                                                           Scratch (GFX7-GFX9)
+                                                           and XNACK (for
+                                                           GFX8-GFX9). It does
+                                                           not include the 16
+                                                           SGPR added if a trap
+                                                           handler is
+                                                           enabled. It is not
+                                                           rounded up to the
+                                                           allocation
+                                                           granularity.
+     "NumVGPRs"                   integer                  Number of vector
+                                                           registers used by
+                                                           each work-item for
+                                                           GFX6-GFX9.
+     "MaxFlatWorkGroupSize"       integer                  Maximum flat
+                                                           work-group size
+                                                           supported by the
+                                                           kernel in work-items.
+     "IsDynamicCallStack"         boolean                  Indicates if the
+                                                           generated machine
+                                                           code is using a
+                                                           dynamically sized
+                                                           call stack.
+     "IsXNACKEnabled"             boolean                  Indicates if the
+                                                           generated machine
+                                                           code is capable of
+                                                           supporting XNACK.
+     ============================ ============== ========= =====================
+
+..
+
+  .. table:: AMDHSA Code Object Kernel Debug Properties Metadata Mapping
+     :name: amdgpu-amdhsa-code-object-kernel-debug-properties-metadata-mapping-table
+
+     =================================== ============== ========= ==============
+     String Key                          Value Type     Required? Description
+     =================================== ============== ========= ==============
+     "DebuggerABIVersion"                sequence of
+                                         2 integers
+     "ReservedNumVGPRs"                  integer
+     "ReservedFirstVGPR"                 integer
+     "PrivateSegmentBufferSGPR"          integer
+     "WavefrontPrivateSegmentOffsetSGPR" integer
+     =================================== ============== ========= ==============
+
+.. TODO
+   Plan to remove the debug properties metadata.
+
+Kernel Dispatch
+~~~~~~~~~~~~~~~
+
+The HSA architected queuing language (AQL) defines a user space memory interface
+that can be used to control the dispatch of kernels, in an agent independent
+way. An agent can have zero or more AQL queues created for it using the ROCm
+runtime, in which AQL packets (all of which are 64 bytes) can be placed. See the
+*HSA Platform System Architecture Specification* [HSA]_ for the AQL queue
+mechanics and packet layouts.
+
+The packet processor of a kernel agent is responsible for detecting and
+dispatching HSA kernels from the AQL queues associated with it. For AMD GPUs the
+packet processor is implemented by the hardware command processor (CP),
+asynchronous dispatch controller (ADC) and shader processor input controller
+(SPI).
+
+The ROCm runtime can be used to allocate an AQL queue object. It uses the kernel
+mode driver to initialize and register the AQL queue with CP.
+
+To dispatch a kernel the following actions are performed. This can occur in the
+CPU host program, or from an HSA kernel executing on a GPU.
+
+1. A pointer to an AQL queue for the kernel agent on which the kernel is to be
+   executed is obtained.
+2. A pointer to the kernel descriptor (see
+   :ref:`amdgpu-amdhsa-kernel-descriptor`) of the kernel to execute is
+   obtained. It must be for a kernel that is contained in a code object that
+   was loaded by the ROCm runtime on the kernel agent with which the AQL queue is
+   associated.
+3. Space is allocated for the kernel arguments using the ROCm runtime allocator
+   for a memory region with the kernarg property for the kernel agent that will
+   execute the kernel. It must be at least 16 byte aligned.
+4. Kernel argument values are assigned to the kernel argument memory
+   allocation. The layout is defined in the *HSA Programmer's Language Reference*
+   [HSA]_. For AMDGPU the kernel execution directly accesses the kernel argument
+   memory in the same way constant memory is accessed. (Note that the HSA
+   specification allows an implementation to copy the kernel argument contents to
+   another location that is accessed by the kernel.)
+5. An AQL kernel dispatch packet is created on the AQL queue. The ROCm runtime
+   API uses 64 bit atomic operations to reserve space in the AQL queue for the
+   packet. The packet must be set up, and the final write must use an atomic
+   store release to set the packet kind to ensure the packet contents are
+   visible to the kernel agent. AQL defines a doorbell signal mechanism to
+   notify the kernel agent that the AQL queue has been updated. These rules, and
+   the layout of the AQL queue and kernel dispatch packet are defined in the *HSA
+   System Architecture Specification* [HSA]_.
+6. A kernel dispatch packet includes information about the actual dispatch,
+   such as grid and work-group size, together with information from the code
+   object about the kernel, such as segment sizes. The ROCm runtime queries on
+   the kernel symbol can be used to obtain the code object values which are
+   recorded in the :ref:`amdgpu-amdhsa-hsa-code-object-metadata`.
+7. CP executes micro-code and is responsible for detecting and setting up the
+   GPU to execute the wavefronts of a kernel dispatch.
+8. CP ensures that when a wavefront starts executing the kernel machine
+   code, the scalar general purpose registers (SGPR) and vector general purpose
+   registers (VGPR) are set up as required by the machine code. The required
+   setup is defined in the :ref:`amdgpu-amdhsa-kernel-descriptor`. The initial
+   register state is defined in
+   :ref:`amdgpu-amdhsa-initial-kernel-execution-state`.
+9. The prolog of the kernel machine code (see
+   :ref:`amdgpu-amdhsa-kernel-prolog`) sets up the machine state as necessary
+   before continuing executing the machine code that corresponds to the kernel.
+10. When the kernel dispatch has completed execution, CP signals the completion
+    signal specified in the kernel dispatch packet if not 0.
+
+.. _amdgpu-amdhsa-memory-spaces:
+
+Memory Spaces
+~~~~~~~~~~~~~
+
+The memory space properties are:
+
+  .. table:: AMDHSA Memory Spaces
+     :name: amdgpu-amdhsa-memory-spaces-table
+
+     ================= =========== ======== ======= ==================
+     Memory Space Name HSA Segment Hardware Address NULL Value
+                       Name        Name     Size
+     ================= =========== ======== ======= ==================
+     Private           private     scratch  32      0x00000000
+     Local             group       LDS      32      0xFFFFFFFF
+     Global            global      global   64      0x0000000000000000
+     Constant          constant    *same as 64      0x0000000000000000
+                                   global*
+     Generic           flat        flat     64      0x0000000000000000
+     Region            N/A         GDS      32      *not implemented
+                                                    for AMDHSA*
+     ================= =========== ======== ======= ==================
+
+The global and constant memory spaces both use global virtual addresses, which
+are the same virtual address space used by the CPU. However, some virtual
+addresses may only be accessible to the CPU, some only accessible by the GPU,
+and some by both.
+
+Using the constant memory space indicates that the data will not change during
+the execution of the kernel. This allows scalar read instructions to be
+used. The vector and scalar L1 caches are invalidated of volatile data before
+each kernel dispatch execution to allow constant memory to change values between
+kernel dispatches.
+
+The local memory space uses the hardware Local Data Store (LDS) which is
+automatically allocated when the hardware creates work-groups of wavefronts, and
+freed when all the wavefronts of a work-group have terminated. The data store
+(DS) instructions can be used to access it.
+
+The private memory space uses the hardware scratch memory support. If the kernel
+uses scratch, then the hardware allocates memory that is accessed using
+wavefront lane dword (4 byte) interleaving. The mapping used from private
+address to physical address is:
+
+  ``wavefront-scratch-base +
+  (private-address * wavefront-size * 4) +
+  (wavefront-lane-id * 4)``
+
+There are different ways that the wavefront scratch base address is determined
+by a wavefront (see :ref:`amdgpu-amdhsa-initial-kernel-execution-state`). This
+memory can be accessed in an interleaved manner using buffer instruction with
+the scratch buffer descriptor and per wave scratch offset, by the scratch
+instructions, or by flat instructions. If each lane of a wavefront accesses the
+same private address, the interleaving results in adjacent dwords being accessed
+and hence requires fewer cache lines to be fetched. Multi-dword access is not
+supported except by flat and scratch instructions in GFX9.
+
+The generic address space uses the hardware flat address support available in
+GFX7-GFX9. This uses two fixed ranges of virtual addresses (the private and
+local apertures), that are outside the range of addressable global memory, to
+map from a flat address to a private or local address.
+
+FLAT instructions can take a flat address and access global, private (scratch)
+and group (LDS) memory depending on if the address is within one of the
+aperture ranges. Flat access to scratch requires hardware aperture setup and
+setup in the kernel prologue (see :ref:`amdgpu-amdhsa-flat-scratch`). Flat
+access to LDS requires hardware aperture setup and M0 (GFX7-GFX8) register setup
+(see :ref:`amdgpu-amdhsa-m0`).
+
+To convert between a segment address and a flat address the base address of the
+apertures can be used. For GFX7-GFX8 these are available in the
+:ref:`amdgpu-amdhsa-hsa-aql-queue` the address of which can be obtained with
+Queue Ptr SGPR (see :ref:`amdgpu-amdhsa-initial-kernel-execution-state`). For
+GFX9 the aperture base addresses are directly available as inline constant
+registers ``SRC_SHARED_BASE/LIMIT`` and ``SRC_PRIVATE_BASE/LIMIT``. In 64 bit
+address mode the aperture sizes are 2^32 bytes and the base is aligned to 2^32
+which makes it easier to convert from flat to segment or segment to flat.
+
+Image and Samplers
+~~~~~~~~~~~~~~~~~~
+
+Image and sampler handles created by the ROCm runtime are 64 bit addresses of a
+hardware 32 byte V# and 48 byte S# object respectively. In order to support the
+HSA ``query_sampler`` operations two extra dwords are used to store the HSA BRIG
+enumeration values for the queries that are not trivially deducible from the S#
+representation.
+
+HSA Signals
+~~~~~~~~~~~
+
+HSA signal handles created by the ROCm runtime are 64 bit addresses of a
+structure allocated in memory accessible from both the CPU and GPU. The
+structure is defined by the ROCm runtime and subject to change between releases
+(see [AMD-ROCm-github]_).
+
+.. _amdgpu-amdhsa-hsa-aql-queue:
+
+HSA AQL Queue
+~~~~~~~~~~~~~
+
+The HSA AQL queue structure is defined by the ROCm runtime and subject to change
+between releases (see [AMD-ROCm-github]_). For some processors it contains
+fields needed to implement certain language features such as the flat address
+aperture bases. It also contains fields used by CP such as managing the
+allocation of scratch memory.
+
+.. _amdgpu-amdhsa-kernel-descriptor:
+
+Kernel Descriptor
+~~~~~~~~~~~~~~~~~
+
+A kernel descriptor consists of the information needed by CP to initiate the
+execution of a kernel, including the entry point address of the machine code
+that implements the kernel.
+
+Kernel Descriptor for GFX6-GFX9
++++++++++++++++++++++++++++++++
+
+CP microcode requires the kernel descriptor to be allocated on 64 byte alignment.
+
+  .. table:: Kernel Descriptor for GFX6-GFX9
+     :name: amdgpu-amdhsa-kernel-descriptor-gfx6-gfx9-table
+
+     ======= ======= =============================== ============================
+     Bits    Size    Field Name                      Description
+     ======= ======= =============================== ============================
+     31:0    4 bytes GroupSegmentFixedSize           The amount of fixed local
+                                                     address space memory
+                                                     required for a work-group
+                                                     in bytes. This does not
+                                                     include any dynamically
+                                                     allocated local address
+                                                     space memory that may be
+                                                     added when the kernel is
+                                                     dispatched.
+     63:32   4 bytes PrivateSegmentFixedSize         The amount of fixed
+                                                     private address space
+                                                     memory required for a
+                                                     work-item in bytes. If
+                                                     is_dynamic_callstack is 1
+                                                     then additional space must
+                                                     be added to this value for
+                                                     the call stack.
+     95:64   4 bytes MaxFlatWorkGroupSize            Maximum flat work-group
+                                                     size supported by the
+                                                     kernel in work-items.
+     96      1 bit   IsDynamicCallStack              Indicates if the generated
+                                                     machine code is using a
+                                                     dynamically sized call
+                                                     stack.
+     97      1 bit   IsXNACKEnabled                  Indicates if the generated
+                                                     machine code is capable of
+                                                     supporting XNACK.
+     127:98  30 bits                                 Reserved, must be 0.
+     191:128 8 bytes KernelCodeEntryByteOffset       Byte offset (possibly
+                                                     negative) from base
+                                                     address of kernel
+                                                     descriptor to kernel's
+                                                     entry point instruction
+                                                     which must be 256 byte
+                                                     aligned.
+     383:192 24                                      Reserved, must be 0.
+             bytes
+     415:384 4 bytes ComputePgmRsrc1                 Compute Shader (CS)
+                                                     program settings used by
+                                                     CP to set up
+                                                     ``COMPUTE_PGM_RSRC1``
+                                                     configuration
+                                                     register. See
+                                                     :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx9-table`.
+     447:416 4 bytes ComputePgmRsrc2                 Compute Shader (CS)
+                                                     program settings used by
+                                                     CP to set up
+                                                     ``COMPUTE_PGM_RSRC2``
+                                                     configuration
+                                                     register. See
+                                                     :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx9-table`.
+     448     1 bit   EnableSGPRPrivateSegmentBuffer  Enable the setup of the
+                                                     SGPR user data registers
+                                                     (see
+                                                     :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
+                                                     The total number of SGPR
+                                                     user data registers
+                                                     requested must not exceed
+                                                     16 and match value in
+                                                     ``compute_pgm_rsrc2.user_sgpr.user_sgpr_count``.
+                                                     Any requests beyond 16
+                                                     will be ignored.
+     449     1 bit   EnableSGPRDispatchPtr           *see above*
+     450     1 bit   EnableSGPRQueuePtr              *see above*
+     451     1 bit   EnableSGPRKernargSegmentPtr     *see above*
+     452     1 bit   EnableSGPRDispatchID            *see above*
+     453     1 bit   EnableSGPRFlatScratchInit       *see above*
+     454     1 bit   EnableSGPRPrivateSegmentSize    *see above*
+     455     1 bit   EnableSGPRGridWorkgroupCountX   Not implemented in CP and
+                                                     should always be 0.
+     456     1 bit   EnableSGPRGridWorkgroupCountY   Not implemented in CP and
+                                                     should always be 0.
+     457     1 bit   EnableSGPRGridWorkgroupCountZ   Not implemented in CP and
+                                                     should always be 0.
+     463:458 6 bits                                  Reserved, must be 0.
+     511:464 6                                       Reserved, must be 0.
+             bytes
+     512     **Total size 64 bytes.**
+     ======= ====================================================================
+
+..
+
+  .. table:: compute_pgm_rsrc1 for GFX6-GFX9
+     :name: amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx9-table
+
+     ======= ======= =============================== ===========================================================================
+     Bits    Size    Field Name                      Description
+     ======= ======= =============================== ===========================================================================
+     5:0     6 bits  GRANULATED_WORKITEM_VGPR_COUNT  Number of vector registers
+                                                     used by each work-item,
+                                                     granularity is device
+                                                     specific:
+
+                                                     GFX6-9
+                                                       - max_vgpr 1..256
+                                                       - roundup((max_vgpr + 1)
+                                                         / 4) - 1
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.VGPRS``.
+     9:6     4 bits  GRANULATED_WAVEFRONT_SGPR_COUNT Number of scalar registers
+                                                     used by a wavefront,
+                                                     granularity is device
+                                                     specific:
+
+                                                     GFX6-8
+                                                       - max_sgpr 1..112
+                                                       - roundup((max_sgpr + 1)
+                                                         / 8) - 1
+                                                     GFX9
+                                                       - max_sgpr 1..112
+                                                       - roundup((max_sgpr + 1)
+                                                         / 16) - 1
+
+                                                     Includes the special SGPRs
+                                                     for VCC, Flat Scratch (for
+                                                     GFX7 onwards) and XNACK
+                                                     (for GFX8 onwards). It does
+                                                     not include the 16 SGPR
+                                                     added if a trap handler is
+                                                     enabled.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.SGPRS``.
+     11:10   2 bits  PRIORITY                        Must be 0.
+
+                                                     Start executing wavefront
+                                                     at the specified priority.
+
+                                                     CP is responsible for
+                                                     filling in
+                                                     ``COMPUTE_PGM_RSRC1.PRIORITY``.
+     13:12   2 bits  FLOAT_ROUND_MODE_32             Wavefront starts execution
+                                                     with specified rounding
+                                                     mode for single (32
+                                                     bit) floating point
+                                                     precision floating point
+                                                     operations.
+
+                                                     Floating point rounding
+                                                     mode values are defined in
+                                                     :ref:`amdgpu-amdhsa-floating-point-rounding-mode-enumeration-values-table`.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.FLOAT_MODE``.
+     15:14   2 bits  FLOAT_ROUND_MODE_16_64          Wavefront starts execution
+                                                     with specified rounding
+                                                     mode for half/double (16
+                                                     and 64 bit) floating point
+                                                     precision floating point
+                                                     operations.
+
+                                                     Floating point rounding
+                                                     mode values are defined in
+                                                     :ref:`amdgpu-amdhsa-floating-point-rounding-mode-enumeration-values-table`.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.FLOAT_MODE``.
+     17:16   2 bits  FLOAT_DENORM_MODE_32            Wavefront starts execution
+                                                     with specified denorm mode
+                                                     for single (32
+                                                     bit) floating point
+                                                     precision floating point
+                                                     operations.
+
+                                                     Floating point denorm mode
+                                                     values are defined in
+                                                     :ref:`amdgpu-amdhsa-floating-point-denorm-mode-enumeration-values-table`.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.FLOAT_MODE``.
+     19:18   2 bits  FLOAT_DENORM_MODE_16_64         Wavefront starts execution
+                                                     with specified denorm mode
+                                                     for half/double (16
+                                                     and 64 bit) floating point
+                                                     precision floating point
+                                                     operations.
+
+                                                     Floating point denorm mode
+                                                     values are defined in
+                                                     :ref:`amdgpu-amdhsa-floating-point-denorm-mode-enumeration-values-table`.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.FLOAT_MODE``.
+     20      1 bit   PRIV                            Must be 0.
+
+                                                     Start executing wavefront
+                                                     in privilege trap handler
+                                                     mode.
+
+                                                     CP is responsible for
+                                                     filling in
+                                                     ``COMPUTE_PGM_RSRC1.PRIV``.
+     21      1 bit   ENABLE_DX10_CLAMP               Wavefront starts execution
+                                                     with DX10 clamp mode
+                                                     enabled. Used by the vector
+                                                     ALU to force DX10 style
+                                                     treatment of NaN's (when
+                                                     set, clamp NaN to zero,
+                                                     otherwise pass NaN
+                                                     through).
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.DX10_CLAMP``.
+     22      1 bit   DEBUG_MODE                      Must be 0.
+
+                                                     Start executing wavefront
+                                                     in single step mode.
+
+                                                     CP is responsible for
+                                                     filling in
+                                                     ``COMPUTE_PGM_RSRC1.DEBUG_MODE``.
+     23      1 bit   ENABLE_IEEE_MODE                Wavefront starts execution
+                                                     with IEEE mode
+                                                     enabled. Floating point
+                                                     opcodes that support
+                                                     exception flag gathering
+                                                     will quiet and propagate
+                                                     signaling-NaN inputs per
+                                                     IEEE 754-2008. Min_dx10 and
+                                                     max_dx10 become IEEE
+                                                     754-2008 compliant due to
+                                                     signaling-NaN propagation
+                                                     and quieting.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC1.IEEE_MODE``.
+     24      1 bit   BULKY                           Must be 0.
+
+                                                     Only one work-group allowed
+                                                     to execute on a compute
+                                                     unit.
+
+                                                     CP is responsible for
+                                                     filling in
+                                                     ``COMPUTE_PGM_RSRC1.BULKY``.
+     25      1 bit   CDBG_USER                       Must be 0.
+
+                                                     Flag that can be used to
+                                                     control debugging code.
+
+                                                     CP is responsible for
+                                                     filling in
+                                                     ``COMPUTE_PGM_RSRC1.CDBG_USER``.
+     26      1 bit   FP16_OVFL                       GFX6-8
+                                                       Reserved, must be 0.
+                                                     GFX9
+                                                       Wavefront starts execution
+                                                       with specified fp16 overflow
+                                                       mode.
+
+                                                       - If 0, fp16 overflow generates
+                                                         +/-INF values.
+                                                       - If 1, fp16 overflow that is the
+                                                         result of an +/-INF input value
+                                                         or divide by 0 produces a +/-INF,
+                                                         otherwise clamps computed
+                                                         overflow to +/-MAX_FP16 as
+                                                         appropriate.
+
+                                                       Used by CP to set up
+                                                       ``COMPUTE_PGM_RSRC1.FP16_OVFL``.
+     31:27   5 bits                                  Reserved, must be 0.
+     32      **Total size 4 bytes**
+     ======= ===================================================================================================================
+
+..
+
+  .. table:: compute_pgm_rsrc2 for GFX6-GFX9
+     :name: amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx9-table
+
+     ======= ======= =============================== ===========================================================================
+     Bits    Size    Field Name                      Description
+     ======= ======= =============================== ===========================================================================
+     0       1 bit   ENABLE_SGPR_PRIVATE_SEGMENT     Enable the setup of the
+                     _WAVE_OFFSET                    SGPR wave scratch offset
+                                                     system register (see
+                                                     :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.SCRATCH_EN``.
+     5:1     5 bits  USER_SGPR_COUNT                 The total number of SGPR
+                                                     user data registers
+                                                     requested. This number must
+                                                     match the number of user
+                                                     data registers enabled.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.USER_SGPR``.
+     6       1 bit   ENABLE_TRAP_HANDLER             Set to 1 if code contains a
+                                                     TRAP instruction which
+                                                     requires a trap handler to
+                                                     be enabled.
+
+                                                     CP sets
+                                                     ``COMPUTE_PGM_RSRC2.TRAP_PRESENT``
+                                                     if the runtime has
+                                                     installed a trap handler
+                                                     regardless of the setting
+                                                     of this field.
+     7       1 bit   ENABLE_SGPR_WORKGROUP_ID_X      Enable the setup of the
+                                                     system SGPR register for
+                                                     the work-group id in the X
+                                                     dimension (see
+                                                     :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.TGID_X_EN``.
+     8       1 bit   ENABLE_SGPR_WORKGROUP_ID_Y      Enable the setup of the
+                                                     system SGPR register for
+                                                     the work-group id in the Y
+                                                     dimension (see
+                                                     :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.TGID_Y_EN``.
+     9       1 bit   ENABLE_SGPR_WORKGROUP_ID_Z      Enable the setup of the
+                                                     system SGPR register for
+                                                     the work-group id in the Z
+                                                     dimension (see
+                                                     :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.TGID_Z_EN``.
+     10      1 bit   ENABLE_SGPR_WORKGROUP_INFO      Enable the setup of the
+                                                     system SGPR register for
+                                                     work-group information (see
+                                                     :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.TGID_SIZE_EN``.
+     12:11   2 bits  ENABLE_VGPR_WORKITEM_ID         Enable the setup of the
+                                                     VGPR system registers used
+                                                     for the work-item ID.
+                                                     :ref:`amdgpu-amdhsa-system-vgpr-work-item-id-enumeration-values-table`
+                                                     defines the values.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.TIDIG_CMP_CNT``.
+     13      1 bit   ENABLE_EXCEPTION_ADDRESS_WATCH  Must be 0.
+
+                                                     Wavefront starts execution
+                                                     with address watch
+                                                     exceptions enabled which
+                                                     are generated when L1 has
+                                                     witnessed a thread access
+                                                     an *address of
+                                                     interest*.
+
+                                                     CP is responsible for
+                                                     filling in the address
+                                                     watch bit in
+                                                     ``COMPUTE_PGM_RSRC2.EXCP_EN_MSB``
+                                                     according to what the
+                                                     runtime requests.
+     14      1 bit   ENABLE_EXCEPTION_MEMORY         Must be 0.
+
+                                                     Wavefront starts execution
+                                                     with memory violation
+                                                     exceptions
+                                                     enabled which are generated
+                                                     when a memory violation has
+                                                     occurred for this wave from
+                                                     L1 or LDS
+                                                     (write-to-read-only-memory,
+                                                     mis-aligned atomic, LDS
+                                                     address out of range,
+                                                     illegal address, etc.).
+
+                                                     CP sets the memory
+                                                     violation bit in
+                                                     ``COMPUTE_PGM_RSRC2.EXCP_EN_MSB``
+                                                     according to what the
+                                                     runtime requests.
+     23:15   9 bits  GRANULATED_LDS_SIZE             Must be 0.
+
+                                                     CP uses the rounded value
+                                                     from the dispatch packet,
+                                                     not this value, as the
+                                                     dispatch may contain
+                                                     dynamically allocated group
+                                                     segment memory. CP writes
+                                                     directly to
+                                                     ``COMPUTE_PGM_RSRC2.LDS_SIZE``.
+
+                                                     Amount of group segment
+                                                     (LDS) to allocate for each
+                                                     work-group. Granularity is
+                                                     device specific:
+
+                                                     GFX6:
+                                                       roundup(lds-size / (64 * 4))
+                                                     GFX7-GFX9:
+                                                       roundup(lds-size / (128 * 4))
+
+     24      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    Wavefront starts execution
+                     _INVALID_OPERATION              with specified exceptions
+                                                     enabled.
+
+                                                     Used by CP to set up
+                                                     ``COMPUTE_PGM_RSRC2.EXCP_EN``
+                                                     (set from bits 0..6).
+
+                                                     IEEE 754 FP Invalid
+                                                     Operation
+     25      1 bit   ENABLE_EXCEPTION_FP_DENORMAL    FP Denormal one or more
+                     _SOURCE                         input operands is a
+                                                     denormal number
+     26      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    IEEE 754 FP Division by
+                     _DIVISION_BY_ZERO               Zero
+     27      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    IEEE 754 FP Overflow
+                     _OVERFLOW
+     28      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    IEEE 754 FP Underflow
+                     _UNDERFLOW
+     29      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    IEEE 754 FP Inexact
+                     _INEXACT
+     30      1 bit   ENABLE_EXCEPTION_INT_DIVIDE_BY  Integer Division by Zero
+                     _ZERO                           (rcp_iflag_f32 instruction
+                                                     only)
+     31      1 bit                                   Reserved, must be 0.
+     32      **Total size 4 bytes.**
+     ======= ===================================================================================================================
+
+..
+
+  .. table:: Floating Point Rounding Mode Enumeration Values
+     :name: amdgpu-amdhsa-floating-point-rounding-mode-enumeration-values-table
+
+     ====================================== ===== ==============================
+     Enumeration Name                       Value Description
+     ====================================== ===== ==============================
+     AMDGPU_FLOAT_ROUND_MODE_NEAR_EVEN      0     Round Ties To Even
+     AMDGPU_FLOAT_ROUND_MODE_PLUS_INFINITY  1     Round Toward +infinity
+     AMDGPU_FLOAT_ROUND_MODE_MINUS_INFINITY 2     Round Toward -infinity
+     AMDGPU_FLOAT_ROUND_MODE_ZERO           3     Round Toward 0
+     ====================================== ===== ==============================
+
+..
+
+  .. table:: Floating Point Denorm Mode Enumeration Values
+     :name: amdgpu-amdhsa-floating-point-denorm-mode-enumeration-values-table
+
+     ====================================== ===== ==============================
+     Enumeration Name                       Value Description
+     ====================================== ===== ==============================
+     AMDGPU_FLOAT_DENORM_MODE_FLUSH_SRC_DST 0     Flush Source and Destination
+                                                  Denorms
+     AMDGPU_FLOAT_DENORM_MODE_FLUSH_DST     1     Flush Output Denorms
+     AMDGPU_FLOAT_DENORM_MODE_FLUSH_SRC     2     Flush Source Denorms
+     AMDGPU_FLOAT_DENORM_MODE_FLUSH_NONE    3     No Flush
+     ====================================== ===== ==============================
+
+..
+
+  .. table:: System VGPR Work-Item ID Enumeration Values
+     :name: amdgpu-amdhsa-system-vgpr-work-item-id-enumeration-values-table
+
+     ======================================== ===== ============================
+     Enumeration Name                         Value Description
+     ======================================== ===== ============================
+     AMDGPU_SYSTEM_VGPR_WORKITEM_ID_X         0     Set work-item X dimension
+                                                    ID.
+     AMDGPU_SYSTEM_VGPR_WORKITEM_ID_X_Y       1     Set work-item X and Y
+                                                    dimensions ID.
+     AMDGPU_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z     2     Set work-item X, Y and Z
+                                                    dimensions ID.
+     AMDGPU_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED 3     Undefined.
+     ======================================== ===== ============================
+
+.. _amdgpu-amdhsa-initial-kernel-execution-state:
+
+Initial Kernel Execution State
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section defines the register state that will be set up by the packet
+processor prior to the start of execution of every wavefront. This is limited by
+the constraints of the hardware controllers of CP/ADC/SPI.
+
+The order of the SGPR registers is defined, but the compiler can specify which
+ones are actually set up in the kernel descriptor using the ``enable_sgpr_*`` bit
+fields (see :ref:`amdgpu-amdhsa-kernel-descriptor`). The register numbers used
+for enabled registers are dense starting at SGPR0: the first enabled register is
+SGPR0, the next enabled register is SGPR1 etc.; disabled registers do not have
+an SGPR number.
+
+The initial SGPRs comprise up to 16 User SGPRs that are set by CP and apply to
+all waves of the grid. It is possible to specify more than 16 User SGPRs using
+the ``enable_sgpr_*`` bit fields, in which case only the first 16 are actually
+initialized. These are then immediately followed by the System SGPRs that are
+set up by ADC/SPI and can have different values for each wave of the grid
+dispatch.
+
+SGPR register initial state is defined in
+:ref:`amdgpu-amdhsa-sgpr-register-set-up-order-table`.
+
+  .. table:: SGPR Register Set Up Order
+     :name: amdgpu-amdhsa-sgpr-register-set-up-order-table
+
+     ========== ========================== ====== ==============================
+     SGPR Order Name                       Number Description
+                (kernel descriptor enable  of
+                field)                     SGPRs
+     ========== ========================== ====== ==============================
+     First      Private Segment Buffer     4      V# that can be used, together
+                (enable_sgpr_private              with Scratch Wave Offset as an
+                _segment_buffer)                  offset, to access the private
+                                                  memory space using a segment
+                                                  address.
+
+                                                  CP uses the value provided by
+                                                  the runtime.
+     then       Dispatch Ptr               2      64 bit address of AQL dispatch
+                (enable_sgpr_dispatch_ptr)        packet for kernel dispatch
+                                                  actually executing.
+     then       Queue Ptr                  2      64 bit address of amd_queue_t
+                (enable_sgpr_queue_ptr)           object for AQL queue on which
+                                                  the dispatch packet was
+                                                  queued.
+     then       Kernarg Segment Ptr        2      64 bit address of Kernarg
+                (enable_sgpr_kernarg              segment. This is directly
+                _segment_ptr)                     copied from the
+                                                  kernarg_address in the kernel
+                                                  dispatch packet.
+
+                                                  Having CP load it once avoids
+                                                  loading it at the beginning of
+                                                  every wavefront.
+     then       Dispatch Id                2      64 bit Dispatch ID of the
+                (enable_sgpr_dispatch_id)         dispatch packet being
+                                                  executed.
+     then       Flat Scratch Init          2      This is 2 SGPRs:
+                (enable_sgpr_flat_scratch
+                _init)                            GFX6
+                                                    Not supported.
+                                                  GFX7-GFX8
+                                                    The first SGPR is a 32 bit
+                                                    byte offset from
+                                                    ``SH_HIDDEN_PRIVATE_BASE_VIMID``
+                                                    to per SPI base of memory
+                                                    for scratch for the queue
+                                                    executing the kernel
+                                                    dispatch. CP obtains this
+                                                    from the runtime. (The
+                                                    Scratch Segment Buffer base
+                                                    address is
+                                                    ``SH_HIDDEN_PRIVATE_BASE_VIMID``
+                                                    plus this offset.) The value
+                                                    of Scratch Wave Offset must
+                                                    be added to this offset by
+                                                    the kernel machine code,
+                                                    right shifted by 8, and
+                                                    moved to the FLAT_SCRATCH_HI
+                                                    SGPR register.
+                                                    FLAT_SCRATCH_HI corresponds
+                                                    to SGPRn-4 on GFX7, and
+                                                    SGPRn-6 on GFX8 (where SGPRn
+                                                    is the highest numbered SGPR
+                                                    allocated to the wave).
+                                                    FLAT_SCRATCH_HI is
+                                                    multiplied by 256 (as it is
+                                                    in units of 256 bytes) and
+                                                    added to
+                                                    ``SH_HIDDEN_PRIVATE_BASE_VIMID``
+                                                    to calculate the per wave
+                                                    FLAT SCRATCH BASE in flat
+                                                    memory instructions that
+                                                    access the scratch
+                                                    aperture.
+
+                                                    The second SGPR is 32 bit
+                                                    byte size of a single
+                                                    work-item's scratch memory
+                                                    usage. CP obtains this from
+                                                    the runtime, and it is
+                                                    always a multiple of DWORD.
+                                                    CP checks that the value in
+                                                    the kernel dispatch packet
+                                                    Private Segment Byte Size is
+                                                    not larger, and requests the
+                                                    runtime to increase the
+                                                    queue's scratch size if
+                                                    necessary. The kernel code
+                                                    must move it to
+                                                    FLAT_SCRATCH_LO which is
+                                                    SGPRn-3 on GFX7 and SGPRn-5
+                                                    on GFX8. FLAT_SCRATCH_LO is
+                                                    used as the FLAT SCRATCH
+                                                    SIZE in flat memory
+                                                    instructions. Having CP load
+                                                    it once avoids loading it at
+                                                    the beginning of every
+                                                    wavefront.
+                                                  GFX9
+                                                    This is the
+                                                    64 bit base address of the
+                                                    per SPI scratch backing
+                                                    memory managed by SPI for
+                                                    the queue executing the
+                                                    kernel dispatch. CP obtains
+                                                    this from the runtime (and
+                                                    divides it if there are
+                                                    multiple Shader Arrays each
+                                                    with its own SPI). The value
+                                                    of Scratch Wave Offset must
+                                                    be added by the kernel
+                                                    machine code and the result
+                                                    moved to the FLAT_SCRATCH
+                                                    SGPR which is SGPRn-6 and
+                                                    SGPRn-5. It is used as the
+                                                    FLAT SCRATCH BASE in flat
+                                                    memory instructions.
+     then       Private Segment Size       1      The 32 bit byte size of a
+                (enable_sgpr_private              single work-item's
+                _segment_size)                    scratch memory
+                                                  allocation. This is the
+                                                  value from the kernel
+                                                  dispatch packet Private
+                                                  Segment Byte Size rounded up
+                                                  by CP to a multiple of
+                                                  DWORD.
+
+                                                  Having CP load it once avoids
+                                                  loading it at the beginning of
+                                                  every wavefront.
+
+                                                  This is not used for
+                                                  GFX7-GFX8 since it is the same
+                                                  value as the second SGPR of
+                                                  Flat Scratch Init. However, it
+                                                  may be needed for GFX9 which
+                                                  changes the meaning of the
+                                                  Flat Scratch Init value.
+     then       Grid Work-Group Count X    1      32 bit count of the number of
+                (enable_sgpr_grid                 work-groups in the X dimension
+                _workgroup_count_X)               for the grid being
+                                                  executed. Computed from the
+                                                  fields in the kernel dispatch
+                                                  packet as ((grid_size.x +
+                                                  workgroup_size.x - 1) /
+                                                  workgroup_size.x).
+     then       Grid Work-Group Count Y    1      32 bit count of the number of
+                (enable_sgpr_grid                 work-groups in the Y dimension
+                _workgroup_count_Y &&             for the grid being
+                less than 16 previous             executed. Computed from the
+                SGPRs)                            fields in the kernel dispatch
+                                                  packet as ((grid_size.y +
+                                                  workgroup_size.y - 1) /
+                                                  workgroup_size.y).
+
+                                                  Only initialized if <16
+                                                  previous SGPRs initialized.
+     then       Grid Work-Group Count Z    1      32 bit count of the number of
+                (enable_sgpr_grid                 work-groups in the Z dimension
+                _workgroup_count_Z &&             for the grid being
+                less than 16 previous             executed. Computed from the
+                SGPRs)                            fields in the kernel dispatch
+                                                  packet as ((grid_size.z +
+                                                  workgroup_size.z - 1) /
+                                                  workgroup_size.z).
+
+                                                  Only initialized if <16
+                                                  previous SGPRs initialized.
+     then       Work-Group Id X            1      32 bit work-group id in X
+                (enable_sgpr_workgroup_id         dimension of grid for
+                _X)                               wavefront.
+     then       Work-Group Id Y            1      32 bit work-group id in Y
+                (enable_sgpr_workgroup_id         dimension of grid for
+                _Y)                               wavefront.
+     then       Work-Group Id Z            1      32 bit work-group id in Z
+                (enable_sgpr_workgroup_id         dimension of grid for
+                _Z)                               wavefront.
+     then       Work-Group Info            1      {first_wave, 14'b0000,
+                (enable_sgpr_workgroup            ordered_append_term[10:0],
+                _info)                            threadgroup_size_in_waves[5:0]}
+     then       Scratch Wave Offset        1      32 bit byte offset from base
+                (enable_sgpr_private              of scratch base of queue
+                _segment_wave_offset)             executing the kernel
+                                                  dispatch. Must be used as an
+                                                  offset with Private
+                                                  segment address when using
+                                                  Scratch Segment Buffer. It
+                                                  must be used to set up FLAT
+                                                  SCRATCH for flat addressing
+                                                  (see
+                                                  :ref:`amdgpu-amdhsa-flat-scratch`).
+     ========== ========================== ====== ==============================
+
+The order of the VGPR registers is defined, but the compiler can specify which
+ones are actually set up in the kernel descriptor using the ``enable_vgpr*`` bit
+fields (see :ref:`amdgpu-amdhsa-kernel-descriptor`). The register numbers used
+for enabled registers are dense starting at VGPR0: the first enabled register is
+VGPR0, the next enabled register is VGPR1 etc.; disabled registers do not have a
+VGPR number.
+
+VGPR register initial state is defined in
+:ref:`amdgpu-amdhsa-vgpr-register-set-up-order-table`.
+
+  .. table:: VGPR Register Set Up Order
+     :name: amdgpu-amdhsa-vgpr-register-set-up-order-table
+
+     ========== ========================== ====== ==============================
+     VGPR Order Name                       Number Description
+                (kernel descriptor enable  of
+                field)                     VGPRs
+     ========== ========================== ====== ==============================
+     First      Work-Item Id X             1      32 bit work item id in X
+                (Always initialized)              dimension of work-group for
+                                                  wavefront lane.
+     then       Work-Item Id Y             1      32 bit work item id in Y
+                (enable_vgpr_workitem_id          dimension of work-group for
+                > 0)                              wavefront lane.
+     then       Work-Item Id Z             1      32 bit work item id in Z
+                (enable_vgpr_workitem_id          dimension of work-group for
+                > 1)                              wavefront lane.
+     ========== ========================== ====== ==============================
+
+The setting of registers is done by GPU CP/ADC/SPI hardware as follows:
+
+1. SGPRs before the Work-Group Ids are set by CP using the 16 User Data
+   registers.
+2. Work-group Id registers X, Y, Z are set by ADC which supports any
+   combination including none.
+3. Scratch Wave Offset is set by SPI on a per wave basis which is why its value
+   cannot be included with the flat scratch init value which is per queue.
+4. The VGPRs are set by SPI which only supports specifying either (X), (X, Y)
+   or (X, Y, Z).
+
+Flat Scratch register pair are adjacent SGPRs so they can be moved as a 64 bit
+value to the hardware required SGPRn-3 and SGPRn-4 respectively.
+
+The global segment can be accessed either using buffer instructions (GFX6 which
+has V# 64 bit address support), flat instructions (GFX7-9), or global
+instructions (GFX9).
+
+If buffer operations are used then the compiler can generate a V# with the
+following properties:
+
+* base address of 0
+* no swizzle
+* ATC: 1 if IOMMU present (such as APU)
+* ptr64: 1
+* MTYPE set to support memory coherence that matches the runtime (such as CC for
+  APU and NC for dGPU).
+
+.. _amdgpu-amdhsa-kernel-prolog:
+
+Kernel Prolog
+~~~~~~~~~~~~~
+
+.. _amdgpu-amdhsa-m0:
+
+M0
+++
+
+GFX6-GFX8
+  The M0 register must be initialized with a value at least the total LDS size
+  if the kernel may access LDS via DS or flat operations. Total LDS size is
+  available in dispatch packet. For M0, it is also possible to use maximum
+  possible value of LDS for given target (0x7FFF for GFX6 and 0xFFFF for
+  GFX7-GFX8).
+GFX9
+  The M0 register is not used for range checking LDS accesses and so does not
+  need to be initialized in the prolog.
+
+.. _amdgpu-amdhsa-flat-scratch:
+
+Flat Scratch
+++++++++++++
+
+If the kernel may use flat operations to access scratch memory, the prolog code
+must set up FLAT_SCRATCH register pair (FLAT_SCRATCH_LO/FLAT_SCRATCH_HI which
+are in SGPRn-4/SGPRn-3). Initialization uses Flat Scratch Init and Scratch Wave
+Offset SGPR registers (see :ref:`amdgpu-amdhsa-initial-kernel-execution-state`):
+
+GFX6
+  Flat scratch is not supported.
+
+GFX7-8
+  1. The low word of Flat Scratch Init is 32 bit byte offset from
+     ``SH_HIDDEN_PRIVATE_BASE_VIMID`` to the base of scratch backing memory
+     being managed by SPI for the queue executing the kernel dispatch. This is
+     the same value used in the Scratch Segment Buffer V# base address. The
+     prolog must add the value of Scratch Wave Offset to get the wave's byte
+     scratch backing memory offset from ``SH_HIDDEN_PRIVATE_BASE_VIMID``. Since
+     FLAT_SCRATCH_HI is in units of 256 bytes, the offset must be right shifted
+     by 8 before moving into FLAT_SCRATCH_HI.
+  2. The second word of Flat Scratch Init is 32 bit byte size of a single
+     work-item's scratch memory usage. This is directly loaded from the kernel
+     dispatch packet Private Segment Byte Size and rounded up to a multiple of
+     DWORD. Having CP load it once avoids loading it at the beginning of every
+     wavefront. The prolog must move it to FLAT_SCRATCH_LO for use as FLAT SCRATCH
+     SIZE.
+GFX9
+  The Flat Scratch Init is the 64 bit address of the base of scratch backing
+  memory being managed by SPI for the queue executing the kernel dispatch. The
+  prolog must add the value of Scratch Wave Offset and move the result to the
+  FLAT_SCRATCH pair for use as the flat scratch base in flat memory instructions.
+
+.. _amdgpu-amdhsa-memory-model:
+
+Memory Model
+~~~~~~~~~~~~
+
+This section describes the mapping of LLVM memory model onto AMDGPU machine code
+(see :ref:`memmodel`). *The implementation is WIP.*
+
+.. TODO
+   Update when implementation complete.
+
+The AMDGPU backend supports the memory synchronization scopes specified in
+:ref:`amdgpu-memory-scopes`.
+
+The code sequences used to implement the memory model are defined in table
+:ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
+
+The sequences specify the order of instructions that a single thread must
+execute. The ``s_waitcnt`` and ``buffer_wbinvl1_vol`` are defined with respect
+to other memory instructions executed by the same thread. This allows them to be
+moved earlier or later which can allow them to be combined with other instances
+of the same instruction, or hoisted/sunk out of loops to improve
+performance. Only the instructions related to the memory model are given;
+additional ``s_waitcnt`` instructions are required to ensure registers are
+defined before being used. These may be able to be combined with the memory
+model ``s_waitcnt`` instructions as described above.
+
+The AMDGPU backend supports the following memory models:
+
+  HSA Memory Model [HSA]_
+    The HSA memory model uses a single happens-before relation for all address
+    spaces (see :ref:`amdgpu-address-spaces`).
+  OpenCL Memory Model [OpenCL]_
+    The OpenCL memory model which has separate happens-before relations for the
+    global and local address spaces. Only a fence specifying both global and
+    local address space, and seq_cst instructions join the relationships. Since
+    the LLVM ``memfence`` instruction does not allow an address space to be
+    specified the OpenCL fence has to conservatively assume both local and
+    global address space was specified. However, optimizations can often be
+    done to eliminate the additional ``s_waitcnt`` instructions when there are
+    no intervening memory instructions which access the corresponding address
+    space. The code sequences in the table indicate what can be omitted for the
+    OpenCL memory model. The target triple environment is used to determine if
+    the source language is OpenCL (see :ref:`amdgpu-opencl`).
+
+``ds/flat_load/store/atomic`` instructions to local memory are termed LDS
+operations.
+
+``buffer/global/flat_load/store/atomic`` instructions to global memory are
+termed vector memory operations.
+
+For GFX6-GFX9:
+
+* Each agent has multiple compute units (CU).
+* Each CU has multiple SIMDs that execute wavefronts.
+* The wavefronts for a single work-group are executed in the same CU but may be
+  executed by different SIMDs.
+* Each CU has a single LDS memory shared by the wavefronts of the work-groups
+  executing on it.
+* All LDS operations of a CU are performed as wavefront wide operations in a
+  global order and involve no caching. Completion is reported to a wavefront in
+  execution order.
+* The LDS memory has multiple request queues shared by the SIMDs of a
+  CU. Therefore, the LDS operations performed by different waves of a work-group
+  can be reordered relative to each other, which can result in reordering the
+  visibility of vector memory operations with respect to LDS operations of other
+  wavefronts in the same work-group. A ``s_waitcnt lgkmcnt(0)`` is required to
+  ensure synchronization between LDS operations and vector memory operations
+  between waves of a work-group, but not between operations performed by the
+  same wavefront.
+* The vector memory operations are performed as wavefront wide operations and
+  completion is reported to a wavefront in execution order. The exception is
+  that for GFX7-9 ``flat_load/store/atomic`` instructions can report out of
+  vector memory order if they access LDS memory, and out of LDS operation order
+  if they access global memory.
+* The vector memory operations access a single vector L1 cache shared by all
+  SIMDs of a CU. Therefore, no special action is required for coherence between
+  the lanes of a single wavefront, or for coherence between wavefronts in the
+  same work-group. A ``buffer_wbinvl1_vol`` is required for coherence between
+  waves executing in different work-groups as they may be executing on
+  different CUs.
+* The scalar memory operations access a scalar L1 cache shared by all wavefronts
+  on a group of CUs. The scalar and vector L1 caches are not coherent. However,
+  scalar operations are used in a restricted way so do not impact the memory
+  model. See :ref:`amdgpu-amdhsa-memory-spaces`.
+* The vector and scalar memory operations use an L2 cache shared by all CUs on
+  the same agent.
+* The L2 cache has independent channels to service disjoint ranges of virtual
+  addresses.
+* Each CU has a separate request queue per channel. Therefore, the vector and
+  scalar memory operations performed by waves executing in different work-groups
+  (which may be executing on different CUs) of an agent can be reordered
+  relative to each other. A ``s_waitcnt vmcnt(0)`` is required to ensure
+  synchronization between vector memory operations of different CUs. It ensures a
+  previous vector memory operation has completed before executing a subsequent
+  vector memory or LDS operation and so can be used to meet the requirements of
+  acquire and release.
+* The L2 cache can be kept coherent with other agents on some targets, or ranges
+  of virtual addresses can be set up to bypass it to ensure system coherence.
+
+Private address space uses ``buffer_load/store`` using the scratch V# (GFX6-8),
+or ``scratch_load/store`` (GFX9). Since only a single thread is accessing the
+memory, atomic memory orderings are not meaningful and all accesses are treated
+as non-atomic.
+
+Constant address space uses ``buffer/global_load`` instructions (or equivalent
+scalar memory instructions). Since the constant address space contents do not
+change during the execution of a kernel dispatch it is not legal to perform
+stores, and atomic memory orderings are not meaningful and all accesses are
+treated as non-atomic.
+
+A memory synchronization scope wider than work-group is not meaningful for the
+group (LDS) address space and is treated as work-group.
+
+The memory model does not support the region address space which is treated as
+non-atomic.
+
+Acquire memory ordering is not meaningful on store atomic instructions and is
+treated as non-atomic.
+
+Release memory ordering is not meaningful on load atomic instructions and is
+treated as non-atomic.
+
+Acquire-release memory ordering is not meaningful on load or store atomic
+instructions and is treated as acquire and release respectively.
+
+AMDGPU backend only uses scalar memory operations to access memory that is
+proven to not change during the execution of the kernel dispatch. This includes
+constant address space and global address space for program scope const
+variables. Therefore the kernel machine code does not have to maintain the
+scalar L1 cache to ensure it is coherent with the vector L1 cache. The scalar
+and vector L1 caches are invalidated between kernel dispatches by CP since
+constant address space data may change between kernel dispatch executions. See
+:ref:`amdgpu-amdhsa-memory-spaces`.
+
+The one exception is if scalar writes are used to spill SGPR registers. In this
+case the AMDGPU backend ensures the memory location used to spill is never
+accessed by vector memory operations at the same time. If scalar writes are used
+then a ``s_dcache_wb`` is inserted before the ``s_endpgm`` and before a function
+return since the locations may be used for vector memory instructions by a
+future wave that uses the same scratch area, or a function call that creates a
+frame at the same address, respectively. There is no need for a ``s_dcache_inv``
+as all scalar writes are write-before-read in the same thread.
+
+Scratch backing memory (which is used for the private address space)
+is accessed with MTYPE NC_NV (non-coherent non-volatile). Since the private
+address space is only accessed by a single thread, and is always
+write-before-read, there is never a need to invalidate these entries from the L1
+cache. Hence all cache invalidates are done as ``*_vol`` to only invalidate the
+volatile cache lines.
+
+On dGPU the kernarg backing memory is accessed as UC (uncached) to avoid needing
+to invalidate the L2 cache. This also causes it to be treated as
+non-volatile and so is not invalidated by ``*_vol``. On APU it is accessed as CC
+(cache coherent) and so the L2 cache will be coherent with the CPU and other
+agents.
+
+  .. table:: AMDHSA Memory Model Code Sequences GFX6-GFX9
+     :name: amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table
+
+     ============ ============ ============== ========== ===============================
+     LLVM Instr   LLVM Memory  LLVM Memory    AMDGPU     AMDGPU Machine Code
+                  Ordering     Sync Scope     Address
+                                              Space
+     ============ ============ ============== ========== ===============================
+     **Non-Atomic**
+     -----------------------------------------------------------------------------------
+     load         *none*       *none*         - global   - !volatile & !nontemporal
+                                              - generic
+                                              - private    1. buffer/global/flat_load
+                                              - constant
+                                                         - volatile & !nontemporal
+
+                                                           1. buffer/global/flat_load
+                                                              glc=1
+
+                                                         - nontemporal
+
+                                                           1. buffer/global/flat_load
+                                                              glc=1 slc=1
+
+     load         *none*       *none*         - local    1. ds_load
+     store        *none*       *none*         - global   - !nontemporal
+                                              - generic
+                                              - private    1. buffer/global/flat_store
+                                              - constant
+                                                         - nontemporal
+
+                                                           1. buffer/global/flat_store
+                                                              glc=1 slc=1
+
+     store        *none*       *none*         - local    1. ds_store
+     **Unordered Atomic**
+     -----------------------------------------------------------------------------------
+     load atomic  unordered    *any*          *any*      *Same as non-atomic*.
+     store atomic unordered    *any*          *any*      *Same as non-atomic*.
+     atomicrmw    unordered    *any*          *any*      *Same as monotonic
+                                                         atomic*.
+     **Monotonic Atomic**
+     -----------------------------------------------------------------------------------
+     load atomic  monotonic    - singlethread - global   1. buffer/global/flat_load
+                               - wavefront    - generic
+                               - workgroup
+     load atomic  monotonic    - singlethread - local    1. ds_load
+                               - wavefront
+                               - workgroup
+     load atomic  monotonic    - agent        - global   1. buffer/global/flat_load
+                               - system       - generic     glc=1
+     store atomic monotonic    - singlethread - global   1. buffer/global/flat_store
+                               - wavefront    - generic
+                               - workgroup
+                               - agent
+                               - system
+     store atomic monotonic    - singlethread - local    1. ds_store
+                               - wavefront
+                               - workgroup
+     atomicrmw    monotonic    - singlethread - global   1. buffer/global/flat_atomic
+                               - wavefront    - generic
+                               - workgroup
+                               - agent
+                               - system
+     atomicrmw    monotonic    - singlethread - local    1. ds_atomic
+                               - wavefront
+                               - workgroup
+     **Acquire Atomic**
+     -----------------------------------------------------------------------------------
+     load atomic  acquire      - singlethread - global   1. buffer/global/ds/flat_load
+                               - wavefront    - local
+                                              - generic
+     load atomic  acquire      - workgroup    - global   1. buffer/global/flat_load
+     load atomic  acquire      - workgroup    - local    1. ds_load
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the load
+                                                             atomic value being
+                                                             acquired.
+     load atomic  acquire      - workgroup    - generic  1. flat_load
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the load
+                                                             atomic value being
+                                                             acquired.
+     load atomic  acquire      - agent        - global   1. buffer/global/flat_load
+                               - system                     glc=1
+                                                         2. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the load
+                                                             has completed
+                                                             before invalidating
+                                                             the cache.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following
+                                                             loads will not see
+                                                             stale global data.
+
+     load atomic  acquire      - agent        - generic  1. flat_load glc=1
+                               - system                  2. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the flat_load
+                                                             has completed
+                                                             before invalidating
+                                                             the cache.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acquire      - singlethread - global   1. buffer/global/ds/flat_atomic
+                               - wavefront    - local
+                                              - generic
+     atomicrmw    acquire      - workgroup    - global   1. buffer/global/flat_atomic
+     atomicrmw    acquire      - workgroup    - local    1. ds_atomic
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the
+                                                             atomicrmw value
+                                                             being acquired.
+
+     atomicrmw    acquire      - workgroup    - generic  1. flat_atomic
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the
+                                                             atomicrmw value
+                                                             being acquired.
+
+     atomicrmw    acquire      - agent        - global   1. buffer/global/flat_atomic
+                               - system                  2. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acquire      - agent        - generic  1. flat_atomic
+                               - system                  2. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         3. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     fence        acquire      - singlethread *none*     *none*
+                               - wavefront
+     fence        acquire      - workgroup    *none*     1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit.
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate. If
+                                                             fence had an
+                                                             address space then
+                                                             set to address
+                                                             space of OpenCL
+                                                             fence flag, or to
+                                                             generic if both
+                                                             local and global
+                                                             flags are
+                                                             specified.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the
+                                                             value read by the
+                                                             fence-paired-atomic.
+
+     fence        acquire      - agent        *none*     1. s_waitcnt lgkmcnt(0) &
+                               - system                     vmcnt(0)
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence need to
+                                                             conservatively
+                                                             always generate
+                                                             (see comment for
+                                                             previous fence).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures that the
+                                                             fence-paired-atomic
+                                                             has completed
+                                                             before invalidating
+                                                             the
+                                                             cache. Therefore
+                                                             any following
+                                                             locations read must
+                                                             be no older than
+                                                             the value read by
+                                                             the
+                                                             fence-paired-atomic.
+
+                                                         2. buffer_wbinvl1_vol
+
+                                                           - Must happen before any
+                                                             following global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     **Release Atomic**
+     -----------------------------------------------------------------------------------
+     store atomic release      - singlethread - global   1. buffer/global/ds/flat_store
+                               - wavefront    - local
+                                              - generic
+     store atomic release      - workgroup    - global   1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             store.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing the
+                                                             store that is being
+                                                             released.
+
+                                                         2. buffer/global/flat_store
+     store atomic release      - workgroup    - local    1. ds_store
+     store atomic release      - workgroup    - generic  1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             store.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing the
+                                                             store that is being
+                                                             released.
+
+                                                         2. flat_store
+     store atomic release      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                               - system       - generic     vmcnt(0)
+
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             store.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global and local
+                                                             have completed before
+                                                             performing the
+                                                             store that is being
+                                                             released.
+
+                                                         2. buffer/global/ds/flat_store
+     atomicrmw    release      - singlethread - global   1. buffer/global/ds/flat_atomic
+                               - wavefront    - local
+                                              - generic
+     atomicrmw    release      - workgroup    - global   1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. buffer/global/flat_atomic
+     atomicrmw    release      - workgroup    - local    1. ds_atomic
+     atomicrmw    release      - workgroup    - generic  1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. flat_atomic
+     atomicrmw    release      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                               - system       - generic     vmcnt(0)
+
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global and local
+                                                             have completed
+                                                             before performing
+                                                             the atomicrmw that
+                                                             is being released.
+
+                                                         2. buffer/global/ds/flat_atomic
+     fence        release      - singlethread *none*     *none*
+                               - wavefront
+     fence        release      - workgroup    *none*     1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit.
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence, it must
+                                                             conservatively
+                                                             always be generated.
+                                                             If fence had an
+                                                             address space then
+                                                             set to address
+                                                             space of OpenCL
+                                                             fence flag, or to
+                                                             generic if both
+                                                             local and global
+                                                             flags are
+                                                             specified.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             any following store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing the
+                                                             following
+                                                             fence-paired-atomic.
+
+     fence        release      - agent        *none*     1. s_waitcnt lgkmcnt(0) &
+                               - system                     vmcnt(0)
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - If OpenCL and
+                                                             address space is
+                                                             local, omit
+                                                             vmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence, it must
+                                                             conservatively
+                                                             always be generated.
+                                                             If fence had an
+                                                             address space then
+                                                             set to address
+                                                             space of OpenCL
+                                                             fence flag, or to
+                                                             generic if both
+                                                             local and global
+                                                             flags are
+                                                             specified.
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             any following store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             fence-paired-atomic).
+                                                           - Ensures that all
+                                                             memory operations
+                                                             have
+                                                             completed before
+                                                             performing the
+                                                             following
+                                                             fence-paired-atomic.
+
+     **Acquire-Release Atomic**
+     -----------------------------------------------------------------------------------
+     atomicrmw    acq_rel      - singlethread - global   1. buffer/global/ds/flat_atomic
+                               - wavefront    - local
+                                              - generic
+     atomicrmw    acq_rel      - workgroup    - global   1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. buffer/global/flat_atomic
+     atomicrmw    acq_rel      - workgroup    - local    1. ds_atomic
+                                                         2. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the
+                                                             atomicrmw value
+                                                             being acquired.
+
+     atomicrmw    acq_rel      - workgroup    - generic  1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. flat_atomic
+                                                         3. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL, omit.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures any
+                                                             following global
+                                                             data read is no
+                                                             older than the
+                                                             atomicrmw value
+                                                             being acquired.
+
+     atomicrmw    acq_rel      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                               - system                     vmcnt(0)
+
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. buffer/global/flat_atomic
+                                                         3. s_waitcnt vmcnt(0)
+
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         4. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     atomicrmw    acq_rel      - agent        - generic  1. s_waitcnt lgkmcnt(0) &
+                               - system                     vmcnt(0)
+
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to global have
+                                                             completed before
+                                                             performing the
+                                                             atomicrmw that is
+                                                             being released.
+
+                                                         2. flat_atomic
+                                                         3. s_waitcnt vmcnt(0) &
+                                                            lgkmcnt(0)
+
+                                                           - If OpenCL, omit
+                                                             lgkmcnt(0).
+                                                           - Must happen before
+                                                             following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures the
+                                                             atomicrmw has
+                                                             completed before
+                                                             invalidating the
+                                                             cache.
+
+                                                         4. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data.
+
+     fence        acq_rel      - singlethread *none*     *none*
+                               - wavefront
+     fence        acq_rel      - workgroup    *none*     1. s_waitcnt lgkmcnt(0)
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit.
+                                                           - However,
+                                                             since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence, it must
+                                                             conservatively
+                                                             always be generated
+                                                             (see comment for
+                                                             previous fence).
+                                                           - Must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that all
+                                                             memory operations
+                                                             to local have
+                                                             completed before
+                                                             performing any
+                                                             following global
+                                                             memory operations.
+                                                           - Ensures that the
+                                                             preceding
+                                                             local/generic load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             acquire-fence-paired-atomic
+                                                             ) has completed
+                                                             before following
+                                                             global memory
+                                                             operations. This
+                                                             satisfies the
+                                                             requirements of
+                                                             acquire.
+                                                           - Ensures that all
+                                                             previous memory
+                                                             operations have
+                                                             completed before a
+                                                             following
+                                                             local/generic store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             release-fence-paired-atomic
+                                                             ). This satisfies the
+                                                             requirements of
+                                                             release.
+
+     fence        acq_rel      - agent        *none*     1. s_waitcnt lgkmcnt(0) &
+                               - system                     vmcnt(0)
+
+                                                           - If OpenCL and
+                                                             address space is
+                                                             not generic, omit
+                                                             lgkmcnt(0).
+                                                           - However, since LLVM
+                                                             currently has no
+                                                             address space on
+                                                             the fence, it must
+                                                             conservatively
+                                                             always be generated
+                                                             (see comment for
+                                                             previous fence).
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0) and
+                                                             s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             global/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - s_waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             any preceding
+                                                             local/generic
+                                                             load/store/load
+                                                             atomic/store
+                                                             atomic/atomicrmw.
+                                                           - Must happen before
+                                                             the following
+                                                             buffer_wbinvl1_vol.
+                                                           - Ensures that the
+                                                             preceding
+                                                             global/local/generic
+                                                             load
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             acquire-fence-paired-atomic
+                                                             ) has completed
+                                                             before invalidating
+                                                             the cache. This
+                                                             satisfies the
+                                                             requirements of
+                                                             acquire.
+                                                           - Ensures that all
+                                                             previous memory
+                                                             operations have
+                                                             completed before a
+                                                             following
+                                                             global/local/generic
+                                                             store
+                                                             atomic/atomicrmw
+                                                             with an equal or
+                                                             wider sync scope
+                                                             and memory ordering
+                                                             stronger than
+                                                             unordered (this is
+                                                             termed the
+                                                             release-fence-paired-atomic
+                                                             ). This satisfies the
+                                                             requirements of
+                                                             release.
+
+                                                         2. buffer_wbinvl1_vol
+
+                                                           - Must happen before
+                                                             any following
+                                                             global/generic
+                                                             load/load
+                                                             atomic/store/store
+                                                             atomic/atomicrmw.
+                                                           - Ensures that
+                                                             following loads
+                                                             will not see stale
+                                                             global data. This
+                                                             satisfies the
+                                                             requirements of
+                                                             acquire.
+
+     **Sequentially Consistent Atomic**
+     -----------------------------------------------------------------------------------
+     load atomic  seq_cst      - singlethread - global   *Same as corresponding
+                               - wavefront    - local    load atomic acquire,
+                                              - generic  except must generate
+                                                         all instructions even
+                                                         for OpenCL.*
+     load atomic  seq_cst      - workgroup    - global   1. s_waitcnt lgkmcnt(0)
+                                              - generic
+                                                           - Must
+                                                             happen after
+                                                             preceding
+                                                             local/generic load
+                                                             atomic/store
+                                                             atomic/atomicrmw
+                                                             with memory
+                                                             ordering of seq_cst
+                                                             and with equal or
+                                                             wider sync scope.
+                                                             (Note that seq_cst
+                                                             fences have their
+                                                             own s_waitcnt
+                                                             lgkmcnt(0) and so do
+                                                             not need to be
+                                                             considered.)
+                                                           - Ensures any
+                                                             preceding
+                                                             sequentially
+                                                             consistent local
+                                                             memory instructions
+                                                             have completed
+                                                             before executing
+                                                             this sequentially
+                                                             consistent
+                                                             instruction. This
+                                                             prevents reordering
+                                                             a seq_cst store
+                                                             followed by a
+                                                             seq_cst load. (Note
+                                                             that seq_cst is
+                                                             stronger than
+                                                             acquire/release as
+                                                             the reordering of
+                                                             load acquire
+                                                             followed by a store
+                                                             release is
+                                                             prevented by the
+                                                             waitcnt of
+                                                             the release, but
+                                                             there is nothing
+                                                             preventing a store
+                                                             release followed by
+                                                             load acquire from
+                                                             completing out of
+                                                             order.)
+
+                                                         2. *Following
+                                                            instructions same as
+                                                            corresponding load
+                                                            atomic acquire,
+                                                            except must generate
+                                                            all instructions even
+                                                            for OpenCL.*
+     load atomic  seq_cst      - workgroup    - local    *Same as corresponding
+                                                         load atomic acquire,
+                                                         except must generate
+                                                         all instructions even
+                                                         for OpenCL.*
+     load atomic  seq_cst      - agent        - global   1. s_waitcnt lgkmcnt(0) &
+                               - system       - generic     vmcnt(0)
+
+                                                           - Could be split into
+                                                             separate s_waitcnt
+                                                             vmcnt(0)
+                                                             and s_waitcnt
+                                                             lgkmcnt(0) to allow
+                                                             them to be
+                                                             independently moved
+                                                             according to the
+                                                             following rules.
+                                                           - waitcnt lgkmcnt(0)
+                                                             must happen after
+                                                             preceding
+                                                             global/generic load
+                                                             atomic/store
+                                                             atomic/atomicrmw
+                                                             with memory
+                                                             ordering of seq_cst
+                                                             and with equal or
+                                                             wider sync scope.
+                                                             (Note that seq_cst
+                                                             fences have their
+                                                             own s_waitcnt
+                                                             lgkmcnt(0) and so do
+                                                             not need to be
+                                                             considered.)
+                                                           - s_waitcnt vmcnt(0)
+                                                             must happen after
+                                                             preceding
+                                                             global/generic load
+                                                             atomic/store
+                                                             atomic/atomicrmw
+                                                             with memory
+                                                             ordering of seq_cst
+                                                             and with equal or
+                                                             wider sync scope.
+                                                             (Note that seq_cst
+                                                             fences have their
+                                                             own s_waitcnt
+                                                             vmcnt(0) and so do
+                                                             not need to be
+                                                             considered.)
+                                                           - Ensures any
+                                                             preceding
+                                                             sequentially
+                                                             consistent global
+                                                             memory instructions
+                                                             have completed
+                                                             before executing
+                                                             this sequentially
+                                                             consistent
+                                                             instruction. This
+                                                             prevents reordering
+                                                             a seq_cst store
+                                                             followed by a
+                                                             seq_cst load. (Note
+                                                             that seq_cst is
+                                                             stronger than
+                                                             acquire/release as
+                                                             the reordering of
+                                                             load acquire
+                                                             followed by a store
+                                                             release is
+                                                             prevented by the
+                                                             waitcnt of
+                                                             the release, but
+                                                             there is nothing
+                                                             preventing a store
+                                                             release followed by
+                                                             load acquire from
+                                                             completing out of
+                                                             order.)
+
+                                                         2. *Following
+                                                            instructions same as
+                                                            corresponding load
+                                                            atomic acquire,
+                                                            except must generate
+                                                            all instructions even
+                                                            for OpenCL.*
+     store atomic seq_cst      - singlethread - global   *Same as corresponding
+                               - wavefront    - local    store atomic release,
+                               - workgroup    - generic  except must generate
+                                                         all instructions even
+                                                         for OpenCL.*
+     store atomic seq_cst      - agent        - global   *Same as corresponding
+                               - system       - generic  store atomic release,
+                                                         except must generate
+                                                         all instructions even
+                                                         for OpenCL.*
+     atomicrmw    seq_cst      - singlethread - global   *Same as corresponding
+                               - wavefront    - local    atomicrmw acq_rel,
+                               - workgroup    - generic  except must generate
+                                                         all instructions even
+                                                         for OpenCL.*
+     atomicrmw    seq_cst      - agent        - global   *Same as corresponding
+                               - system       - generic  atomicrmw acq_rel,
+                                                         except must generate
+                                                         all instructions even
+                                                         for OpenCL.*
+     fence        seq_cst      - singlethread *none*     *Same as corresponding
+                               - wavefront               fence acq_rel,
+                               - workgroup               except must generate
+                               - agent                   all instructions even
+                               - system                  for OpenCL.*
+     ============ ============ ============== ========== ===============================
+
+The memory order also adds the single thread optimization constraints defined in
+table
+:ref:`amdgpu-amdhsa-memory-model-single-thread-optimization-constraints-gfx6-gfx9-table`.
+
+  .. table:: AMDHSA Memory Model Single Thread Optimization Constraints GFX6-GFX9
+     :name: amdgpu-amdhsa-memory-model-single-thread-optimization-constraints-gfx6-gfx9-table
+
+     ============ ==============================================================
+     LLVM Memory  Optimization Constraints
+     Ordering
+     ============ ==============================================================
+     unordered    *none*
+     monotonic    *none*
+     acquire      - If a load atomic/atomicrmw then no following load/load
+                    atomic/store/store atomic/atomicrmw/fence instruction can
+                    be moved before the acquire.
+                  - If a fence then same as load atomic, plus no preceding
+                    associated fence-paired-atomic can be moved after the fence.
+     release      - If a store atomic/atomicrmw then no preceding load/load
+                    atomic/store/store atomic/atomicrmw/fence instruction can
+                    be moved after the release.
+                  - If a fence then same as store atomic, plus no following
+                    associated fence-paired-atomic can be moved before the
+                    fence.
+     acq_rel      Same constraints as both acquire and release.
+     seq_cst      - If a load atomic then same constraints as acquire, plus no
+                    preceding sequentially consistent load atomic/store
+                    atomic/atomicrmw/fence instruction can be moved after the
+                    seq_cst.
+                  - If a store atomic then the same constraints as release, plus
+                    no following sequentially consistent load atomic/store
+                    atomic/atomicrmw/fence instruction can be moved before the
+                    seq_cst.
+                  - If an atomicrmw/fence then same constraints as acq_rel.
+     ============ ==============================================================
 
 Trap Handler ABI
-----------------
-The OS element of the target triple controls the trap handler behavior.
+~~~~~~~~~~~~~~~~
 
-HSA OS
-^^^^^^
-For code objects generated by AMDGPU back-end for the HSA OS, the runtime
-installs a trap handler that supports the s_trap instruction with the following
-usage:
+For code objects generated by AMDGPU backend for HSA [HSA]_ compatible runtimes
+(such as ROCm [AMD-ROCm]_), the runtime installs a trap handler that supports
+the ``s_trap`` instruction with the following usage:
 
- +--------------+-------------+-------------------+----------------------------+
- |Usage         |Code Sequence|Trap Handler Inputs|Description                 |
- +==============+=============+===================+============================+
- |reserved      |s_trap 0x00  |                   |Reserved by hardware.       |
- +--------------+-------------+-------------------+----------------------------+
- |HSA debugtrap |s_trap 0x01  |SGPR0-1: queue_ptr |Reserved for HSA debugtrap  |
- |(arg)         |             |VGPR0: arg         |intrinsic (not implemented).|
- +--------------+-------------+-------------------+----------------------------+
- |llvm.trap     |s_trap 0x02  |SGPR0-1: queue_ptr |Causes dispatch to be       |
- |              |             |                   |terminated and its          |
- |              |             |                   |associated queue put into   |
- |              |             |                   |the error state.            |
- +--------------+-------------+-------------------+----------------------------+
- |llvm.debugtrap| s_trap 0x03 |SGPR0-1: queue_ptr |If debugger not installed   |
- |              |             |                   |handled same as llvm.trap.  |
- +--------------+-------------+-------------------+----------------------------+
- |debugger      |s_trap 0x07  |                   |Reserved for debugger       |
- |breakpoint    |             |                   |breakpoints.                |
- +--------------+-------------+-------------------+----------------------------+
- |debugger      |s_trap 0x08  |                   |Reserved for debugger.      |
- +--------------+-------------+-------------------+----------------------------+
- |debugger      |s_trap 0xfe  |                   |Reserved for debugger.      |
- +--------------+-------------+-------------------+----------------------------+
- |debugger      |s_trap 0xff  |                   |Reserved for debugger.      |
- +--------------+-------------+-------------------+----------------------------+
+  .. table:: AMDGPU Trap Handler for AMDHSA OS
+     :name: amdgpu-trap-handler-for-amdhsa-os-table
 
-Non-HSA OS
-^^^^^^^^^^
-For code objects generated by AMDGPU back-end for non-HSA OS, the runtime does
-not install a trap handler. The llvm.trap and llvm.debugtrap instructions are
-handler as follows:
+     =================== =============== =============== =======================
+     Usage               Code Sequence   Trap Handler    Description
+                                         Inputs
+     =================== =============== =============== =======================
+     reserved            ``s_trap 0x00``                 Reserved by hardware.
+     ``debugtrap(arg)``  ``s_trap 0x01`` ``SGPR0-1``:    Reserved for HSA
+                                           ``queue_ptr`` ``debugtrap``
+                                         ``VGPR0``:      intrinsic (not
+                                           ``arg``       implemented).
+     ``llvm.trap``       ``s_trap 0x02`` ``SGPR0-1``:    Causes dispatch to be
+                                           ``queue_ptr`` terminated and its
+                                                         associated queue put
+                                                         into the error state.
+     ``llvm.debugtrap``  ``s_trap 0x03`` ``SGPR0-1``:    If debugger not
+                                           ``queue_ptr`` installed handled
+                                                         same as ``llvm.trap``.
+     debugger breakpoint ``s_trap 0x07``                 Reserved for debugger
+                                                         breakpoints.
+     debugger            ``s_trap 0x08``                 Reserved for debugger.
+     debugger            ``s_trap 0xfe``                 Reserved for debugger.
+     debugger            ``s_trap 0xff``                 Reserved for debugger.
+     =================== =============== =============== =======================
 
-   =============== ============= ===============================================
-   Usage           Code Sequence Description
-   =============== ============= ===============================================
-   llvm.trap       s_endpgm      Causes wavefront to be terminated.
-   llvm.debugtrap  s_nop         No operation. Compiler warning generated that
-                                 there is no trap handler installed.
-   =============== ============= ===============================================
+Unspecified OS
+--------------
+
+This section provides code conventions used when the target triple OS is
+empty (see :ref:`amdgpu-target-triples`).
+
+Trap Handler ABI
+~~~~~~~~~~~~~~~~
+
+For code objects generated by AMDGPU backend for non-amdhsa OS, the runtime does
+not install a trap handler. The ``llvm.trap`` and ``llvm.debugtrap``
+instructions are handled as follows:
+
+  .. table:: AMDGPU Trap Handler for Non-AMDHSA OS
+     :name: amdgpu-trap-handler-for-non-amdhsa-os-table
+
+     =============== =============== ===========================================
+     Usage           Code Sequence   Description
+     =============== =============== ===========================================
+     llvm.trap       s_endpgm        Causes wavefront to be terminated.
+     llvm.debugtrap  *none*          Compiler warning given that there is no
+                                     trap handler installed.
+     =============== =============== ===========================================
+
+Source Languages
+================
+
+.. _amdgpu-opencl:
+
+OpenCL
+------
+
+When generating code for the OpenCL language the target triple environment
+should be ``opencl`` or ``amdgizcl`` (see :ref:`amdgpu-target-triples`).
+
+When the language is OpenCL the following differences occur:
+
+1. The OpenCL memory model is used (see :ref:`amdgpu-amdhsa-memory-model`).
+2. The AMDGPU backend adds additional arguments to the kernel.
+3. Additional metadata is generated
+   (:ref:`amdgpu-amdhsa-hsa-code-object-metadata`).
+
+.. TODO
+   Specify what effect this has. Hidden arguments added. Additional metadata
+   generated.
+
+.. _amdgpu-hcc:
+
+HCC
+---
+
+When generating code for the HCC language the target triple environment
+should be ``hcc`` (see :ref:`amdgpu-target-triples`).
+
+When the language is HCC the following differences occur:
+
+1. The HSA memory model is used (see :ref:`amdgpu-amdhsa-memory-model`).
+
+.. TODO
+   Specify what effect this has.
 
 Assembler
-=========
+---------
 
 AMDGPU backend has LLVM-MC based assembler which is currently in development.
-It supports Southern Islands ISA, Sea Islands and Volcanic Islands.
+It supports AMDGCN GFX6-GFX8.
 
-This document describes general syntax for instructions and operands. For more
-information about instructions, their semantics and supported combinations
-of operands, refer to one of Instruction Set Architecture manuals.
+This section describes general syntax for instructions and operands. For more
+information about instructions, their semantics and supported combinations of
+operands, refer to one of the instruction set architecture manuals
+[AMD-GCN-GFX6]_, [AMD-GCN-GFX7]_, [AMD-GCN-GFX8]_ and [AMD-GCN-GFX9]_.
 
-An instruction has the following syntax (register operands are
-normally comma-separated while extra operands are space-separated):
+An instruction has the following syntax (register operands are normally
+comma-separated while extra operands are space-separated):
 
 *<opcode> <register_operand0>, ... <extra_operand0> ...*
 
-
 Operands
---------
+~~~~~~~~
 
 The following syntax for register operands is supported:
 
@@ -141,8 +3777,11 @@
   - dst_unused (UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE)
   - abs, neg, sext
 
-DS Instructions Examples
-------------------------
+Instruction Examples
+~~~~~~~~~~~~~~~~~~~~
+
+DS
+++
 
 .. code-block:: nasm
 
@@ -154,8 +3793,8 @@
 
 For full list of supported instructions, refer to "LDS/GDS instructions" in ISA Manual.
 
-FLAT Instruction Examples
---------------------------
+FLAT
+++++
 
 .. code-block:: nasm
 
@@ -167,8 +3806,8 @@
 
 For full list of supported instructions, refer to "FLAT instructions" in ISA Manual.
 
-MUBUF Instruction Examples
----------------------------
+MUBUF
++++++
 
 .. code-block:: nasm
 
@@ -180,8 +3819,8 @@
 
 For full list of supported instructions, refer to "MUBUF Instructions" in ISA Manual.
 
-SMRD/SMEM Instruction Examples
--------------------------------
+SMRD/SMEM
++++++++++
 
 .. code-block:: nasm
 
@@ -193,8 +3832,8 @@
 
 For full list of supported instructions, refer to "Scalar Memory Operations" in ISA Manual.
 
-SOP1 Instruction Examples
---------------------------
+SOP1
+++++
 
 .. code-block:: nasm
 
@@ -208,8 +3847,8 @@
 
 For full list of supported instructions, refer to "SOP1 Instructions" in ISA Manual.
 
-SOP2 Instruction Examples
--------------------------
+SOP2
+++++
 
 .. code-block:: nasm
 
@@ -225,8 +3864,8 @@
 
 For full list of supported instructions, refer to "SOP2 Instructions" in ISA Manual.
 
-SOPC Instruction Examples
---------------------------
+SOPC
+++++
 
 .. code-block:: nasm
 
@@ -237,8 +3876,8 @@
 
 For full list of supported instructions, refer to "SOPC Instructions" in ISA Manual.
 
-SOPP Instruction Examples
---------------------------
+SOPP
+++++
 
 .. code-block:: nasm
 
@@ -260,8 +3899,8 @@
 of SOPP Instructions, so it is up to the programmer to be familiar with the
 range or acceptable values.
 
-Vector ALU Instruction Examples
--------------------------------
+VALU
+++++
 
 For vector ALU instruction opcodes (VOP1, VOP2, VOP3, VOPC, VOP_DPP, VOP_SDWA),
 the assembler will automatically use optimal encoding based on its operands.
@@ -315,19 +3954,20 @@
 For full list of supported instructions, refer to "Vector ALU instructions".
 
 HSA Code Object Directives
---------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 AMDGPU ABI defines auxiliary data in output code object. In assembly source,
 one can specify them with assembler directives.
 
 .hsa_code_object_version major, minor
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++++++++++++++++++++++++++++++++++++++
 
 *major* and *minor* are integers that specify the version of the HSA code
 object that will be generated by the assembler.
 
 .hsa_code_object_isa [major, minor, stepping, vendor, arch]
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
 
 *major*, *minor*, and *stepping* are all integers that describe the instruction
 set architecture (ISA) version of the assembly program.
@@ -339,13 +3979,13 @@
 from the value of the -mcpu option that is passed to the assembler.
 
 .amdgpu_hsa_kernel (name)
-^^^^^^^^^^^^^^^^^^^^^^^^^
++++++++++++++++++++++++++
 
 This directives specifies that the symbol with given name is a kernel entry point
 (label) and the object should contain corresponding symbol of type STT_AMDGPU_HSA_KERNEL.
 
 .amd_kernel_code_t
-^^^^^^^^^^^^^^^^^^
+++++++++++++++++++
 
 This directive marks the beginning of a list of key / value pairs that are used
 to specify the amd_kernel_code_t object that will be emitted by the assembler.
@@ -361,7 +4001,7 @@
 - *kernel_code_entry_byte_offset* defaults to 256.
 - *wavefront_size* defaults to 6.
 - *kernarg_segment_alignment*, *group_segment_alignment*, and
-  *private_segment_alignment* default to 4.  Note that alignments are specified
+  *private_segment_alignment* default to 4. Note that alignments are specified
   as a power of two, so a value of **n** means an alignment of 2^ **n**.
 
 The *.amd_kernel_code_t* directive must be placed immediately after the
@@ -404,3 +4044,26 @@
      s_endpgm
    .Lfunc_end0:
         .size   hello_world, .Lfunc_end0-hello_world
+
+Additional Documentation
+========================
+
+.. [AMD-RADEON-HD-2000-3000] `AMD R6xx shader ISA <http://developer.amd.com/wordpress/media/2012/10/R600_Instruction_Set_Architecture.pdf>`__
+.. [AMD-RADEON-HD-4000] `AMD R7xx shader ISA <http://developer.amd.com/wordpress/media/2012/10/R700-Family_Instruction_Set_Architecture.pdf>`__
+.. [AMD-RADEON-HD-5000] `AMD Evergreen shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_Evergreen-Family_Instruction_Set_Architecture.pdf>`__
+.. [AMD-RADEON-HD-6000] `AMD Cayman/Trinity shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_HD_6900_Series_Instruction_Set_Architecture.pdf>`__
+.. [AMD-GCN-GFX6] `AMD Southern Islands Series ISA <http://developer.amd.com/wordpress/media/2012/12/AMD_Southern_Islands_Instruction_Set_Architecture.pdf>`__
+.. [AMD-GCN-GFX7] `AMD Sea Islands Series ISA <http://developer.amd.com/wordpress/media/2013/07/AMD_Sea_Islands_Instruction_Set_Architecture.pdf>`__
+.. [AMD-GCN-GFX8] `AMD GCN3 Instruction Set Architecture <http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf>`__
+.. [AMD-GCN-GFX9] `AMD "Vega" Instruction Set Architecture <http://developer.amd.com/wordpress/media/2013/12/Vega_Shader_ISA_28July2017.pdf>`__
+.. [AMD-OpenCL_Programming-Guide] `AMD Accelerated Parallel Processing OpenCL Programming Guide <http://developer.amd.com/download/AMD_Accelerated_Parallel_Processing_OpenCL_Programming_Guide.pdf>`__
+.. [AMD-APP-SDK] `AMD Accelerated Parallel Processing APP SDK Documentation <http://developer.amd.com/tools/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/documentation/>`__
+.. [AMD-ROCm] `ROCm: Open Platform for Development, Discovery and Education Around GPU Computing <http://gpuopen.com/compute-product/rocm/>`__
+.. [AMD-ROCm-github] `ROCm github <http://github.com/RadeonOpenCompute>`__
+.. [HSA] `Heterogeneous System Architecture (HSA) Foundation <http://www.hsafoundation.com/>`__
+.. [ELF] `Executable and Linkable Format (ELF) <http://www.sco.com/developers/gabi/>`__
+.. [DWARF] `DWARF Debugging Information Format <http://dwarfstd.org/>`__
+.. [YAML] `YAML Ain't Markup Language (YAML™) Version 1.2 <http://www.yaml.org/spec/1.2/spec.html>`__
+.. [OpenCL] `The OpenCL Specification Version 2.0 <http://www.khronos.org/registry/cl/specs/opencl-2.0.pdf>`__
+.. [HRF] `Heterogeneous-race-free Memory Models <http://benedictgaster.org/wp-content/uploads/2014/01/asplos269-FINAL.pdf>`__
+.. [AMD-AMDGPU-Compute-Application-Binary-Interface] `AMDGPU Compute Application Binary Interface <https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md>`__
diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst
index 02b749f..0a5cb00 100644
--- a/docs/AliasAnalysis.rst
+++ b/docs/AliasAnalysis.rst
@@ -132,11 +132,12 @@
 same object.
 
 The ``PartialAlias`` response is used when the two memory objects are known to
-be overlapping in some way, but do not start at the same address.
+be overlapping in some way, regardless of whether they start at the same
+address or not.
 
 The ``MustAlias`` response may only be returned if the two memory objects are
 guaranteed to always start at exactly the same location. A ``MustAlias``
-response implies that the pointers compare equal.
+response does not imply that the pointers compare equal.
 
 The ``getModRefInfo`` methods
 -----------------------------
diff --git a/docs/Benchmarking.rst b/docs/Benchmarking.rst
new file mode 100644
index 0000000..0f88db7
--- /dev/null
+++ b/docs/Benchmarking.rst
@@ -0,0 +1,87 @@
+==================================
+Benchmarking tips
+==================================
+
+
+Introduction
+============
+
+For benchmarking a patch we want to reduce all possible sources of
+noise as much as possible. How to do that is very OS dependent.
+
+Note that low noise is required, but not sufficient. It does not
+exclude measurement bias. See
+https://www.cis.upenn.edu/~cis501/papers/producing-wrong-data.pdf for
+example.
+
+General
+================================
+
+* Use a high resolution timer, e.g. perf under linux.
+
+* Run the benchmark multiple times to be able to recognize noise.
+
+* Disable as many processes or services as possible on the target system.
+
+* Disable frequency scaling, turbo boost and address space
+  randomization (see OS specific section).
+
+* Static link if the OS supports it. That avoids any variation that
+  might be introduced by loading dynamic libraries. This can be done
+  by passing ``-DLLVM_BUILD_STATIC=ON`` to cmake.
+
+* Try to avoid storage. On some systems you can use tmpfs. Putting the
+  program, inputs and outputs on tmpfs avoids touching a real storage
+  system, which can have a pretty big variability.
+
+  To mount it (on linux and freebsd at least)::
+
+    mount -t tmpfs -o size=<XX>g none dir_to_mount
+
+Linux
+=====
+
+* Disable address space randomization::
+
+    echo 0 > /proc/sys/kernel/randomize_va_space
+
+* Set scaling_governor to performance::
+
+   for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+   do
+     echo performance > "$i"
+   done
+
+* Use https://github.com/lpechacek/cpuset to reserve cpus for just the
+  program you are benchmarking. If using perf, leave at least 2 cores
+  so that perf runs in one and your program in another::
+
+    cset shield -c N1,N2 -k on
+
+  This will move all threads out of N1 and N2. The ``-k on`` means
+  that even kernel threads are moved out.
+
+* Disable the SMT pair of the cpus you will use for the benchmark. The
+  pair of cpu N can be found in
+  ``/sys/devices/system/cpu/cpuN/topology/thread_siblings_list`` and
+  disabled with::
+
+    echo 0 > /sys/devices/system/cpu/cpuX/online
+
+
+* Run the program with::
+
+    cset shield --exec -- perf stat -r 10 <cmd>
+
+  This will run the command after ``--`` in the isolated cpus. The
+  particular perf command runs the ``<cmd>`` 10 times and reports
+  statistics.
+
+With these in place you can expect perf variations of less than 0.1%.
+
+Linux Intel
+-----------
+
+* Disable turbo mode::
+
+    echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst
index 3c9aa10..98a3156 100644
--- a/docs/BitCodeFormat.rst
+++ b/docs/BitCodeFormat.rst
@@ -550,6 +550,8 @@
 
 * 17 --- `TYPE_BLOCK`_ --- This describes all of the types in the module.
 
+* 23 --- `STRTAB_BLOCK`_ --- The bitcode file's string table.
+
 .. _MODULE_BLOCK:
 
 MODULE_BLOCK Contents
@@ -577,7 +579,7 @@
 ``[VERSION, version#]``
 
 The ``VERSION`` record (code 1) contains a single value indicating the format
-version. Versions 0 and 1 are supported at this time. The difference between
+version. Versions 0, 1 and 2 are supported at this time. The difference between
 version 0 and 1 is in the encoding of instruction operands in
 each `FUNCTION_BLOCK`_.
 
@@ -620,6 +622,12 @@
 case of phi instructions. For phi instructions, operands are encoded as
 `Signed VBRs`_ to deal with forward references.
 
+In version 2, the meaning of module records ``FUNCTION``, ``GLOBALVAR``,
+``ALIAS``, ``IFUNC`` and ``COMDAT`` changes such that the first two operands
+specify an offset and size of a string in a string table (see `STRTAB_BLOCK
+Contents`_), the function name is removed from the ``FNENTRY`` record in the
+value symbol table, and the top-level ``VALUE_SYMTAB_BLOCK`` may only contain
+``FNENTRY`` records.
 
 MODULE_CODE_TRIPLE Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -673,11 +681,14 @@
 MODULE_CODE_GLOBALVAR Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-``[GLOBALVAR, pointer type, isconst, initid, linkage, alignment, section, visibility, threadlocal, unnamed_addr, externally_initialized, dllstorageclass, comdat]``
+``[GLOBALVAR, strtab offset, strtab size, pointer type, isconst, initid, linkage, alignment, section, visibility, threadlocal, unnamed_addr, externally_initialized, dllstorageclass, comdat, attributes, preemptionspecifier]``
 
 The ``GLOBALVAR`` record (code 7) marks the declaration or definition of a
 global variable. The operand fields are:
 
+* *strtab offset*, *strtab size*: Specifies the name of the global variable.
+  See `STRTAB_BLOCK Contents`_.
+
 * *pointer type*: The type index of the pointer type used to point to this
   global variable
 
@@ -750,16 +761,28 @@
 
 * *comdat*: An encoding of the COMDAT of this function
 
+* *attributes*: If nonzero, the 1-based index into the table of AttributeLists.
+
+.. _bcpreemptionspecifier:
+
+* *preemptionspecifier*: If present, an encoding of the runtime preemption specifier of this variable:
+
+  * ``dso_preemptable``: code 0
+  * ``dso_local``: code 1
+
 .. _FUNCTION:
 
 MODULE_CODE_FUNCTION Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-``[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc, prologuedata, dllstorageclass, comdat, prefixdata, personalityfn]``
+``[FUNCTION, strtab offset, strtab size, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc, prologuedata, dllstorageclass, comdat, prefixdata, personalityfn, preemptionspecifier]``
 
 The ``FUNCTION`` record (code 8) marks the declaration or definition of a
 function. The operand fields are:
 
+* *strtab offset*, *strtab size*: Specifies the name of the function.
+  See `STRTAB_BLOCK Contents`_.
+
 * *type*: The type index of the function type describing this function
 
 * *callingconv*: The calling convention number:
@@ -814,14 +837,19 @@
 * *personalityfn*: If non-zero, the value index of the personality function for this function,
   plus 1.
 
+* *preemptionspecifier*: If present, an encoding of the :ref:`runtime preemption specifier<bcpreemptionspecifier>` of this function.
+ 
 MODULE_CODE_ALIAS Record
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-``[ALIAS, alias type, aliasee val#, linkage, visibility, dllstorageclass, threadlocal, unnamed_addr]``
+``[ALIAS, strtab offset, strtab size, alias type, aliasee val#, linkage, visibility, dllstorageclass, threadlocal, unnamed_addr, preemptionspecifier]``
 
 The ``ALIAS`` record (code 9) marks the definition of an alias. The operand
 fields are
 
+* *strtab offset*, *strtab size*: Specifies the name of the alias.
+  See `STRTAB_BLOCK Contents`_.
+
 * *alias type*: The type index of the alias
 
 * *aliasee val#*: The value index of the aliased value
@@ -839,15 +867,7 @@
 * *unnamed_addr*: If present, an encoding of the
   :ref:`unnamed_addr<bcunnamedaddr>` attribute of this alias
 
-MODULE_CODE_PURGEVALS Record
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-``[PURGEVALS, numvals]``
-
-The ``PURGEVALS`` record (code 10) resets the module-level value list to the
-size given by the single operand value. Module-level value list items are added
-by ``GLOBALVAR``, ``FUNCTION``, and ``ALIAS`` records.  After a ``PURGEVALS``
-record is seen, new value indices will start from the given *numvals* value.
+* *preemptionspecifier*: If present, an encoding of the :ref:`runtime preemption specifier<bcpreemptionspecifier>` of this alias.
 
 .. _MODULE_CODE_GCNAME:
 
@@ -1310,3 +1330,20 @@
 ----------------------------
 
 The ``METADATA_ATTACHMENT`` block (id 16) ...
+
+.. _STRTAB_BLOCK:
+
+STRTAB_BLOCK Contents
+---------------------
+
+The ``STRTAB`` block (id 23) contains a single record (``STRTAB_BLOB``, id 1)
+with a single blob operand containing the bitcode file's string table.
+
+Strings in the string table are not null terminated. A record's *strtab
+offset* and *strtab size* operands specify the byte offset and size of a
+string within the string table.
+
+The string table is used by all preceding blocks in the bitcode file that are
+not succeeded by another intervening ``STRTAB`` block. Normally a bitcode
+file will have a single string table, but it may have more than one if it
+was created by binary concatenation of multiple bitcode files.
diff --git a/docs/BranchWeightMetadata.rst b/docs/BranchWeightMetadata.rst
index b941d0d..9bd8bd4 100644
--- a/docs/BranchWeightMetadata.rst
+++ b/docs/BranchWeightMetadata.rst
@@ -64,6 +64,20 @@
     [ , i32 <LABEL_BRANCH_WEIGHT> ... ]
   }
 
+``CallInst``
+^^^^^^^^^^^^^^^^^^
+
+Calls may have branch weight metadata, containing the execution count of
+the call. It is currently used in SamplePGO mode only, to augment the
+block and entry counts which may not be accurate with sampling.
+
+.. code-block:: none
+
+  !0 = metadata !{
+    metadata !"branch_weights",
+    i32 <CALL_BRANCH_WEIGHT>
+  }
+
 Other
 ^^^^^
 
diff --git a/docs/CFIVerify.rst b/docs/CFIVerify.rst
new file mode 100644
index 0000000..7424d01
--- /dev/null
+++ b/docs/CFIVerify.rst
@@ -0,0 +1,91 @@
+==============================================
+Control Flow Verification Tool Design Document
+==============================================
+
+.. contents::
+   :local:
+
+Objective
+=========
+
+This document provides an overview of an external tool to verify the protection
+mechanisms implemented by Clang's *Control Flow Integrity* (CFI) schemes
+(``-fsanitize=cfi``). This tool, provided a binary or DSO, should infer whether
+indirect control flow operations are protected by CFI, and should output these
+results in a human-readable form.
+
+This tool should also be added as part of Clang's continuous integration testing
+framework, where modifications to the compiler ensure that CFI protection
+schemes are still present in the final binary.
+
+Location
+========
+
+This tool will be present as a part of the LLVM toolchain, and will reside in
+the "/llvm/tools/llvm-cfi-verify" directory, relative to the LLVM trunk. It will
+be tested in two methods:
+
+- Unit tests to validate code sections, present in
+  "/llvm/unittests/llvm-cfi-verify".
+- Integration tests, present in "/llvm/tools/clang/test/LLVMCFIVerify". These
+  integration tests are part of clang as part of a continuous integration
+  framework, ensuring updates to the compiler that reduce CFI coverage on
+  indirect control flow instructions are identified.
+
+Background
+==========
+
+This tool will continuously validate that CFI directives are properly
+implemented around all indirect control flows by analysing the output machine
+code. The analysis of machine code is important as it ensures that any bugs
+present in linker or compiler do not subvert CFI protections in the final
+shipped binary.
+
+Unprotected indirect control flow instructions will be flagged for manual
+review. These unexpected control flows may simply have not been accounted for in
+the compiler implementation of CFI (e.g. indirect jumps to facilitate switch
+statements may not be fully protected).
+
+It may be possible in the future to extend this tool to flag unnecessary CFI
+directives (e.g. CFI directives around a static call to a non-polymorphic base
+type). This type of directive has no security implications, but may present
+performance impacts.
+
+Design Ideas
+============
+
+This tool will disassemble binaries and DSOs from their machine code format and
+analyse the disassembled machine code. The tool will inspect virtual calls and
+indirect function calls. This tool will also inspect indirect jumps, as inlined
+functions and jump tables should also be subject to CFI protections. Non-virtual
+calls (``-fsanitize=cfi-nvcall``) and cast checks (``-fsanitize=cfi-*cast*``)
+are not implemented due to a lack of information provided by the bytecode.
+
+The tool would operate by searching for indirect control flow instructions in
+the disassembly. A control flow graph would be generated from a small buffer of
+the instructions surrounding the 'target' control flow instruction. If the
+target instruction is branched-to, the fallthrough of the branch should be the
+CFI trap (on x86, this is a ``ud2`` instruction). If the target instruction is
+the fallthrough (i.e. immediately succeeds) of a conditional jump, the
+conditional jump target should be the CFI trap. If an indirect control flow
+instruction does not conform to one of these formats, the target will be noted
+as being CFI-unprotected.
+
+Note that in the second case outlined above (where the target instruction is the
+fallthrough of a conditional jump), if the target represents a vcall that takes
+arguments, these arguments may be pushed to the stack after the branch but
+before the target instruction. In these cases, a secondary 'spill graph' is
+constructed, to ensure the register argument used by the indirect jump/call is
+not spilled from the stack at any point in the interim period. If there are no
+spills that affect the target register, the target is marked as CFI-protected.
+
+Other Design Notes
+~~~~~~~~~~~~~~~~~~
+
+Only machine code sections that are marked as executable will be subject to this
+analysis. Non-executable sections do not require analysis as any execution
+present in these sections has already violated the control flow integrity.
+
+Suitable extensions may be made at a later date to include analysis for indirect
+control flow operations across DSO boundaries. Currently, these CFI features are
+only experimental with an unstable ABI, making them unsuitable for analysis.
diff --git a/docs/CMake.rst b/docs/CMake.rst
index 0a32d39..473672b 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -186,8 +186,8 @@
   Sets the build type for ``make``-based generators. Possible values are
   Release, Debug, RelWithDebInfo and MinSizeRel. If you are using an IDE such as
   Visual Studio, you should use the IDE settings to set the build type.
-  Be aware that Release and RelWithDebInfo are not using the same optimization
-  level on most platform.
+  Be aware that Release and RelWithDebInfo use different optimization levels on
+  most platforms.
 
 **CMAKE_INSTALL_PREFIX**:PATH
   Path where LLVM will be installed if "make install" is invoked or the
@@ -247,9 +247,11 @@
   tests.
 
 **LLVM_APPEND_VC_REV**:BOOL
-  Append version control revision info (svn revision number or Git revision id)
-  to LLVM version string (stored in the PACKAGE_VERSION macro). For this to work
-  cmake must be invoked before the build. Defaults to OFF.
+  Embed version control revision info (svn revision number or Git revision id).
+  The version info is provided by the ``LLVM_REVISION`` macro in
+  ``llvm/include/llvm/Support/VCSRevision.h``. Developers using git who don't
+  need revision info can disable this option to avoid re-linking most binaries
+  after a branch switch. Defaults to ON.
 
 **LLVM_ENABLE_THREADS**:BOOL
   Build with threads support, if available. Defaults to ON.
@@ -535,6 +537,16 @@
   during the build. Enabling this option can significantly speed up build times
   especially when building LLVM in Debug configurations.
 
+**LLVM_REVERSE_ITERATION**:BOOL
+  If enabled, all supported unordered llvm containers would be iterated in
+  reverse order. This is useful for uncovering non-determinism caused by
+  iteration of unordered containers.
+
+**LLVM_BUILD_INSTRUMENTED_COVERAGE**:BOOL
+  If enabled, `source-based code coverage
+  <http://clang.llvm.org/docs/SourceBasedCodeCoverage.html>`_ instrumentation
+  is enabled while building llvm.
+
 CMake Caches
 ============
 
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index ad2178d..f1f93c7 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -1,8 +1,8 @@
 
 if (DOXYGEN_FOUND)
 if (LLVM_ENABLE_DOXYGEN)
-  set(abs_top_srcdir ${LLVM_MAIN_SRC_DIR})
-  set(abs_top_builddir ${LLVM_BINARY_DIR})
+  set(abs_top_srcdir ${CMAKE_CURRENT_SOURCE_DIR})
+  set(abs_top_builddir ${CMAKE_CURRENT_BINARY_DIR})
   
   if (HAVE_DOT)
     set(DOT ${LLVM_PATH_DOT})
@@ -103,8 +103,8 @@
 endif()
 
 if (LLVM_ENABLE_SPHINX)
+  include(AddSphinxTarget)
   if (SPHINX_FOUND)
-    include(AddSphinxTarget)
     if (${SPHINX_OUTPUT_HTML})
       add_sphinx_target(html llvm)
     endif()
@@ -112,6 +112,7 @@
 
     if (${SPHINX_OUTPUT_MAN})
       add_sphinx_target(man llvm)
+      add_sphinx_target(man llvm-dwarfdump)
     endif()
 
   endif()
diff --git a/docs/CMakePrimer.rst b/docs/CMakePrimer.rst
index 1e3a09e..72ebffa 100644
--- a/docs/CMakePrimer.rst
+++ b/docs/CMakePrimer.rst
@@ -112,33 +112,6 @@
 targeting an Apple platform. For all other targets the ``extra_sources`` will be
 evaluated as empty before add_executable is given its arguments.
 
-One big "Gotcha" with variable dereferencing is that ``if`` commands implicitly
-dereference values. This has some unexpected results. For example:
-
-.. code-block:: cmake
-
-   if("${SOME_VAR}" STREQUAL "MSVC")
-
-In this code sample MSVC will be implicitly dereferenced, which will result in
-the if command comparing the value of the dereferenced variables ``SOME_VAR``
-and ``MSVC``. A common workaround to this solution is to prepend strings being
-compared with an ``x``.
-
-.. code-block:: cmake
-
-   if("x${SOME_VAR}" STREQUAL "xMSVC")
-
-This works because while ``MSVC`` is a defined variable, ``xMSVC`` is not. This
-pattern is uncommon, but it does occur in LLVM's CMake scripts.
-
-.. note::
-   
-   Once the LLVM project upgrades its minimum CMake version to 3.1 or later we
-   can prevent this behavior by setting CMP0054 to new. For more information on
-   CMake policies please see the cmake-policies manpage or the `cmake-policies
-   online documentation
-   <https://cmake.org/cmake/help/v3.4/manual/cmake-policies.7.html>`_.
-
 Lists
 -----
 
@@ -194,10 +167,9 @@
 
 Variables that are cached or specified on the command line can have types
 associated with them. The variable's type is used by CMake's UI tool to display
-the right input field. The variable's type generally doesn't impact evaluation.
-One of the few examples is PATH variables, which CMake does have some special
-handling for. You can read more about the special handling in `CMake's set
-documentation
+the right input field. A variable's type generally doesn't impact evaluation,
+however CMake does have special handling for some variables such as PATH.
+You can read more about the special handling in `CMake's set documentation
 <https://cmake.org/cmake/help/v3.5/command/set.html#set-cache-entry>`_.
 
 Scope
@@ -230,7 +202,7 @@
 ============
 
 CMake features the same basic control flow constructs you would expect in any
-scripting language, but there are a few quarks because, as with everything in
+scripting language, but there are a few quirks because, as with everything in
 CMake, control flow constructs are commands.
 
 If, ElseIf, Else
@@ -361,21 +333,23 @@
 in this section will all use the CMake ``function`` block, but this all applies
 to the ``macro`` block as well.
 
-CMake commands can have named arguments, but all commands are implicitly
-variable argument. If the command has named arguments they are required and must
-be specified at every call site. Below is a trivial example of providing a
-wrapper function for CMake's built in function ``add_dependencies``.
+CMake commands can have named arguments that are required at every call site. In
+addition, all commands will implicitly accept a variable number of extra
+arguments (In C parlance, all commands are varargs functions). When a command is
+invoked with extra arguments (beyond the named ones) CMake will store the full
+list of arguments (both named and unnamed) in a list named ``ARGV``, and the
+sublist of unnamed arguments in ``ARGN``. Below is a trivial example of
+providing a wrapper function for CMake's built in function ``add_dependencies``.
 
 .. code-block:: cmake
 
    function(add_deps target)
-     add_dependencies(${target} ${ARGV})
+     add_dependencies(${target} ${ARGN})
    endfunction()
 
 This example defines a new macro named ``add_deps`` which takes a required first
 argument, and just calls another function passing through the first argument and
-all trailing arguments. When variable arguments are present CMake defines them
-in a list named ``ARGV``, and the count of the arguments is defined in ``ARGN``.
+all trailing arguments.
 
 CMake provides a module ``CMakeParseArguments`` which provides an implementation
 of advanced argument parsing. We use this all over LLVM, and it is recommended
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
index 106fc84..bcdc722 100644
--- a/docs/CodeGenerator.rst
+++ b/docs/CodeGenerator.rst
@@ -2642,59 +2642,6 @@
 The AMDGPU backend
 ------------------
 
-The AMDGPU code generator lives in the lib/Target/AMDGPU directory, and is an
-open source native AMD GCN ISA code generator.
-
-Target triples supported
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-The following are the known target triples that are supported by the AMDGPU
-backend.
-
-* **amdgcn--** --- AMD GCN GPUs (AMDGPU.7.0.0+)
-* **amdgcn--amdhsa** --- AMD GCN GPUs (AMDGPU.7.0.0+) with HSA support
-* **r600--** --- AMD GPUs HD2XXX-HD6XXX
-
-Relocations
-^^^^^^^^^^^
-
-Supported relocatable fields are:
-
-* **word32** --- This specifies a 32-bit field occupying 4 bytes with arbitrary
-  byte alignment. These values use the same byte order as other word values in
-  the AMD GPU architecture
-* **word64** --- This specifies a 64-bit field occupying 8 bytes with arbitrary
-  byte alignment. These values use the same byte order as other word values in
-  the AMD GPU architecture
-
-Following notations are used for specifying relocation calculations:
-
-* **A** --- Represents the addend used to compute the value of the relocatable
-  field
-* **G** --- Represents the offset into the global offset table at which the
-  relocation entry’s symbol will reside during execution.
-* **GOT** --- Represents the address of the global offset table.
-* **P** --- Represents the place (section offset or address) of the storage unit
-  being relocated (computed using ``r_offset``)
-* **S** --- Represents the value of the symbol whose index resides in the
-  relocation entry
-
-AMDGPU Backend generates *Elf64_Rela* relocation records with the following
-supported relocation types:
-
-  ==========================  =====  ==========  ==============================
-  Relocation type             Value  Field       Calculation
-  ==========================  =====  ==========  ==============================
-  ``R_AMDGPU_NONE``           0      ``none``    ``none``
-  ``R_AMDGPU_ABS32_LO``       1      ``word32``  (S + A) & 0xFFFFFFFF
-  ``R_AMDGPU_ABS32_HI``       2      ``word32``  (S + A) >> 32
-  ``R_AMDGPU_ABS64``          3      ``word64``  S + A
-  ``R_AMDGPU_REL32``          4      ``word32``  S + A - P
-  ``R_AMDGPU_REL64``          5      ``word64``  S + A - P
-  ``R_AMDGPU_ABS32``          6      ``word32``  S + A
-  ``R_AMDGPU_GOTPCREL``       7      ``word32``  G + GOT + A - P
-  ``R_AMDGPU_GOTPCREL32_LO``  8      ``word32``  (G + GOT + A - P) & 0xFFFFFFFF
-  ``R_AMDGPU_GOTPCREL32_HI``  9      ``word32``  (G + GOT + A - P) >> 32
-  ``R_AMDGPU_REL32_LO``       10     ``word32``  (S + A - P) & 0xFFFFFFFF
-  ``R_AMDGPU_REL32_HI``       11     ``word32``  (S + A - P) >> 32
-  ==========================  =====  ==========  ==============================
+The AMDGPU code generator lives in the ``lib/Target/AMDGPU``
+directory. This code generator is capable of targeting a variety of
+AMD GPU processors. Refer to :doc:`AMDGPUUsage` for more information.
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index 722718b..0de2fc6 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -34,10 +34,10 @@
 (e.g. the naming convention).  This is because they are relatively new, and a
 lot of code was written before they were put in place.  Our long term goal is
 for the entire codebase to follow the convention, but we explicitly *do not*
-want patches that do large-scale reformating of existing code.  On the other
+want patches that do large-scale reformatting of existing code.  On the other
 hand, it is reasonable to rename the methods of a class if you're about to
-change it in some other way.  Just do the reformating as a separate commit from
-the functionality change.
+change it in some other way.  Just do the reformatting as a separate commit
+from the functionality change.
   
 The ultimate goal of these guidelines is to increase the readability and
 maintainability of our common source base. If you have suggestions for topics to
@@ -811,6 +811,21 @@
   for (const auto *Ptr : Container) { observe(*Ptr); }
   for (auto *Ptr : Container) { Ptr->change(); }
 
+Beware of non-determinism due to ordering of pointers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In general, there is no relative ordering among pointers. As a result,
+when unordered containers like sets and maps are used with pointer keys
+the iteration order is undefined. Hence, iterating such containers may
+result in non-deterministic code generation. While the generated code
+might not necessarily be "wrong code", this non-determinism might result
+in unexpected runtime crashes or simply hard to reproduce bugs on the
+customer side making it harder to debug and fix.
+
+As a rule of thumb, in case an ordered result is expected, remember to
+sort an unordered container before iteration. Or use ordered containers
+like vector/MapVector/SetVector if you want to iterate pointer keys.
+
 Style Issues
 ============
 
@@ -941,8 +956,8 @@
 
 .. code-block:: c++
 
-  for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
-    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(II)) {
+  for (Instruction &I : BB) {
+    if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
       Value *LHS = BO->getOperand(0);
       Value *RHS = BO->getOperand(1);
       if (LHS != RHS) {
@@ -961,8 +976,8 @@
 
 .. code-block:: c++
 
-  for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
-    BinaryOperator *BO = dyn_cast<BinaryOperator>(II);
+  for (Instruction &I : BB) {
+    auto *BO = dyn_cast<BinaryOperator>(&I);
     if (!BO) continue;
 
     Value *LHS = BO->getOperand(0);
@@ -1232,6 +1247,12 @@
 code for this branch. If the compiler does not support this, it will fall back
 to the "abort" implementation.
 
+Neither assertions nor ``llvm_unreachable`` will abort the program on a release
+build. If the error condition can be triggered by user input then the
+recoverable error mechanism described in :doc:`ProgrammersManual` should be
+used instead. In cases where this is not practical, ``report_fatal_error`` may
+be used.
+
 Another issue is that values used only by assertions will produce an "unused
 value" warning when assertions are disabled.  For example, this code will warn:
 
@@ -1316,19 +1337,31 @@
 individual enumerators. To suppress this warning, use ``llvm_unreachable`` after
 the switch.
 
-Don't evaluate ``end()`` every time through a loop
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Use range-based ``for`` loops wherever possible
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Because C++ doesn't have a standard "``foreach``" loop (though it can be
-emulated with macros and may be coming in C++'0x) we end up writing a lot of
-loops that manually iterate from begin to end on a variety of containers or
-through other data structures.  One common mistake is to write a loop in this
-style:
+The introduction of range-based ``for`` loops in C++11 means that explicit
+manipulation of iterators is rarely necessary. We use range-based ``for``
+loops wherever possible for all newly added code. For example:
 
 .. code-block:: c++
 
   BasicBlock *BB = ...
-  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+  for (Instruction &I : *BB)
+    ... use I ...
+
+Don't evaluate ``end()`` every time through a loop
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In cases where range-based ``for`` loops can't be used and it is necessary
+to write an explicit iterator-based loop, pay close attention to whether
+``end()`` is re-evaluated on each loop iteration. One common mistake is to
+write a loop in this style:
+
+.. code-block:: c++
+
+  BasicBlock *BB = ...
+  for (auto I = BB->begin(); I != BB->end(); ++I)
     ... use I ...
 
 The problem with this construct is that it evaluates "``BB->end()``" every time
@@ -1339,7 +1372,7 @@
 .. code-block:: c++
 
   BasicBlock *BB = ...
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I)
     ... use I ...
 
 The observant may quickly point out that these two loops may have different
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 8830c39..44cc57c 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -397,10 +397,11 @@
 For most uses of FileCheck, fixed string matching is perfectly sufficient.  For
 some things, a more flexible form of matching is desired.  To support this,
 FileCheck allows you to specify regular expressions in matching strings,
-surrounded by double braces: ``{{yourregex}}``.  Because we want to use fixed
-string matching for a majority of what we do, FileCheck has been designed to
-support mixing and matching fixed string matching with regular expressions.
-This allows you to write things like this:
+surrounded by double braces: ``{{yourregex}}``. FileCheck implements a POSIX
+regular expression matcher; it supports Extended POSIX regular expressions
+(ERE). Because we want to use fixed string matching for a majority of what we
+do, FileCheck has been designed to support mixing and matching fixed string
+matching with regular expressions.  This allows you to write things like this:
 
 .. code-block:: llvm
 
@@ -434,7 +435,7 @@
 variable ``REGISTER``.  The second line verifies that whatever is in
 ``REGISTER`` occurs later in the file after an "``andw``".  :program:`FileCheck`
 variable references are always contained in ``[[ ]]`` pairs, and their names can
-be formed with the regex ``[a-zA-Z][a-zA-Z0-9]*``.  If a colon follows the name,
+be formed with the regex ``[a-zA-Z_][a-zA-Z0-9_]*``.  If a colon follows the name,
 then it is a definition of the variable; otherwise, it is a use.
 
 :program:`FileCheck` variables can be defined multiple times, and uses always
diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst
index 46db57f..5a0a98c 100644
--- a/docs/CommandGuide/index.rst
+++ b/docs/CommandGuide/index.rst
@@ -51,4 +51,5 @@
    tblgen
    lit
    llvm-build
+   llvm-pdbutil
    llvm-readobj
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index b8299d4..fbe1a9a 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst
@@ -80,6 +80,13 @@
  Show more information on test failures, for example the entire test output
  instead of just the test result.
 
+.. option:: -vv, --echo-all-commands
+
+ Echo all commands to stdout, as they are being executed.
+ This can be valuable for debugging test failures, as the last echoed command
+ will be the one which has failed.
+ This option implies ``--verbose``.
+
 .. option:: -a, --show-all
 
  Show more information about all tests, for example the entire test
@@ -169,6 +176,13 @@
  must be in the range ``1..M``. The environment variable
  ``LIT_RUN_SHARD`` can also be used in place of this option.
 
+.. option:: --filter=REGEXP
+
+  Run only those tests whose name matches the regular expression specified in
+  ``REGEXP``. The environment variable ``LIT_FILTER`` can also be used in place
+  of this option, which is especially useful in environments where the call
+  to ``lit`` is issued indirectly.
+
 ADDITIONAL OPTIONS
 ------------------
 
diff --git a/docs/CommandGuide/llvm-cov.rst b/docs/CommandGuide/llvm-cov.rst
index ea2e625..6ee05ee 100644
--- a/docs/CommandGuide/llvm-cov.rst
+++ b/docs/CommandGuide/llvm-cov.rst
@@ -195,44 +195,53 @@
 
 .. option:: -show-line-counts
 
- Show the execution counts for each line. This is enabled by default, unless
- another ``-show`` option is used.
+ Show the execution counts for each line. Defaults to true, unless another
+ ``-show`` option is used.
 
 .. option:: -show-expansions
 
  Expand inclusions, such as preprocessor macros or textual inclusions, inline
- in the display of the source file.
+ in the display of the source file. Defaults to false.
 
 .. option:: -show-instantiations
 
  For source regions that are instantiated multiple times, such as templates in
  ``C++``, show each instantiation separately as well as the combined summary.
+ Defaults to true.
 
 .. option:: -show-regions
 
  Show the execution counts for each region by displaying a caret that points to
- the character where the region starts.
+ the character where the region starts. Defaults to false.
 
 .. option:: -show-line-counts-or-regions
 
  Show the execution counts for each line if there is only one region on the
  line, but show the individual regions if there are multiple on the line.
+ Defaults to false.
 
-.. option:: -use-color[=VALUE]
+.. option:: -use-color
 
  Enable or disable color output. By default this is autodetected.
 
-.. option:: -arch=<name>
+.. option:: -arch=[*NAMES*]
 
- If the covered binary is a universal binary, select the architecture to use.
- It is an error to specify an architecture that is not included in the
- universal binary or to use an architecture that does not match a
- non-universal binary.
+ Specify a list of architectures such that the Nth entry in the list
+ corresponds to the Nth specified binary. If the covered object is a universal
+ binary, this specifies the architecture to use. It is an error to specify an
+ architecture that is not included in the universal binary or to use an
+ architecture that does not match a non-universal binary.
 
 .. option:: -name=<NAME>
 
  Show code coverage only for functions with the given name.
 
+.. option:: -name-whitelist=<FILE>
+
+ Show code coverage only for functions listed in the given file. Each line in
+ the file should start with `whitelist_fun:`, immediately followed by the name
+ of the function to accept. This name can be a wildcard expression.
+
 .. option:: -name-regex=<PATTERN>
 
  Show code coverage only for functions that match the given regular expression.
@@ -262,6 +271,12 @@
  The demangler is expected to read a newline-separated list of symbols from
  stdin and write a newline-separated list of the same length to stdout.
 
+.. option:: -num-threads=N, -j=N
+
+ Use N threads to write file reports (only applicable when -output-dir is
+ specified). When N=0, llvm-cov auto-detects an appropriate number of threads to
+ use. This is the default.
+
 .. option:: -line-coverage-gt=<N>
 
  Show code coverage only for functions with line coverage greater than the
@@ -282,6 +297,12 @@
  Show code coverage only for functions with region coverage less than the given
  threshold.
 
+.. option:: -path-equivalence=<from>,<to>
+
+ Map the paths in the coverage data to local source file paths. This allows you
+ to generate the coverage data on one machine, and then use llvm-cov on a
+ different machine where you have the same files on a different path.
+
 .. program:: llvm-cov report
 
 .. _llvm-cov-report:
@@ -324,7 +345,11 @@
 
 .. option:: -show-functions
 
- Show coverage summaries for each function.
+ Show coverage summaries for each function. Defaults to false.
+
+.. option:: -show-instantiation-summary
+
+ Show statistics for all function instantiations. Defaults to false.
 
 .. program:: llvm-cov export
 
diff --git a/docs/CommandGuide/llvm-dwarfdump.rst b/docs/CommandGuide/llvm-dwarfdump.rst
index 30c18ad..a3b6266 100644
--- a/docs/CommandGuide/llvm-dwarfdump.rst
+++ b/docs/CommandGuide/llvm-dwarfdump.rst
@@ -1,30 +1,142 @@
-llvm-dwarfdump - print contents of DWARF sections
-=================================================
+llvm-dwarfdump - dump and verify DWARF debug information
+========================================================
 
 SYNOPSIS
 --------
 
-:program:`llvm-dwarfdump` [*options*] [*filenames...*]
+:program:`llvm-dwarfdump` [*options*] [*filename ...*]
 
 DESCRIPTION
 -----------
 
-:program:`llvm-dwarfdump` parses DWARF sections in the object files
-and prints their contents in human-readable form.
+:program:`llvm-dwarfdump` parses DWARF sections in object files,
+archives, and `.dSYM` bundles and prints their contents in
+human-readable form. Only the .debug_info section is printed unless one of
+the section-specific options or :option:`--all` is specified.
 
 OPTIONS
 -------
 
-.. option:: -debug-dump=section
+.. option:: -a, --all
 
-  Specify the DWARF section to dump.
-  For example, use ``abbrev`` to dump the contents of ``.debug_abbrev`` section,
-  ``loc.dwo`` to dump the contents of ``.debug_loc.dwo`` etc.
-  See ``llvm-dwarfdump --help`` for the complete list of supported sections.
-  Use ``all`` to dump all DWARF sections. It is the default.
+            Disassemble all supported DWARF sections.
+
+.. option:: --arch=<arch>
+
+            Dump DWARF debug information for the specified CPU architecture.
+            Architectures may be specified by name or by number.  This
+            option can be specified multiple times, once for each desired
+            architecture.  All CPU architectures will be printed by
+            default.
+
+.. option:: -c, --show-children
+
+            Show a debug info entry's children when using
+            the :option:`--debug-info`, :option:`--find`,
+            and :option:`--name` options.
+
+.. option:: -f <name>, --find=<name>
+
+            Search for the exact text <name> in the accelerator tables
+            and print the matching debug information entries.
+            When there are no accelerator tables or the name of the DIE
+            you are looking for is not found in the accelerator tables,
+            try using the slower but more complete :option:`--name` option.
+
+.. option:: -F, --show-form
+
+            Show DWARF form types after the DWARF attribute types.
+
+.. option:: -h, --help
+
+            Show help and usage for this command.
+
+.. option:: -i, --ignore-case
+
+            Ignore case distinctions when searching entries by name
+            or by regular expression.
+
+.. option:: -n <pattern>, --name=<pattern>
+
+            Find and print all debug info entries whose name
+            (`DW_AT_name` attribute) matches the exact text in
+            <pattern>. Use the :option:`--regex` option to have
+            <pattern> become a regular expression for more flexible
+            pattern matching.
+
+.. option:: --lookup=<address>
+
+            Lookup <address> in the debug information and print out the file,
+            function, block, and line table details.
+
+.. option:: -o <path>, --out-file=<path>
+
+            Redirect output to a file specified by <path>.
+
+.. option:: -p, --show-parents
+
+            Show a debug info entry's parent objects when using the
+            :option:`--debug-info`, :option:`--find`, and
+            :option:`--name` options.
+
+.. option:: -r <n>, --recurse-depth=<n>
+
+            Only recurse to a maximum depth of <n> when dumping debug info
+            entries.
+
+.. option:: --statistics
+
+            Collect debug info quality metrics and print the results
+            as machine-readable single-line JSON output.
+
+.. option:: -x, --regex
+
+            Treat any <pattern> strings as regular expressions when searching
+            instead of just as an exact string match.
+
+.. option:: -u, --uuid
+
+            Show the UUID for each architecture.
+
+.. option:: --diff
+
+            Dump the output in a format that is more friendly for comparing
+            DWARF output from two different files.
+
+.. option:: -v, --verbose
+
+            Display verbose information when dumping. This can help to debug
+            DWARF issues.
+
+.. option:: --verify
+
+            Verify the structure of the DWARF information by verifying the
+            compile unit chains, DIE relationships graph, address
+            ranges, and more.
+
+.. option:: --version
+
+            Display the version of the tool.
+
+.. option:: --debug-abbrev, --debug-aranges, --debug-cu-index, --debug-frame [=<offset>], --debug-gnu-pubnames, --debug-gnu-pubtypes, --debug-info [=<offset>], --debug-line [=<offset>], --debug-loc [=<offset>], --debug-macro, --debug-pubnames, --debug-pubtypes, --debug-ranges, --debug-str, --debug-str-offsets, --debug-tu-index, --debug-types, --eh-frame, --gdb-index, --apple-names, --apple-types, --apple-namespaces, --apple-objc
+
+            Dump the specified DWARF section by name. Only the
+            `.debug_info` section is shown by default. Some entries
+            support adding an `=<offset>` as a way to provide an
+            optional offset of the exact entry to dump within the
+            respective section. When an offset is provided, only the
+            entry at that offset will be dumped, else the entire
+            section will be dumped. Children of items at a specific
+            offset can be dumped by also using the
+            :option:`--show-children` option where applicable.
 
 EXIT STATUS
 -----------
 
 :program:`llvm-dwarfdump` returns 0 if the input files were parsed and dumped
 successfully. Otherwise, it returns 1.
+
+SEE ALSO
+--------
+
+:manpage:`dsymutil(1)`
diff --git a/docs/CommandGuide/llvm-nm.rst b/docs/CommandGuide/llvm-nm.rst
index 319e6e6..da7edea 100644
--- a/docs/CommandGuide/llvm-nm.rst
+++ b/docs/CommandGuide/llvm-nm.rst
@@ -134,9 +134,6 @@
 BUGS
 ----
 
- * :program:`llvm-nm` cannot demangle C++ mangled names, like GNU :program:`nm`
-   can.
-
  * :program:`llvm-nm` does not support the full set of arguments that GNU
    :program:`nm` does.
 
diff --git a/docs/CommandGuide/llvm-pdbutil.rst b/docs/CommandGuide/llvm-pdbutil.rst
new file mode 100644
index 0000000..8836f3a
--- /dev/null
+++ b/docs/CommandGuide/llvm-pdbutil.rst
@@ -0,0 +1,585 @@
+llvm-pdbutil - PDB File forensics and diagnostics
+=================================================
+
+.. contents::
+   :local:
+
+Synopsis
+--------
+
+:program:`llvm-pdbutil` [*subcommand*] [*options*]
+
+Description
+-----------
+
+Display types, symbols, CodeView records, and other information from a
+PDB file, as well as manipulate and create PDB files.  :program:`llvm-pdbutil`
+is normally used by FileCheck-based tests to test LLVM's PDB reading and
+writing functionality, but can also be used for general PDB file investigation
+and forensics, or as a replacement for cvdump.
+
+Subcommands
+-----------
+
+:program:`llvm-pdbutil` is separated into several subcommands each tailored to
+a different purpose.  A brief summary of each command follows, with more detail
+in the sections that follow.
+
+  * :ref:`pretty_subcommand` - Dump symbol and type information in a format that
+    tries to look as much like the original source code as possible.
+  * :ref:`dump_subcommand` - Dump low level types and structures from the PDB
+    file, including CodeView records, hash tables, PDB streams, etc.
+  * :ref:`bytes_subcommand` - Dump data from the PDB file's streams, records,
+    types, symbols, etc as raw bytes.
+  * :ref:`yaml2pdb_subcommand` - Given a yaml description of a PDB file, produce
+    a valid PDB file that matches that description.
+  * :ref:`pdb2yaml_subcommand` - For a given PDB file, produce a YAML
+    description of some or all of the file in a way that the PDB can be
+    reconstructed.
+  * :ref:`merge_subcommand` - Given two PDBs, produce a third PDB that is the
+    result of merging the two input PDBs.
+
+.. _pretty_subcommand:
+
+pretty
+~~~~~~
+
+.. program:: llvm-pdbutil pretty
+
+.. important::
+   The **pretty** subcommand is built on the Windows DIA SDK, and as such is not

+   supported on non-Windows platforms.
+
+USAGE: :program:`llvm-pdbutil` pretty [*options*] <input PDB file>
+
+Summary
+^^^^^^^^^^^
+
+The *pretty* subcommand displays a very high level representation of your
+program's debug info.  Since it is built on the Windows DIA SDK which is the
+standard API that Windows tools and debuggers use to query debug information, it
+presents a more authoritative view of how a debugger is going to interpret your
+debug information than a mode which displays low-level CodeView records.
+
+Options
+^^^^^^^
+
+Filtering and Sorting Options
++++++++++++++++++++++++++++++
+
+.. note::
+   *exclude* filters take priority over *include* filters.  So if a filter

+   matches both an include and an exclude rule, then it is excluded.
+
+.. option:: -exclude-compilands=<string>
+
+ When dumping compilands, compiland source-file contributions, or per-compiland

+ symbols, this option instructs **llvm-pdbutil** to omit any compilands that

+ match the specified regular expression.
+
+.. option:: -exclude-symbols=<string>
+
+ When dumping global, public, or per-compiland symbols, this option instructs

+ **llvm-pdbutil** to omit any symbols that match the specified regular

+ expression.
+
+.. option:: -exclude-types=<string>
+
+ When dumping types, this option instructs **llvm-pdbutil** to omit any types

+ that match the specified regular expression.
+
+.. option:: -include-compilands=<string>
+
+ When dumping compilands, compiland source-file contributions, or per-compiland

+ symbols, limit the initial search to only those compilands that match the

+ specified regular expression.
+
+.. option:: -include-symbols=<string>
+
+ When dumping global, public, or per-compiland symbols, limit the initial

+ search to only those symbols that match the specified regular expression.
+
+.. option:: -include-types=<string>
+
+ When dumping types, limit the initial search to only those types that match

+ the specified regular expression.
+
+.. option:: -min-class-padding=<uint>
+
+ Only display types that have at least the specified amount of alignment

+ padding, accounting for padding in base classes and aggregate field members.
+
+.. option:: -min-class-padding-imm=<uint>
+
+ Only display types that have at least the specified amount of alignment

+ padding, ignoring padding in base classes and aggregate field members.
+
+.. option:: -min-type-size=<uint>
+
+ Only display types T where sizeof(T) is greater than or equal to the specified

+ amount.
+
+.. option:: -no-compiler-generated
+
+ Don't show compiler generated types and symbols
+
+.. option:: -no-enum-definitions
+
+ When dumping an enum, don't show the full enum (e.g. the individual enumerator

+ values).
+
+.. option:: -no-system-libs
+
+ Don't show symbols from system libraries
+
+Symbol Type Options
++++++++++++++++++++
+.. option:: -all
+
+ Implies all other options in this category.
+
+.. option:: -class-definitions=<format>
+
+ Displays class definitions in the specified format.
+
+ .. code-block:: perl
+
+    =all      - Display all class members including data, constants, typedefs, functions, etc (default)
+    =layout   - Only display members that contribute to class size.
+    =none     - Don't display class definitions (e.g. only display the name and base list)
+
+.. option:: -class-order
+
+ Displays classes in the specified order.
+
+ .. code-block:: perl
+
+    =none            - Undefined / no particular sort order (default)
+    =name            - Sort classes by name
+    =size            - Sort classes by size
+    =padding         - Sort classes by amount of padding
+    =padding-pct     - Sort classes by percentage of space consumed by padding
+    =padding-imm     - Sort classes by amount of immediate padding
+    =padding-pct-imm - Sort classes by percentage of space consumed by immediate padding
+
+.. option::  -class-recurse-depth=<uint>
+
+ When dumping class definitions, stop after recursing the specified number of times.  The
+ default is 0, which is no limit.
+
+.. option::  -classes
+
+ Display classes
+
+.. option::  -compilands
+
+ Display compilands (e.g. object files)
+
+.. option::  -enums
+
+ Display enums
+
+.. option::  -externals
+
+ Dump external (e.g. exported) symbols
+
+.. option::  -globals
+
+ Dump global symbols
+
+.. option::  -lines
+
+ Dump the mappings between source lines and code addresses.
+
+.. option::  -module-syms
+
+ Display symbols (variables, functions, etc) for each compiland
+
+.. option::  -sym-types=<types>
+
+ Type of symbols to dump when -globals, -externals, or -module-syms is
+ specified. (default all)
+
+ .. code-block:: perl
+
+    =thunks - Display thunk symbols
+    =data   - Display data symbols
+    =funcs  - Display function symbols
+    =all    - Display all symbols (default)
+
+.. option::  -symbol-order=<order>
+
+ For symbols dumped via the -module-syms, -globals, or -externals options, sort
+ the results in specified order.
+
+ .. code-block:: perl
+
+    =none - Undefined / no particular sort order
+    =name - Sort symbols by name
+    =size - Sort symbols by size
+
+.. option::  -typedefs
+
+ Display typedef types
+
+.. option::  -types
+
+ Display all types (implies -classes, -enums, -typedefs)
+
+Other Options
++++++++++++++
+
+.. option:: -color-output
+
+ Force color output on or off.  By default, color is used if outputting to a
+ terminal.
+
+.. option:: -load-address=<uint>
+
+ When displaying relative virtual addresses, assume the process is loaded at the

+ given address and display what would be the absolute address.
+
+.. _dump_subcommand:
+
+dump
+~~~~
+
+USAGE: :program:`llvm-pdbutil` dump [*options*] <input PDB file>
+
+.. program:: llvm-pdbutil dump
+
+Summary
+^^^^^^^^^^^
+
+The **dump** subcommand displays low level information about the structure of a
+PDB file.  It is used heavily by LLVM's testing infrastructure, but can also be
+used for PDB forensics.  It serves a role similar to that of Microsoft's
+`cvdump` tool.
+
+.. note::
+   The **dump** subcommand exposes internal details of the file format.  As
+   such, the reader should be familiar with :doc:`/PDB/index` before using this
+   command.
+
+Options
+^^^^^^^
+
+MSF Container Options
++++++++++++++++++++++
+
+.. option:: -streams
+
+ Dump a summary of all of the streams in the PDB file.
+
+.. option:: -stream-blocks
+
+ In conjunction with :option:`-streams`, add information to the output about
+ what blocks the specified stream occupies.
+
+.. option:: -summary
+
+ Dump MSF and PDB header information.
+
+Module & File Options
++++++++++++++++++++++
+
+.. option:: -modi=<uint>
+
+ For all options that dump information from each module/compiland, limit to
+ the specified module.
+
+.. option:: -files
+
+ Dump the source files that contribute to each displayed module.
+
+.. option:: -il
+
+ Dump inlinee line information (DEBUG_S_INLINEELINES CodeView subsection)
+
+.. option:: -l
+
+ Dump line information (DEBUG_S_LINES CodeView subsection)
+
+.. option:: -modules
+
+ Dump compiland information
+
+.. option:: -xme
+
+ Dump cross module exports (DEBUG_S_CROSSSCOPEEXPORTS CodeView subsection)
+
+.. option:: -xmi
+
+ Dump cross module imports (DEBUG_S_CROSSSCOPEIMPORTS CodeView subsection)
+
+Symbol Options
+++++++++++++++
+
+.. option:: -globals
+
+ Dump global symbol records
+
+.. option:: -global-extras
+
+ Dump additional information about the globals, such as hash buckets and hash
+ values.
+
+.. option:: -publics
+
+ Dump public symbol records
+
+.. option:: -public-extras
+
+ Dump additional information about the publics, such as hash buckets and hash
+ values.
+
+.. option:: -symbols
+
+ Dump symbols (functions, variables, etc) for each module dumped.
+
+.. option:: -sym-data
+
+ For each symbol record dumped as a result of the :option:`-symbols` option,
+ display the full bytes of the record in binary as well.
+
+Type Record Options
++++++++++++++++++++
+
+.. option:: -types
+
+ Dump CodeView type records from TPI stream
+
+.. option:: -type-extras
+
+ Dump additional information from the TPI stream, such as hashes and the type
+ index offsets array.
+
+.. option:: -type-data
+
+ For each type record dumped, display the full bytes of the record in binary as
+ well.
+
+.. option:: -type-index=<uint>
+
+ Only dump types with the specified type index.
+
+.. option:: -ids
+
+ Dump CodeView type records from IPI stream.
+
+.. option:: -id-extras
+
+ Dump additional information from the IPI stream, such as hashes and the type
+ index offsets array.
+
+.. option:: -id-data
+
+ For each ID record dumped, display the full bytes of the record in binary as
+ well.
+
+.. option:: -id-index=<uint>
+
+ Only dump ID records with the specified hexadecimal type index.
+
+.. option:: -dependents
+
+ When used in conjunction with :option:`-type-index` or :option:`-id-index`,
+ dumps the entire dependency graph for the specified index instead of just the
+ single record with the specified index.  For example, if type index 0x4000 is
+ a function whose return type has index 0x3000, and you specify
+ `-dependents=0x4000`, then this would dump both records (as well as any other
+ dependents in the tree).
+
+Miscellaneous Options
++++++++++++++++++++++
+
+.. option:: -all
+
+ Implies most other options.
+
+.. option:: -section-contribs
+
+ Dump section contributions.
+
+.. option:: -section-headers
+
+ Dump image section headers.
+
+.. option:: -section-map
+
+ Dump section map.
+
+.. option:: -string-table
+
+ Dump PDB string table.
+
+.. _bytes_subcommand:
+
+bytes
+~~~~~
+
+USAGE: :program:`llvm-pdbutil` bytes [*options*] <input PDB file>
+
+.. program:: llvm-pdbutil bytes
+
+Summary
+^^^^^^^
+
+Like the **dump** subcommand, the **bytes** subcommand displays low level
+information about the structure of a PDB file, but it is used for even deeper
+forensics.  The **bytes** subcommand finds various structures in a PDB file
+based on the command line options specified, and dumps them in hex.  Someone
+working on support for emitting PDBs would use this heavily, for example, to
+compare one PDB against another PDB to ensure byte-for-byte compatibility.  It
+is not enough to simply compare the bytes of an entire file, or an entire stream
+because it's perfectly fine for the same structure to exist at different
+locations in two different PDBs, and "finding" the structure is half the battle.
+
+Options
+^^^^^^^
+
+MSF File Options
+++++++++++++++++
+
+.. option:: -block-range=<start[-end]>
+
+ Dump binary data from specified range of MSF file blocks.
+
+.. option:: -byte-range=<start[-end]>
+
+ Dump binary data from specified range of bytes in the file.
+
+.. option:: -fpm
+
+ Dump the MSF free page map.
+
+.. option:: -stream-data=<string>
+
+ Dump binary data from the specified streams.  Format is SN[:Start][@Size].
+ For example, `-stream-data=7:3@12` dumps 12 bytes from stream 7, starting
+ at offset 3 in the stream.
+
+PDB Stream Options
+++++++++++++++++++
+
+.. option:: -name-map
+
+ Dump bytes of PDB Name Map
+
+DBI Stream Options
+++++++++++++++++++
+
+.. option:: -ec
+
+ Dump the edit and continue map substream of the DBI stream.
+
+.. option:: -files
+
+ Dump the file info substream of the DBI stream.
+
+.. option:: -modi
+
+ Dump the modi substream of the DBI stream.
+
+.. option:: -sc
+
+ Dump section contributions substream of the DBI stream.
+
+.. option:: -sm
+
+ Dump the section map from the DBI stream.
+
+.. option:: -type-server
+
+ Dump the type server map from the DBI stream.
+
+Module Options
+++++++++++++++
+
+.. option:: -mod=<uint>
+
+ Limit all options in this category to the specified module index.  By default,
+ options in this category will dump bytes from all modules.
+
+.. option:: -chunks
+
+ Dump the bytes of each module's C13 debug subsection.
+
+.. option:: -split-chunks
+
+ When specified with :option:`-chunks`, split the C13 debug subsection into a
+ separate chunk for each subsection type, and dump them separately.
+
+.. option:: -syms
+
+ Dump the symbol record substream from each module.
+
+Type Record Options
++++++++++++++++++++
+
+.. option:: -id=<uint>
+
+ Dump the record from the IPI stream with the given type index.
+
+.. option:: -type=<uint>
+
+ Dump the record from the TPI stream with the given type index.
+
+.. _pdb2yaml_subcommand:
+
+pdb2yaml
+~~~~~~~~
+
+USAGE: :program:`llvm-pdbutil` pdb2yaml [*options*] <input PDB file>
+
+.. program:: llvm-pdbutil pdb2yaml
+
+Summary
+^^^^^^^
+
+Options
+^^^^^^^
+
+.. _yaml2pdb_subcommand:
+
+yaml2pdb
+~~~~~~~~
+
+USAGE: :program:`llvm-pdbutil` yaml2pdb [*options*] <input YAML file>
+
+.. program:: llvm-pdbutil yaml2pdb
+
+Summary
+^^^^^^^
+
+Generate a PDB file from a YAML description.  The YAML syntax is not described
+here.  Instead, use :ref:`llvm-pdbutil pdb2yaml <pdb2yaml_subcommand>` and
+examine the output for an example starting point.
+
+Options
+^^^^^^^
+
+.. option:: -pdb=<file-name>
+
+ Write the resulting PDB to the specified file.
+
+.. _merge_subcommand:
+
+merge
+~~~~~
+
+USAGE: :program:`llvm-pdbutil` merge [*options*] <input PDB file 1> <input PDB file 2>
+
+.. program:: llvm-pdbutil merge
+
+Summary
+^^^^^^^
+
+Merge two PDB files into a single file.
+
+Options
+^^^^^^^
+
+.. option:: -pdb=<file-name>
+
+ Write the resulting PDB to the specified file.
diff --git a/docs/CommandGuide/llvm-profdata.rst b/docs/CommandGuide/llvm-profdata.rst
index bae0ff7..5b6330b 100644
--- a/docs/CommandGuide/llvm-profdata.rst
+++ b/docs/CommandGuide/llvm-profdata.rst
@@ -192,10 +192,20 @@
  information is dumped in a more human readable form (also in text) with
  annotations.
 
+.. option:: -topn=n
+
+ Instruct the profile dumper to show the top ``n`` functions with the
+ hottest basic blocks in the summary section. By default, the topn functions
+ are not dumped.
+
 .. option:: -sample
 
  Specify that the input profile is a sample-based profile.
 
+.. option:: -memop-sizes
+
+ Show the profiled sizes of the memory intrinsic calls for shown functions.
+
 EXIT STATUS
 -----------
 
diff --git a/docs/CommandLine.rst b/docs/CommandLine.rst
index a660949..5d2a39d 100644
--- a/docs/CommandLine.rst
+++ b/docs/CommandLine.rst
@@ -1251,9 +1251,7 @@
 customary to use the so-called 'response files' to circumvent this
 restriction. These files are mentioned on the command-line (using the "@file")
 syntax. The program reads these files and inserts the contents into argv,
-thereby working around the command-line length limits. Response files are
-enabled by an optional fourth argument to `cl::ParseEnvironmentOptions`_ and
-`cl::ParseCommandLineOptions`_.
+thereby working around the command-line length limits.
 
 Top-Level Classes and Functions
 -------------------------------
@@ -1324,8 +1322,7 @@
 
 The ``cl::ParseCommandLineOptions`` function requires two parameters (``argc``
 and ``argv``), but may also take an optional third parameter which holds
-`additional extra text`_ to emit when the ``-help`` option is invoked, and a
-fourth boolean parameter that enables `response files`_.
+`additional extra text`_ to emit when the ``-help`` option is invoked.
 
 .. _cl::ParseEnvironmentOptions:
 
@@ -1340,9 +1337,8 @@
 
 It takes four parameters: the name of the program (since ``argv`` may not be
 available, it can't just look in ``argv[0]``), the name of the environment
-variable to examine, the optional `additional extra text`_ to emit when the
-``-help`` option is invoked, and the boolean switch that controls whether
-`response files`_ should be read.
+variable to examine, and the optional `additional extra text`_ to emit when the
+``-help`` option is invoked.
 
 ``cl::ParseEnvironmentOptions`` will break the environment variable's value up
 into words and then process them using `cl::ParseCommandLineOptions`_.
diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst
index 8ce9990..60f1024 100644
--- a/docs/CompilerWriterInfo.rst
+++ b/docs/CompilerWriterInfo.rst
@@ -72,16 +72,7 @@
 AMDGPU
 ------
 
-* `AMD R6xx shader ISA <http://developer.amd.com/wordpress/media/2012/10/R600_Instruction_Set_Architecture.pdf>`_
-* `AMD R7xx shader ISA <http://developer.amd.com/wordpress/media/2012/10/R700-Family_Instruction_Set_Architecture.pdf>`_
-* `AMD Evergreen shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_Evergreen-Family_Instruction_Set_Architecture.pdf>`_
-* `AMD Cayman/Trinity shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_HD_6900_Series_Instruction_Set_Architecture.pdf>`_
-* `AMD Southern Islands Series ISA <http://developer.amd.com/wordpress/media/2012/12/AMD_Southern_Islands_Instruction_Set_Architecture.pdf>`_
-* `AMD Sea Islands Series ISA <http://developer.amd.com/wordpress/media/2013/07/AMD_Sea_Islands_Instruction_Set_Architecture.pdf>`_
-* `AMD GCN3 Instruction Set Architecture <http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf>`__
-* `AMD GPU Programming Guide <http://developer.amd.com/download/AMD_Accelerated_Parallel_Processing_OpenCL_Programming_Guide.pdf>`_
-* `AMD Compute Resources <http://developer.amd.com/tools/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/documentation/>`_
-* `AMDGPU Compute Application Binary Interface <https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md>`__
+Refer to :doc:`AMDGPUUsage` for additional documentation.
 
 RISC-V
 ------
@@ -128,7 +119,7 @@
 ===
 
 * `System V Application Binary Interface <http://www.sco.com/developers/gabi/latest/contents.html>`_
-* `Itanium C++ ABI <http://mentorembedded.github.io/cxx-abi/>`_
+* `Itanium C++ ABI <http://itanium-cxx-abi.github.io/cxx-abi/>`_
 
 Linux
 -----
diff --git a/docs/Coroutines.rst b/docs/Coroutines.rst
index d330d71..1bea04e 100644
--- a/docs/Coroutines.rst
+++ b/docs/Coroutines.rst
@@ -89,7 +89,7 @@
 
 The LLVM IR for this coroutine looks like this:
 
-.. code-block:: none
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -156,7 +156,7 @@
 when its identity cannot be determined statically at compile time. For our 
 example, the coroutine frame will be:
 
-.. code-block:: text
+.. code-block:: llvm
 
   %f.frame = type { void (%f.frame*)*, void (%f.frame*)*, i32 }
 
@@ -164,7 +164,7 @@
 code responsible for creation and initialization of the coroutine frame and 
 execution of the coroutine until a suspend point is reached:
 
-.. code-block:: none
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -224,7 +224,7 @@
 when dynamic allocation is required, and `false` if dynamic allocation is 
 elided.
 
-.. code-block:: none
+.. code-block:: llvm
 
   entry:
     %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
@@ -242,7 +242,7 @@
 `coro.free`_ intrinsic. If allocation is elided, `coro.free`_ returns `null`
 thus skipping the deallocation code:
 
-.. code-block:: text
+.. code-block:: llvm
 
   cleanup:
     %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
@@ -286,7 +286,7 @@
 Matching LLVM code would look like (with the rest of the code remaining the same
 as the code in the previous section):
 
-.. code-block:: text
+.. code-block:: llvm
 
   loop:
     %n.addr = phi i32 [ %n, %entry ], [ %inc, %loop.resume ]
@@ -383,17 +383,17 @@
 should be stored in the coroutine frame, so that it can be resumed at the 
 correct resume point):
 
-.. code-block:: text
+.. code-block:: llvm
 
   if.true:
     %save1 = call token @llvm.coro.save(i8* %hdl)
-    call void async_op1(i8* %hdl)
+    call void @async_op1(i8* %hdl)
     %suspend1 = call i1 @llvm.coro.suspend(token %save1, i1 false)
     switch i8 %suspend1, label %suspend [i8 0, label %resume1
                                          i8 1, label %cleanup]
   if.false:
     %save2 = call token @llvm.coro.save(i8* %hdl)
-    call void async_op2(i8* %hdl)
+    call void @async_op2(i8* %hdl)
     %suspend2 = call i1 @llvm.coro.suspend(token %save2, i1 false)
     switch i8 %suspend1, label %suspend [i8 0, label %resume2
                                          i8 1, label %cleanup]
@@ -411,7 +411,7 @@
 The following coroutine designates a 32 bit integer `promise` and uses it to
 store the current value produced by a coroutine.
 
-.. code-block:: text
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -692,7 +692,7 @@
 Example:
 """"""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -812,7 +812,7 @@
 Example (custom deallocation function):
 """""""""""""""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   cleanup:
     %mem = call i8* @llvm.coro.free(token %id, i8* %frame)
@@ -827,7 +827,7 @@
 Example (standard deallocation functions):
 """"""""""""""""""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   cleanup:
     %mem = call i8* @llvm.coro.free(token %id, i8* %frame)
@@ -846,7 +846,7 @@
 """""""""
 
 The '``llvm.coro.alloc``' intrinsic returns `true` if dynamic allocation is
-required to obtain a memory for the corutine frame and `false` otherwise.
+required to obtain a memory for the coroutine frame and `false` otherwise.
 
 Arguments:
 """"""""""
@@ -864,7 +864,7 @@
 Example:
 """"""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   entry:
     %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
@@ -1017,7 +1017,7 @@
 For Windows Exception handling model, a frontend should attach a funclet bundle
 referring to an enclosing cleanuppad as follows:
 
-.. code-block:: text
+.. code-block:: llvm
 
     ehcleanup: 
       %tok = cleanuppad within none []
@@ -1074,7 +1074,7 @@
 Example (normal suspend point):
 """""""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
     %0 = call i8 @llvm.coro.suspend(token none, i1 false)
     switch i8 %0, label %suspend [i8 0, label %resume
@@ -1083,7 +1083,7 @@
 Example (final suspend point):
 """"""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   while.end:
     %s.final = call i8 @llvm.coro.suspend(token none, i1 true)
@@ -1144,10 +1144,10 @@
 a different thread possibly prior to `async_op` call returning control back
 to the coroutine:
 
-.. code-block:: text
+.. code-block:: llvm
 
     %save1 = call token @llvm.coro.save(i8* %hdl)
-    call void async_op1(i8* %hdl)
+    call void @async_op1(i8* %hdl)
     %suspend1 = call i1 @llvm.coro.suspend(token %save1, i1 false)
     switch i8 %suspend1, label %suspend [i8 0, label %resume1
                                          i8 1, label %cleanup]
diff --git a/docs/CoverageMappingFormat.rst b/docs/CoverageMappingFormat.rst
index 46cc9d1..30b11fe 100644
--- a/docs/CoverageMappingFormat.rst
+++ b/docs/CoverageMappingFormat.rst
@@ -258,7 +258,7 @@
       i32 2,  ; The number of function records
       i32 20, ; The length of the string that contains the encoded translation unit filenames
       i32 20, ; The length of the string that contains the encoded coverage mapping data
-      i32 1,  ; Coverage mapping format version
+      i32 2,  ; Coverage mapping format version
     },
     [2 x { i64, i32, i64 }] [ ; Function records
      { i64, i32, i64 } {
@@ -274,6 +274,8 @@
    [40 x i8] c"..." ; Encoded data (dissected later)
   }, section "__llvm_covmap", align 8
 
+The current version of the format is version 3. The only difference from version 2 is that a special encoding for column end locations was introduced to indicate gap regions.
+
 The function record layout has evolved since version 1. In version 1, the function record for *foo* is defined as follows:
 
 .. code-block:: llvm
@@ -296,7 +298,7 @@
 
 * The length of the string in the third field of *__llvm_coverage_mapping* that contains the encoded coverage mapping data.
 
-* The format version. The current version is 2 (encoded as a 1).
+* The format version. The current version is 3 (encoded as a 2).
 
 .. _function records:
 
@@ -602,4 +604,6 @@
 * *numLines*: The difference between the ending line and the starting line
   of the current mapping region.
 
-* *columnEnd*: The ending column of the mapping region.
+* *columnEnd*: The ending column of the mapping region. If the high bit is set,
+  the current mapping region is a gap area. A count for a gap area is only used
+  as the line execution count if there are no other regions on a line.
diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index 97e0572..9bd50f1 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst
@@ -188,7 +188,7 @@
 responsibility of a code owner is to ensure that a commit to their area of the
 code is appropriately reviewed, either by themself or by someone else.  The list
 of current code owners can be found in the file
-`CODE_OWNERS.TXT <http://llvm.org/klaus/llvm/blob/master/CODE_OWNERS.TXT>`_
+`CODE_OWNERS.TXT <http://git.llvm.org/klaus/llvm/blob/master/CODE_OWNERS.TXT>`_
 in the root of the LLVM source tree.
 
 Note that code ownership is completely different than reviewers: anyone can
diff --git a/docs/Docker.rst b/docs/Docker.rst
new file mode 100644
index 0000000..e606e1b
--- /dev/null
+++ b/docs/Docker.rst
@@ -0,0 +1,199 @@
+=========================================
+A guide to Dockerfiles for building LLVM
+=========================================
+
+Introduction
+============
+You can find a number of sources to build docker images with LLVM components in
+``llvm/utils/docker``. They can be used by anyone who wants to build the docker
+images for their own use, or as a starting point for someone who wants to write
+their own Dockerfiles.
+
+We currently provide Dockerfiles with ``debian8`` and ``nvidia-cuda`` base images.
+We also provide an ``example`` image, which contains placeholders that one would need
+to fill out in order to produce Dockerfiles for a new docker image.
+
+Why?
+----
+Docker images provide a way to produce binary distributions of
+software inside a controlled environment. Having Dockerfiles to build docker images
+inside the LLVM repo makes them much more discoverable than putting them into any other
+place.
+
+Docker basics
+-------------
+If you've never heard about Docker before, you might find this section helpful
+to get a very basic explanation of it.
+`Docker <https://www.docker.com/>`_ is a popular solution for running programs in
+an isolated and reproducible environment, especially to maintain releases for
+software deployed to large distributed fleets.
+It uses Linux kernel namespaces and cgroups to provide lightweight isolation
+inside the currently running Linux kernel.
+A single active instance of a dockerized environment is called a *docker
+container*.
+A snapshot of a docker container filesystem is called a *docker image*.
+One can start a container from a prebuilt docker image.
+
+Docker images are built from a so-called *Dockerfile*, a source file written in
+a specialized language that defines instructions to be used when building
+the docker image (see `official
+documentation <https://docs.docker.com/engine/reference/builder/>`_ for more
+details). A minimal Dockerfile typically contains a base image and a number
+of RUN commands that have to be executed to build the image. When building a new
+image, docker will first download your base image, mount its filesystem as
+read-only and then add a writable overlay on top of it to keep track of all
+filesystem modifications, performed while building your image. When the build
+process is finished, a diff between your image's final filesystem state and the
+base image's filesystem is stored in the resulting image.
+
+Overview
+========
+The ``llvm/utils/docker`` folder contains Dockerfiles and simple bash scripts to
+serve as a basis for anyone who wants to create their own Docker image with
+LLVM components, compiled from sources. The sources are checked out from the
+upstream svn repository when building the image.
+
+Inside each subfolder we host Dockerfiles for two images:
+
+- ``build/`` image is used to compile LLVM, it installs a system compiler and all
+  build dependencies of LLVM. After the build process is finished, the build
+  image will have an archive with compiled components at ``/tmp/clang.tar.gz``.
+- ``release/`` image usually only contains LLVM components, compiled by the
+  ``build/`` image, and also libstdc++ and binutils to make image minimally
+  useful for C++ development. The assumption is that you usually want clang to
+  be one of the provided components.
+
+To build both of those images, use the ``build_docker_image.sh`` script.
+It will check out LLVM sources and build clang in the ``build`` container, copy results
+of the build to the local filesystem and then build the ``release`` container using
+those. The ``build_docker_image.sh`` script accepts a list of LLVM repositories to
+check out, and arguments for CMake invocation.
+
+If you want to write your own docker image, start with an ``example/`` subfolder.
+It provides incomplete Dockerfiles with (very few) FIXMEs explaining the steps
+you need to take in order to make your Dockerfiles functional.
+
+Usage
+=====
+The ``llvm/utils/docker/build_docker_image.sh`` script provides a rather high degree of
+control on how to run the build. It allows you to specify the projects to
+check out from svn and provide a list of CMake arguments to use when
+building LLVM inside the docker container.
+
+Here's a very simple example of getting a docker image with clang binary,
+compiled by the system compiler in the debian8 image:
+
+.. code-block:: bash
+
+    ./llvm/utils/docker/build_docker_image.sh \
+	--source debian8 \
+	--docker-repository clang-debian8 --docker-tag "staging" \
+	-p clang -i install-clang -i install-clang-headers \
+	-- \
+	-DCMAKE_BUILD_TYPE=Release
+
+Note that a build like that doesn't use a 2-stage build process that
+you probably want for clang. Running a 2-stage build is a little more intricate;
+this command will do that:
+
+.. code-block:: bash
+
+    # Run a 2-stage build.
+    #   LLVM_TARGETS_TO_BUILD=Native is to reduce stage1 compile time.
+    #   Options, starting with BOOTSTRAP_* are passed to stage2 cmake invocation.
+    ./build_docker_image.sh \
+	--source debian8 \
+	--docker-repository clang-debian8 --docker-tag "staging" \
+	-p clang -i stage2-install-clang -i stage2-install-clang-headers \
+	-- \
+	-DLLVM_TARGETS_TO_BUILD=Native -DCMAKE_BUILD_TYPE=Release \
+	-DBOOTSTRAP_CMAKE_BUILD_TYPE=Release \
+	-DCLANG_ENABLE_BOOTSTRAP=ON -DCLANG_BOOTSTRAP_TARGETS="install-clang;install-clang-headers"
+
+This will produce two images, a release image ``clang-debian8:staging`` and a
+build image ``clang-debian8-build:staging`` from the latest upstream revision.
+After the image is built you can run bash inside a container based on your
+image like this:
+
+.. code-block:: bash
+
+    docker run -ti clang-debian8:staging bash
+
+Now you can run bash commands as you normally would:
+
+.. code-block:: bash
+
+    root@80f351b51825:/# clang -v
+    clang version 5.0.0 (trunk 305064)
+    Target: x86_64-unknown-linux-gnu
+    Thread model: posix
+    InstalledDir: /bin
+    Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
+    Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.4
+    Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9
+    Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9.2
+    Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9
+    Candidate multilib: .;@m64
+    Selected multilib: .;@m64
+
+
+Which image should I choose?
+============================
+We currently provide two images: debian8-based and nvidia-cuda-based. They
+differ in the base image that they use, i.e. they have a different set of
+preinstalled binaries. Debian8 is very minimal, nvidia-cuda is larger, but has
+preinstalled CUDA libraries and allows access to a GPU installed on your
+machine.
+
+If you need a minimal linux distribution with only clang and libstdc++ included,
+you should try the debian8-based image.
+
+If you want to use CUDA libraries and have access to a GPU on your machine,
+you should choose the nvidia-cuda-based image and use `nvidia-docker
+<https://github.com/NVIDIA/nvidia-docker>`_ to run your docker containers. Note
+that you don't need nvidia-docker to build the images, but you need it in order
+to have access to a GPU from a docker container that is running the built
+image.
+
+If you have a different use-case, you could create your own image based on
+``example/`` folder.
+
+Any docker image can be built and run using only the docker binary, i.e. you can
+run debian8 build on Fedora or any other Linux distribution. You don't need to
+install CMake, compilers or any other clang dependencies. It is all handled
+during the build process inside Docker's isolated environment.
+
+Stable build
+============
+If you want a somewhat recent and somewhat stable build, use the
+``branches/google/stable`` branch, i.e. the following command will produce a
+debian8-based image using the latest ``google/stable`` sources for you:
+
+.. code-block:: bash
+
+    ./llvm/utils/docker/build_docker_image.sh \
+	-s debian8 -d clang-debian8 -t "staging" \
+	--branch branches/google/stable \
+	-p clang -i install-clang -i install-clang-headers \
+	-- \
+	-DCMAKE_BUILD_TYPE=Release
+
+
+Minimizing docker image size
+============================
+Due to Docker restrictions we use two images (i.e., build and release folders)
+for the release image to be as small as possible. It's much easier to achieve
+that using two images, because Docker would store a filesystem layer for each
+command in the Dockerfile, i.e. if you install some packages in one command,
+then remove those in a separate command, the size of the resulting image will
+still be proportional to the size of an image with installed packages.
+Therefore, we strive to provide a very simple release image which only copies
+compiled clang and does not do anything else.
+
+Docker 1.13 added a ``--squash`` flag that allows flattening the layers of the
+image, i.e. removing the parts that were actually deleted. That is an easier way
+to produce the smallest images possible by using just a single image. We do not
+use it because as of today the flag is in experimental stage and not everyone
+may have the latest docker version available. When the flag is out of
+experimental stage, we should investigate replacing the two-image approach with
+just a single image, built using the ``--squash`` flag.
diff --git a/docs/ExceptionHandling.rst b/docs/ExceptionHandling.rst
index a44fb92..ff8b645 100644
--- a/docs/ExceptionHandling.rst
+++ b/docs/ExceptionHandling.rst
@@ -32,13 +32,13 @@
 
 A more complete description of the Itanium ABI exception handling runtime
 support of can be found at `Itanium C++ ABI: Exception Handling
-<http://mentorembedded.github.com/cxx-abi/abi-eh.html>`_. A description of the
+<http://itanium-cxx-abi.github.io/cxx-abi/abi-eh.html>`_. A description of the
 exception frame format can be found at `Exception Frames
 <http://refspecs.linuxfoundation.org/LSB_3.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html>`_,
 with details of the DWARF 4 specification at `DWARF 4 Standard
 <http://dwarfstd.org/Dwarf4Std.php>`_.  A description for the C++ exception
 table formats can be found at `Exception Handling Tables
-<http://mentorembedded.github.com/cxx-abi/exceptions.pdf>`_.
+<http://itanium-cxx-abi.github.io/cxx-abi/exceptions.pdf>`_.
 
 Setjmp/Longjmp Exception Handling
 ---------------------------------
diff --git a/docs/Extensions.rst b/docs/Extensions.rst
index 782539d..14fea30 100644
--- a/docs/Extensions.rst
+++ b/docs/Extensions.rst
@@ -204,7 +204,7 @@
 The unique number is not present in the resulting object at all. It is just used
 in the assembler to differentiate the sections.
 
-The 'm' flag is mapped to SHF_LINK_ORDER. If it is present, a symbol
+The 'o' flag is mapped to SHF_LINK_ORDER. If it is present, a symbol
 must be given that identifies the section to be placed is the
 .sh_link.
 
@@ -212,14 +212,14 @@
 
         .section .foo,"a",@progbits
         .Ltmp:
-        .section .bar,"am",@progbits,.Ltmp
+        .section .bar,"ao",@progbits,.Ltmp
 
 which is equivalent to just
 
 .. code-block:: gas
 
         .section .foo,"a",@progbits
-        .section .bar,"am",@progbits,.foo
+        .section .bar,"ao",@progbits,.foo
 
 
 Target Specific Behaviour
diff --git a/docs/FuzzingLLVM.rst b/docs/FuzzingLLVM.rst
new file mode 100644
index 0000000..e6ebeaf
--- /dev/null
+++ b/docs/FuzzingLLVM.rst
@@ -0,0 +1,252 @@
+================================
+Fuzzing LLVM libraries and tools
+================================
+
+.. contents::
+   :local:
+   :depth: 2
+
+Introduction
+============
+
+The LLVM tree includes a number of fuzzers for various components. These are
+built on top of :doc:`LibFuzzer <LibFuzzer>`.
+
+
+Available Fuzzers
+=================
+
+clang-fuzzer
+------------
+
+A |generic fuzzer| that tries to compile textual input as C++ code. Some of the
+bugs this fuzzer has reported are `on bugzilla`__ and `on OSS Fuzz's
+tracker`__.
+
+__ https://llvm.org/pr23057
+__ https://bugs.chromium.org/p/oss-fuzz/issues/list?q=proj-llvm+clang-fuzzer
+
+clang-proto-fuzzer
+------------------
+
+A |protobuf fuzzer| that compiles valid C++ programs generated from a protobuf
+class that describes a subset of the C++ language.
+
+This fuzzer accepts clang command line options after
+``-ignore_remaining_args=1``. For example, the following command will fuzz
+clang with a higher optimization level:
+
+.. code-block:: shell
+
+   % bin/clang-proto-fuzzer <corpus-dir> -ignore_remaining_args=1 -O3
+
+clang-format-fuzzer
+-------------------
+
+A |generic fuzzer| that runs clang-format_ on C++ text fragments. Some of the
+bugs this fuzzer has reported are `on bugzilla`__
+and `on OSS Fuzz's tracker`__.
+
+.. _clang-format: https://clang.llvm.org/docs/ClangFormat.html
+__ https://llvm.org/pr23052
+__ https://bugs.chromium.org/p/oss-fuzz/issues/list?q=proj-llvm+clang-format-fuzzer
+
+llvm-as-fuzzer
+--------------
+
+A |generic fuzzer| that tries to parse text as :doc:`LLVM assembly <LangRef>`.
+Some of the bugs this fuzzer has reported are `on bugzilla`__.
+
+__ https://llvm.org/pr24639
+
+llvm-dwarfdump-fuzzer
+---------------------
+
+A |generic fuzzer| that interprets inputs as object files and runs
+:doc:`llvm-dwarfdump <CommandGuide/llvm-dwarfdump>` on them. Some of the bugs
+this fuzzer has reported are `on OSS Fuzz's tracker`__.
+
+__ https://bugs.chromium.org/p/oss-fuzz/issues/list?q=proj-llvm+llvm-dwarfdump-fuzzer
+
+llvm-demangle-fuzzer
+---------------------
+
+A |generic fuzzer| for the Itanium demangler used in various LLVM tools. We've
+fuzzed ``__cxa_demangle`` to death, so why not fuzz LLVM's implementation of
+the same function?
+
+llvm-isel-fuzzer
+----------------
+
+A |LLVM IR fuzzer| aimed at finding bugs in instruction selection.
+
+This fuzzer accepts flags after ``-ignore_remaining_args=1``. The flags match
+those of :doc:`llc <CommandGuide/llc>` and the triple is required. For example,
+the following command would fuzz AArch64 with :doc:`GlobalISel`:
+
+.. code-block:: shell
+
+   % bin/llvm-isel-fuzzer <corpus-dir> -ignore_remaining_args=1 -mtriple aarch64 -global-isel -O0
+
+Some flags can also be specified in the binary name itself in order to support
+OSS Fuzz, which has trouble with required arguments. To do this, you can copy
+or move ``llvm-isel-fuzzer`` to ``llvm-isel-fuzzer--x-y-z``, separating options
+from the binary name using "--". The valid options are architecture names
+(``aarch64``, ``x86_64``), optimization levels (``O0``, ``O2``), or specific
+keywords, like ``gisel`` for enabling global instruction selection. In this
+mode, the same example could be run like so:
+
+.. code-block:: shell
+
+   % bin/llvm-isel-fuzzer--aarch64-O0-gisel <corpus-dir>
+
+llvm-mc-assemble-fuzzer
+-----------------------
+
+A |generic fuzzer| that fuzzes the MC layer's assemblers by treating inputs as
+target specific assembly.
+
+Note that this fuzzer has an unusual command line interface which is not fully
+compatible with all of libFuzzer's features. Fuzzer arguments must be passed
+after ``--fuzzer-args``, and any ``llc`` flags must use two dashes. For
+example, to fuzz the AArch64 assembler you might use the following command:
+
+.. code-block:: console
+
+  llvm-mc-assemble-fuzzer --triple=aarch64-linux-gnu --fuzzer-args -max_len=4
+
+This scheme will likely change in the future.
+
+llvm-mc-disassemble-fuzzer
+--------------------------
+
+A |generic fuzzer| that fuzzes the MC layer's disassemblers by treating inputs
+as assembled binary data.
+
+Note that this fuzzer has an unusual command line interface which is not fully
+compatible with all of libFuzzer's features. See the notes above about
+``llvm-mc-assemble-fuzzer`` for details.
+
+
+.. |generic fuzzer| replace:: :ref:`generic fuzzer <fuzzing-llvm-generic>`
+.. |protobuf fuzzer|
+   replace:: :ref:`libprotobuf-mutator based fuzzer <fuzzing-llvm-protobuf>`
+.. |LLVM IR fuzzer|
+   replace:: :ref:`structured LLVM IR fuzzer <fuzzing-llvm-ir>`
+
+
+Mutators and Input Generators
+=============================
+
+The inputs for a fuzz target are generated via random mutations of a
+:ref:`corpus <libfuzzer-corpus>`. There are a few options for the kinds of
+mutations that a fuzzer in LLVM might want.
+
+.. _fuzzing-llvm-generic:
+
+Generic Random Fuzzing
+----------------------
+
+The most basic form of input mutation is to use the built in mutators of
+LibFuzzer. These simply treat the input corpus as a bag of bits and make random
+mutations. This type of fuzzer is good for stressing the surface layers of a
+program, and is good at testing things like lexers, parsers, or binary
+protocols.
+
+Some of the in-tree fuzzers that use this type of mutator are `clang-fuzzer`_,
+`clang-format-fuzzer`_, `llvm-as-fuzzer`_, `llvm-dwarfdump-fuzzer`_,
+`llvm-mc-assemble-fuzzer`_, and `llvm-mc-disassemble-fuzzer`_.
+
+.. _fuzzing-llvm-protobuf:
+
+Structured Fuzzing using ``libprotobuf-mutator``
+------------------------------------------------
+
+We can use libprotobuf-mutator_ in order to perform structured fuzzing and
+stress deeper layers of programs. This works by defining a protobuf class that
+translates arbitrary data into structurally interesting input. Specifically, we
+use this to work with a subset of the C++ language and perform mutations that
+produce valid C++ programs in order to exercise parts of clang that are more
+interesting than parser error handling.
+
+To build this kind of fuzzer you need `protobuf`_ and its dependencies
+installed, and you need to specify some extra flags when configuring the build
+with :doc:`CMake <CMake>`. For example, `clang-proto-fuzzer`_ can be enabled by
+adding ``-DCLANG_ENABLE_PROTO_FUZZER=ON`` to the flags described in
+:ref:`building-fuzzers`.
+
+The only in-tree fuzzer that uses ``libprotobuf-mutator`` today is
+`clang-proto-fuzzer`_.
+
+.. _libprotobuf-mutator: https://github.com/google/libprotobuf-mutator
+.. _protobuf: https://github.com/google/protobuf
+
+.. _fuzzing-llvm-ir:
+
+Structured Fuzzing of LLVM IR
+-----------------------------
+
+We also use a more direct form of structured fuzzing for fuzzers that take
+:doc:`LLVM IR <LangRef>` as input. This is achieved through the ``FuzzMutate``
+library, which was `discussed at EuroLLVM 2017`_.
+
+The ``FuzzMutate`` library is used to structurally fuzz backends in
+`llvm-isel-fuzzer`_.
+
+.. _discussed at EuroLLVM 2017: https://www.youtube.com/watch?v=UBbQ_s6hNgg
+
+
+Building and Running
+====================
+
+.. _building-fuzzers:
+
+Configuring LLVM to Build Fuzzers
+---------------------------------
+
+Fuzzers will be built and linked to libFuzzer by default as long as you build
+LLVM with sanitizer coverage enabled. You would typically also enable at least
+one sanitizer to find bugs faster. The most common way to build the fuzzers is
+by adding the following two flags to your CMake invocation:
+``-DLLVM_USE_SANITIZER=Address -DLLVM_USE_SANITIZE_COVERAGE=On``.
+
+.. note:: If you have ``compiler-rt`` checked out in an LLVM tree when building
+          with sanitizers, you'll want to specify ``-DLLVM_BUILD_RUNTIME=Off``
+          to avoid building the sanitizers themselves with sanitizers enabled.
+
+Continuously Running and Finding Bugs
+-------------------------------------
+
+There used to be a public buildbot running LLVM fuzzers continuously, and while
+this did find issues, it didn't have a very good way to report problems in an
+actionable way. Because of this, we're moving towards using `OSS Fuzz`_ more
+instead.
+
+You can browse the `LLVM project issue list`_ for the bugs found by
+`LLVM on OSS Fuzz`_. These are also mailed to the `llvm-bugs mailing
+list`_.
+
+.. _OSS Fuzz: https://github.com/google/oss-fuzz
+.. _LLVM project issue list:
+   https://bugs.chromium.org/p/oss-fuzz/issues/list?q=Proj-llvm
+.. _LLVM on OSS Fuzz:
+   https://github.com/google/oss-fuzz/blob/master/projects/llvm
+.. _llvm-bugs mailing list:
+   http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs
+
+
+Utilities for Writing Fuzzers
+=============================
+
+There are some utilities available for writing fuzzers in LLVM.
+
+Some helpers for handling the command line interface are available in
+``include/llvm/FuzzMutate/FuzzerCLI.h``, including functions to parse command
+line options in a consistent way and to implement standalone main functions so
+your fuzzer can be built and tested when not built against libFuzzer.
+
+There is also some handling of the CMake config for fuzzers, where you should
+use the ``add_llvm_fuzzer`` function to set up fuzzer targets. This function
+works similarly to functions such as ``add_llvm_tool``, but it takes care of
+linking to LibFuzzer when appropriate and can be passed the ``DUMMY_MAIN``
+argument to enable standalone testing.
diff --git a/docs/GetElementPtr.rst b/docs/GetElementPtr.rst
index f39f1d9..c2da640 100644
--- a/docs/GetElementPtr.rst
+++ b/docs/GetElementPtr.rst
@@ -9,10 +9,11 @@
 ============
 
 This document seeks to dispel the mystery and confusion surrounding LLVM's
-`GetElementPtr <LangRef.html#i_getelementptr>`_ (GEP) instruction.  Questions
-about the wily GEP instruction are probably the most frequently occurring
-questions once a developer gets down to coding with LLVM. Here we lay out the
-sources of confusion and show that the GEP instruction is really quite simple.
+`GetElementPtr <LangRef.html#getelementptr-instruction>`_ (GEP) instruction.
+Questions about the wily GEP instruction are probably the most frequently
+occurring questions once a developer gets down to coding with LLVM. Here we lay
+out the sources of confusion and show that the GEP instruction is really quite
+simple.
 
 Address Computation
 ===================
@@ -26,7 +27,7 @@
 What is the first index of the GEP instruction?
 -----------------------------------------------
 
-Quick answer: The index stepping through the first operand.
+Quick answer: The index stepping through the second operand.
 
 The confusion with the first index usually arises from thinking about the
 GetElementPtr instruction as if it was a C index operator. They aren't the
@@ -58,7 +59,7 @@
   won't be dereferenced?*
 
 The answer is simply because memory does not have to be accessed to perform the
-computation. The first operand to the GEP instruction must be a value of a
+computation. The second operand to the GEP instruction must be a value of a
 pointer type. The value of the pointer is provided directly to the GEP
 instruction as an operand without any need for accessing memory. It must,
 therefore be indexed and requires an index operand. Consider this example:
@@ -79,8 +80,8 @@
 
 In this "C" example, the front end compiler (Clang) will generate three GEP
 instructions for the three indices through "P" in the assignment statement.  The
-function argument ``P`` will be the first operand of each of these GEP
-instructions.  The second operand indexes through that pointer.  The third
+function argument ``P`` will be the second operand of each of these GEP
+instructions.  The third operand indexes through that pointer.  The fourth
 operand will be the field offset into the ``struct munger_struct`` type, for
 either the ``f1`` or ``f2`` field. So, in LLVM assembly the ``munge`` function
 looks like:
@@ -99,8 +100,8 @@
     ret void
   }
 
-In each case the first operand is the pointer through which the GEP instruction
-starts. The same is true whether the first operand is an argument, allocated
+In each case the second operand is the pointer through which the GEP instruction
+starts. The same is true whether the second operand is an argument, allocated
 memory, or a global variable.
 
 To make this clear, let's consider a more obtuse example:
@@ -158,11 +159,11 @@
    i32 }*``. That is, ``%MyStruct`` is a pointer to a structure containing a
    pointer to a ``float`` and an ``i32``.
 
-#. Point #1 is evidenced by noticing the type of the first operand of the GEP
+#. Point #1 is evidenced by noticing the type of the second operand of the GEP
    instruction (``%MyStruct``) which is ``{ float*, i32 }*``.
 
 #. The first index, ``i64 0`` is required to step over the global variable
-   ``%MyStruct``.  Since the first argument to the GEP instruction must always
+   ``%MyStruct``.  Since the second argument to the GEP instruction must always
    be a value of pointer type, the first index steps through that pointer. A
    value of 0 means 0 elements offset from that pointer.
 
@@ -195,7 +196,7 @@
 In order to access the 18th integer in the array, you would need to do the
 following:
 
-.. code-block:: text
+.. code-block:: llvm
 
   %idx = getelementptr { [40 x i32]* }, { [40 x i32]* }* %, i64 0, i32 0
   %arr = load [40 x i32]** %idx
@@ -266,7 +267,7 @@
 What effect do address spaces have on GEPs?
 -------------------------------------------
 
-None, except that the address space qualifier on the first operand pointer type
+None, except that the address space qualifier on the second operand pointer type
 always matches the address space qualifier on the result type.
 
 How is GEP different from ``ptrtoint``, arithmetic, and ``inttoptr``?
@@ -429,7 +430,8 @@
 
 LLVM's type-based alias analysis pass uses metadata to describe a different type
 system (such as the C type system), and performs type-based aliasing on top of
-that.  Further details are in the `language reference <LangRef.html#tbaa>`_.
+that.  Further details are in the
+`language reference <LangRef.html#tbaa-metadata>`_.
 
 What happens if a GEP computation overflows?
 --------------------------------------------
@@ -524,7 +526,7 @@
 #. The GEP instruction never accesses memory, it only provides pointer
    computations.
 
-#. The first operand to the GEP instruction is always a pointer and it must be
+#. The second operand to the GEP instruction is always a pointer and it must be
    indexed.
 
 #. There are no superfluous indices for the GEP instruction.
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 2918a5c..a90a4b0 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -52,6 +52,12 @@
    * ``cd llvm/tools``
    * ``svn co http://llvm.org/svn/llvm-project/cfe/trunk clang``
 
+#. Checkout Extra Clang Tools **[Optional]**:
+
+   * ``cd where-you-want-llvm-to-live``
+   * ``cd llvm/tools/clang/tools``
+   * ``svn co http://llvm.org/svn/llvm-project/clang-tools-extra/trunk extra``
+   
 #. Checkout LLD linker **[Optional]**:
 
    * ``cd where-you-want-llvm-to-live``
@@ -91,9 +97,9 @@
 
 #. Configure and build LLVM and Clang:
 
-   *Warning:* Make sure you've checked out *all of* the source code 
+   *Warning:* Make sure you've checked out *all of* the source code
    before trying to configure with cmake.  cmake does not pickup newly
-   added source directories in incremental builds. 
+   added source directories in incremental builds.
 
    The build uses `CMake <CMake.html>`_. LLVM requires CMake 3.4.3 to build. It
    is generally recommended to use a recent CMake, especially if you're
@@ -137,8 +143,8 @@
      * CMake will generate build targets for each tool and library, and most
        LLVM sub-projects generate their own ``check-<project>`` target.
 
-     * Running a serial build will be *slow*.  Make sure you run a 
-       parallel build; for ``make``, use ``make -j``.  
+     * Running a serial build will be *slow*.  Make sure you run a
+       parallel build; for ``make``, use ``make -j``.
 
    * For more information see `CMake <CMake.html>`_
 
@@ -146,7 +152,7 @@
      `below`_.
 
 Consult the `Getting Started with LLVM`_ section for detailed information on
-configuring and compiling LLVM.  Go to `Directory Layout`_ to learn about the 
+configuring and compiling LLVM.  Go to `Directory Layout`_ to learn about the
 layout of the source code tree.
 
 Requirements
@@ -171,6 +177,8 @@
 Solaris            V9 (Ultrasparc)       GCC
 FreeBSD            x86\ :sup:`1`         GCC, Clang
 FreeBSD            amd64                 GCC, Clang
+NetBSD             x86\ :sup:`1`         GCC, Clang
+NetBSD             amd64                 GCC, Clang
 MacOS X\ :sup:`2`  PowerPC               GCC
 MacOS X            x86                   GCC, Clang
 Cygwin/Win32       x86\ :sup:`1, 3`      GCC
@@ -189,10 +197,10 @@
 Note that Debug builds require a lot of time and disk space.  An LLVM-only build
 will need about 1-3 GB of space.  A full build of LLVM and Clang will need around
 15-20 GB of disk space.  The exact space requirements will vary by system.  (It
-is so large because of all the debugging information and the fact that the 
-libraries are statically linked into multiple tools).  
+is so large because of all the debugging information and the fact that the
+libraries are statically linked into multiple tools).
 
-If you you are space-constrained, you can build only selected tools or only 
+If you are space-constrained, you can build only selected tools or only
 selected targets.  The Release build requires considerably less space.
 
 The LLVM suite *may* compile on other platforms, but it is not guaranteed to do
@@ -510,43 +518,43 @@
 
 .. code-block:: console
 
-  % git clone http://llvm.org/git/llvm.git
+  % git clone https://git.llvm.org/git/llvm.git/
 
 If you want to check out clang too, run:
 
 .. code-block:: console
 
   % cd llvm/tools
-  % git clone http://llvm.org/git/clang.git
+  % git clone https://git.llvm.org/git/clang.git/
 
 If you want to check out compiler-rt (required to build the sanitizers), run:
 
 .. code-block:: console
 
   % cd llvm/projects
-  % git clone http://llvm.org/git/compiler-rt.git
+  % git clone https://git.llvm.org/git/compiler-rt.git/
 
 If you want to check out libomp (required for OpenMP support), run:
 
 .. code-block:: console
 
   % cd llvm/projects
-  % git clone http://llvm.org/git/openmp.git
+  % git clone https://git.llvm.org/git/openmp.git/
 
 If you want to check out libcxx and libcxxabi (optional), run:
 
 .. code-block:: console
 
   % cd llvm/projects
-  % git clone http://llvm.org/git/libcxx.git
-  % git clone http://llvm.org/git/libcxxabi.git
+  % git clone https://git.llvm.org/git/libcxx.git/
+  % git clone https://git.llvm.org/git/libcxxabi.git/
 
 If you want to check out the Test Suite Source Code (optional), run:
 
 .. code-block:: console
 
   % cd llvm/projects
-  % git clone http://llvm.org/git/test-suite.git
+  % git clone https://git.llvm.org/git/test-suite.git/
 
 Since the upstream repository is in Subversion, you should use ``git
 pull --rebase`` instead of ``git pull`` to avoid generating a non-linear history
@@ -620,7 +628,7 @@
 
 .. code-block:: console
 
-  % git clone http://llvm.org/git/llvm.git
+  % git clone https://git.llvm.org/git/llvm.git/
   % cd llvm
   % git svn init https://llvm.org/svn/llvm-project/llvm/trunk --username=<username>
   % git config svn-remote.svn.fetch :refs/remotes/origin/master
@@ -628,7 +636,7 @@
 
   # If you have clang too:
   % cd tools
-  % git clone http://llvm.org/git/clang.git
+  % git clone https://git.llvm.org/git/clang.git/
   % cd clang
   % git svn init https://llvm.org/svn/llvm-project/cfe/trunk --username=<username>
   % git config svn-remote.svn.fetch :refs/remotes/origin/master
@@ -697,14 +705,14 @@
 
 .. note::
 
-   This set-up is using unofficial mirror hosted on GitHub, use with caution.
+   This set-up is using an unofficial mirror hosted on GitHub, use with caution.
 
 To set up a clone of all the llvm projects using a unified repository:
 
 .. code-block:: console
 
   % export TOP_LEVEL_DIR=`pwd`
-  % git clone https://github.com/llvm-project/llvm-project/
+  % git clone https://github.com/llvm-project/llvm-project-20170507/ llvm-project
   % cd llvm-project
   % git config branch.master.rebase true
 
@@ -733,8 +741,8 @@
   % mkdir clang-build && cd clang-build
   % cmake -GNinja ../llvm-project/llvm -DLLVM_ENABLE_PROJECTS="clang;libcxx;libcxxabi"
 
-A helper script is provided in `llvm/utils/git-svn/git-llvm`. After you add it
-to your path, you can push committed changes upstream with `git llvm push`.
+A helper script is provided in ``llvm/utils/git-svn/git-llvm``. After you add it
+to your path, you can push committed changes upstream with ``git llvm push``.
 
 .. code-block:: console
 
@@ -743,10 +751,22 @@
 
 While this is using SVN under the hood, it does not require any interaction from
 you with git-svn.
-After a few minutes, `git pull` should get back the changes as they were
-committed. Note that a current limitation is that `git` does not directly record
-file rename, and thus it is propagated to SVN as a combination of delete-add
-instead of a file rename.
+After a few minutes, ``git pull`` should get back the changes as they were
+committed. Note that a current limitation is that ``git`` does not directly
+record file renames, and thus a rename is propagated to SVN as a combination of
+delete-add instead of a file rename.
+
+The SVN revision of each monorepo commit can be found in the commit notes.  git
+does not fetch notes by default. The following commands will fetch the notes and
+configure git to fetch future notes. Use ``git notes show $commit`` to look up
+the SVN revision of a git commit. The notes show up in ``git log``, and
+searching the log is currently the recommended way to look up the git commit for
+a given SVN revision.
+
+.. code-block:: console
+
+  % git config --add remote.origin.fetch +refs/notes/commits:refs/notes/commits
+  % git fetch
 
 If you are using `arc` to interact with Phabricator, you need to manually put it
 at the root of the checkout:
@@ -805,7 +825,8 @@
 +-------------------------+----------------------------------------------------+
 | LLVM_ENABLE_SPHINX      | Build sphinx-based documentation from the source   |
 |                         | code. This is disabled by default because it is    |
-|                         | slow and generates a lot of output.                |
+|                         | slow and generates a lot of output. Sphinx version |
+|                         | 1.5 or later recommended.                          |
 +-------------------------+----------------------------------------------------+
 | LLVM_BUILD_LLVM_DYLIB   | Generate libLLVM.so. This library contains a       |
 |                         | default set of LLVM components that can be         |
@@ -995,7 +1016,7 @@
 ================
 
 One useful source of information about the LLVM source base is the LLVM `doxygen
-<http://www.doxygen.org/>`_ documentation available at 
+<http://www.doxygen.org/>`_ documentation available at
 `<http://llvm.org/doxygen/>`_.  The following is a brief introduction to code
 layout:
 
@@ -1011,13 +1032,13 @@
 
 ``llvm/include/llvm``
 
-  All LLVM-specific header files, and  subdirectories for different portions of 
+  All LLVM-specific header files, and subdirectories for different portions of
   LLVM: ``Analysis``, ``CodeGen``, ``Target``, ``Transforms``, etc...
 
 ``llvm/include/llvm/Support``
 
-  Generic support libraries provided with LLVM but not necessarily specific to 
-  LLVM. For example, some C++ STL utilities and a Command Line option processing 
+  Generic support libraries provided with LLVM but not necessarily specific to
+  LLVM. For example, some C++ STL utilities and a Command Line option processing
   library store header files here.
 
 ``llvm/include/llvm/Config``
@@ -1030,12 +1051,12 @@
 ``llvm/lib``
 ------------
 
-Most source files are here. By putting code in libraries, LLVM makes it easy to 
+Most source files are here. By putting code in libraries, LLVM makes it easy to
 share code among the `tools`_.
 
 ``llvm/lib/IR/``
 
-  Core LLVM source files that implement core classes like Instruction and 
+  Core LLVM source files that implement core classes like Instruction and
   BasicBlock.
 
 ``llvm/lib/AsmParser/``
@@ -1048,23 +1069,23 @@
 
 ``llvm/lib/Analysis/``
 
-  A variety of program analyses, such as Call Graphs, Induction Variables, 
+  A variety of program analyses, such as Call Graphs, Induction Variables,
   Natural Loop Identification, etc.
 
 ``llvm/lib/Transforms/``
 
-  IR-to-IR program transformations, such as Aggressive Dead Code Elimination, 
-  Sparse Conditional Constant Propagation, Inlining, Loop Invariant Code Motion, 
+  IR-to-IR program transformations, such as Aggressive Dead Code Elimination,
+  Sparse Conditional Constant Propagation, Inlining, Loop Invariant Code Motion,
   Dead Global Elimination, and many others.
 
 ``llvm/lib/Target/``
 
-  Files describing target architectures for code generation.  For example, 
+  Files describing target architectures for code generation.  For example,
   ``llvm/lib/Target/X86`` holds the X86 machine description.
 
 ``llvm/lib/CodeGen/``
 
-  The major parts of the code generator: Instruction Selector, Instruction 
+  The major parts of the code generator: Instruction Selector, Instruction
   Scheduling, and Register Allocation.
 
 ``llvm/lib/MC/``
@@ -1073,7 +1094,7 @@
 
 ``llvm/lib/ExecutionEngine/``
 
-  Libraries for directly executing bitcode at runtime in interpreted and 
+  Libraries for directly executing bitcode at runtime in interpreted and
   JIT-compiled scenarios.
 
 ``llvm/lib/Support/``
@@ -1084,7 +1105,7 @@
 ``llvm/projects``
 -----------------
 
-Projects not strictly part of LLVM but shipped with LLVM. This is also the 
+Projects not strictly part of LLVM but shipped with LLVM. This is also the
 directory for creating your own LLVM-based projects which leverage the LLVM
 build system.
 
@@ -1097,8 +1118,8 @@
 ``test-suite``
 --------------
 
-A comprehensive correctness, performance, and benchmarking test suite for LLVM. 
-Comes in a separate Subversion module because not every LLVM user is interested 
+A comprehensive correctness, performance, and benchmarking test suite for LLVM.
+Comes in a separate Subversion module because not every LLVM user is interested
 in such a comprehensive suite. For details see the :doc:`Testing Guide
 <TestingGuide>` document.
 
@@ -1150,7 +1171,7 @@
 ``llc``
 
   ``llc`` is the LLVM backend compiler, which translates LLVM bitcode to a
-  native code assembly file or to C code (with the ``-march=c`` option).
+  native code assembly file.
 
 ``opt``
 
@@ -1179,7 +1200,7 @@
 
 ``emacs/``
 
-   Emacs and XEmacs syntax highlighting  for LLVM   assembly files and TableGen 
+   Emacs and XEmacs syntax highlighting for LLVM assembly files and TableGen
    description files.  See the ``README`` for information on using them.
 
 ``getsrcs.sh``
diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst
index 1e46767..50f7aa1 100644
--- a/docs/GettingStartedVS.rst
+++ b/docs/GettingStartedVS.rst
@@ -100,6 +100,10 @@
    * CMake generates project files for all build types. To select a specific
      build type, use the Configuration manager from the VS IDE or the 
      ``/property:Configuration`` command line option when using MSBuild.
+   * By default, the Visual Studio project files generated by CMake use the
+     32-bit toolset. If you are developing on a 64-bit version of Windows and
+     want to use the 64-bit toolset, pass the ``-Thost=x64`` flag when
+     generating the Visual Studio solution. This requires CMake 3.8.0 or later.
 
 6. Start Visual Studio
 
diff --git a/docs/GlobalISel.rst b/docs/GlobalISel.rst
index 176bd4e..8746685 100644
--- a/docs/GlobalISel.rst
+++ b/docs/GlobalISel.rst
@@ -304,10 +304,13 @@
 Legalization is iterative, and all state is contained in GMIR.  To maintain the
 validity of the intermediate code, instructions are introduced:
 
-* ``G_SEQUENCE`` --- concatenate multiple registers into a single wider
-  register.
+* ``G_MERGE_VALUES`` --- concatenate multiple registers of the same
+  size into a single wider register.
 
-* ``G_EXTRACT`` --- extract multiple registers (as contiguous sequences of bits)
+* ``G_UNMERGE_VALUES`` --- extract multiple registers of the same size
+  from a single wider register.
+
+* ``G_EXTRACT`` --- extract a single register (as a contiguous sequence of bits)
   from a single wider register.
 
 As they are expected to be temporary byproducts of the legalization process,
@@ -500,16 +503,69 @@
 This target-provided method is responsible for mutating (or replacing) a
 possibly-generic MI into a fully target-specific equivalent.
 It is also responsible for doing the necessary constraining of gvregs into the
-appropriate register classes.
+appropriate register classes as well as passing through COPY instructions to
+the register allocator.
 
 The ``InstructionSelector`` can fold other instructions into the selected MI,
 by walking the use-def chain of the vreg operands.
 As GlobalISel is Global, this folding can occur across basic blocks.
 
-``TODO``:
-Currently, the Select pass is implemented with hand-written c++, similar to
-FastISel, rather than backed by tblgen'erated pattern-matching.
-We intend to eventually reuse SelectionDAG patterns.
+SelectionDAG Rule Imports
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+TableGen will import SelectionDAG rules and provide the following function to
+execute them:
+
+  .. code-block:: c++
+
+    bool selectImpl(MachineInstr &MI)
+
+The ``--stats`` option can be used to determine what proportion of rules were
+successfully imported. The easiest way to use this is to copy the
+``-gen-globalisel`` tablegen command from ``ninja -v`` and modify it.
+
+Similarly, the ``--warn-on-skipped-patterns`` option can be used to obtain the
+reasons that rules weren't imported. This can be used to focus on the most
+important rejection reasons.
+
+PatLeaf Predicates
+^^^^^^^^^^^^^^^^^^
+
+PatLeafs cannot be imported because their C++ is implemented in terms of
+``SDNode`` objects. PatLeafs that handle immediate predicates should be
+replaced by ``ImmLeaf``, ``IntImmLeaf``, or ``FPImmLeaf`` as appropriate.
+
+There's no standard answer for other PatLeafs. Some standard predicates have
+been baked into TableGen but this should not generally be done.
+
+Custom SDNodes
+^^^^^^^^^^^^^^
+
+Custom SDNodes should be mapped to Target Pseudos using ``GINodeEquiv``. This
+will cause the instruction selector to import them but you will also need to
+ensure the target pseudo is introduced to the MIR before the instruction
+selector. Any preceding pass is suitable but the legalizer will be a
+particularly common choice.
+
+ComplexPatterns
+^^^^^^^^^^^^^^^
+
+ComplexPatterns cannot be imported because their C++ is implemented in terms of
+``SDNode`` objects. GlobalISel versions should be defined with
+``GIComplexOperandMatcher`` and mapped to ComplexPattern with
+``GIComplexPatternEquiv``.
+
+The following predicates are useful for porting ComplexPattern:
+
+* isBaseWithConstantOffset() - Check for base+offset structures
+* isOperandImmEqual() - Check for a particular constant
+* isObviouslySafeToFold() - Check for reasons an instruction can't be sunk and folded into another.
+
+There are some important points for the C++ implementation:
+
+* Don't modify MIR in the predicate
+* Renderer lambdas should capture by value to avoid use-after-free. They will be used after the predicate returns.
+* Only create instructions in a renderer lambda. GlobalISel won't clean up things you create but don't use.
 
 
 .. _maintainability:
@@ -633,5 +689,14 @@
 
 * ``TargetPassConfig`` --- create the passes constituting the pipeline,
   including additional passes not included in the :ref:`pipeline`.
-* ``GISelAccessor`` --- setup the various subtarget-provided classes, with a
-  graceful fallback to no-op when GlobalISel isn't enabled.
+
+.. _other_resources:
+
+Resources
+=========
+
+* `Global Instruction Selection - A Proposal by Quentin Colombet @LLVMDevMeeting 2015 <https://www.youtube.com/watch?v=F6GGbYtae3g>`_
+* `Global Instruction Selection - Status by Quentin Colombet, Ahmed Bougacha, and Tim Northover @LLVMDevMeeting 2016 <https://www.youtube.com/watch?v=6tfb344A7w8>`_
+* `GlobalISel - LLVM's Latest Instruction Selection Framework by Diana Picus @FOSDEM17 <https://www.youtube.com/watch?v=d6dF6E4BPeU>`_
+* GlobalISel: Past, Present, and Future by Quentin Colombet and Ahmed Bougacha @LLVMDevMeeting 2017
+* Head First into GlobalISel by Daniel Sanders, Aditya Nandakumar, and Justin Bogner @LLVMDevMeeting 2017
diff --git a/docs/GoldPlugin.rst b/docs/GoldPlugin.rst
index 88b944a..78d38cc 100644
--- a/docs/GoldPlugin.rst
+++ b/docs/GoldPlugin.rst
@@ -7,7 +7,7 @@
 
 Building with link time optimization requires cooperation from
 the system linker. LTO support on Linux systems requires that you use the
-`gold linker`_ which supports LTO via plugins. This is the same mechanism
+`gold linker`_ or ld.bfd from binutils >= 2.21.51.0.2, as they support LTO via plugins. This is the same mechanism
 used by the `GCC LTO`_ project.
 
 The LLVM gold plugin implements the gold plugin interface on top of
@@ -23,24 +23,22 @@
 How to build it
 ===============
 
-You need to have gold with plugin support and build the LLVMgold plugin.
-Check whether you have gold running ``/usr/bin/ld -v``. It will report "GNU
-gold" or else "GNU ld" if not. If you have gold, check for plugin support
-by running ``/usr/bin/ld -plugin``. If it complains "missing argument" then
-you have plugin support. If not, such as an "unknown option" error then you
-will either need to build gold or install a version with plugin support.
+Check for plugin support by running ``/usr/bin/ld -plugin``. If it complains
+"missing argument" then you have plugin support. If it instead reports an error
+such as "unknown option", you will need to either build gold or install a recent
+version of ld.bfd with plugin support, and then build the gold plugin.
 
-* Download, configure and build gold with plugin support:
+* Download, configure and build ld.bfd with plugin support:
 
   .. code-block:: bash
 
      $ git clone --depth 1 git://sourceware.org/git/binutils-gdb.git binutils
      $ mkdir build
      $ cd build
-     $ ../binutils/configure --enable-gold --enable-plugins --disable-werror
-     $ make all-gold
+     $ ../binutils/configure --disable-werror # ld.bfd includes plugin support by default
+     $ make all-ld
 
-  That should leave you with ``build/gold/ld-new`` which supports
+  That should leave you with ``build/ld/ld-new`` which supports
   the ``-plugin`` option. Running ``make`` will additionally build
   ``build/binutils/ar`` and ``nm-new`` binaries supporting plugins.
 
diff --git a/docs/HowToAddABuilder.rst b/docs/HowToAddABuilder.rst
index fcc2293..201c71b 100644
--- a/docs/HowToAddABuilder.rst
+++ b/docs/HowToAddABuilder.rst
@@ -62,6 +62,9 @@
                     lab.llvm.org:9990 \
                     <buildslave-access-name> <buildslave-access-password>
 
+   To point a slave to the silent master, please use lab.llvm.org:9994 instead
+   of lab.llvm.org:9990.
+
 #. Fill the buildslave description and admin name/e-mail.  Here is an
    example of the buildslave description::
 
@@ -83,6 +86,8 @@
    * slaves are added to ``buildbot/osuosl/master/config/slaves.py``
    * builders are added to ``buildbot/osuosl/master/config/builders.py``
 
+   Please make sure your builder name and its builddir are unique throughout the file.
+
    It is possible to whitelist email addresses to unconditionally receive notifications
    on build failure; for this you'll need to add an ``InformativeMailNotifier`` to
    ``buildbot/osuosl/master/config/status.py``. This is particularly useful for the
diff --git a/docs/HowToReleaseLLVM.rst b/docs/HowToReleaseLLVM.rst
index 5ea6d49..ec3362e 100644
--- a/docs/HowToReleaseLLVM.rst
+++ b/docs/HowToReleaseLLVM.rst
@@ -256,6 +256,28 @@
 from the Meta and its priority decreased to *normal*. Debugging can continue,
 but on trunk.
 
+Merge Requests
+--------------
+
+You can use any of the following methods to request that a revision from trunk
+be merged into a release branch:
+
+#. Use the ``utils/release/merge-request.sh`` script which will automatically
+   file a bug_ requesting that the patch be merged. e.g. To request revision
+   12345 be merged into the branch for the 5.0.1 release:
+   ``llvm.src/utils/release/merge-request.sh -stable-version 5.0 -r 12345 -user bugzilla@example.com``
+
+#. Manually file a bug_ with the subject: "Merge r12345 into the X.Y branch",
+   enter the commit(s) that you want merged in the "Fixed by Commit(s)" field and
+   mark it as a blocker of the current release bug.  Release bugs are given aliases
+   in the form of release-x.y.z, so to mark a bug as a blocker for the 5.0.1
+   release, just enter release-5.0.1 in the "Blocks" field.
+
+#. Reply to the commit email on llvm-commits for the revision to merge and cc
+   the release manager.
+
+.. _bug: https://bugs.llvm.org/
+
 Release Patch Rules
 -------------------
 
diff --git a/docs/HowToUseAttributes.rst b/docs/HowToUseAttributes.rst
index 66c44c0..1d05e23 100644
--- a/docs/HowToUseAttributes.rst
+++ b/docs/HowToUseAttributes.rst
@@ -38,36 +38,35 @@
 convert any code which does treat them as a bit mask to use the new query
 methods on the Attribute class.
 
-``AttributeSet``
-================
+``AttributeList``
+=================
 
-The ``AttributeSet`` class replaces the old ``AttributeList`` class.  The
-``AttributeSet`` stores a collection of Attribute objects for each kind of
-object that may have an attribute associated with it: the function as a
-whole, the return type, or the function's parameters.  A function's attributes
-are at index ``AttributeSet::FunctionIndex``; the return type's attributes are
-at index ``AttributeSet::ReturnIndex``; and the function's parameters'
-attributes are at indices 1, ..., n (where 'n' is the number of parameters).
-Most methods on the ``AttributeSet`` class take an index parameter.
+The ``AttributeList`` stores a collection of Attribute objects for each kind of
+object that may have an attribute associated with it: the function as a whole,
+the return type, or the function's parameters.  A function's attributes are at
+index ``AttributeList::FunctionIndex``; the return type's attributes are at
+index ``AttributeList::ReturnIndex``; and the function's parameters' attributes
+are at indices 1, ..., n (where 'n' is the number of parameters).  Most methods
+on the ``AttributeList`` class take an index parameter.
 
-An ``AttributeSet`` is also a uniqued and immutable object.  You create an
-``AttributeSet`` through the ``AttributeSet::get`` methods.  You can add and
-remove attributes, which result in the creation of a new ``AttributeSet``.
+An ``AttributeList`` is also a uniqued and immutable object.  You create an
+``AttributeList`` through the ``AttributeList::get`` methods.  You can add and
+remove attributes, which result in the creation of a new ``AttributeList``.
 
-An ``AttributeSet`` object is designed to be passed around by value.
+An ``AttributeList`` object is designed to be passed around by value.
 
-Note: It is advised that you do *not* use the ``AttributeSet`` "introspection"
+Note: It is advised that you do *not* use the ``AttributeList`` "introspection"
 methods (e.g. ``Raw``, ``getRawPointer``, etc.).  These methods break
 encapsulation, and may be removed in a future release (i.e. LLVM 4.0).
 
 ``AttrBuilder``
 ===============
 
-Lastly, we have a "builder" class to help create the ``AttributeSet`` object
+Lastly, we have a "builder" class to help create the ``AttributeList`` object
 without having to create several different intermediate uniqued
-``AttributeSet`` objects.  The ``AttrBuilder`` class allows you to add and
+``AttributeList`` objects.  The ``AttrBuilder`` class allows you to add and
 remove attributes at will.  The attributes won't be uniqued until you call the
-appropriate ``AttributeSet::get`` method.
+appropriate ``AttributeList::get`` method.
 
 An ``AttrBuilder`` object is *not* designed to be passed around by value.  It
 should be passed by reference.
diff --git a/docs/LLVMBuild.rst b/docs/LLVMBuild.rst
index a93dcf6..622780a 100644
--- a/docs/LLVMBuild.rst
+++ b/docs/LLVMBuild.rst
@@ -54,7 +54,7 @@
 The build system implementation will load the relevant contents of the
 LLVMBuild files and use that to drive the actual project build.
 Typically, the build system will only need to load this information at
-"configure" time, and use it to generative native information. Build
+"configure" time, and use it to generate native information. Build
 systems will also handle automatically reconfiguring their information
 when the contents of the ``LLVMBuild.txt`` files change.
 
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index c5771ad..9d91056 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -161,7 +161,7 @@
 
     ; Definition of main function
     define i32 @main() {   ; i32()*
-      ; Convert [13 x i8]* to i8  *...
+      ; Convert [13 x i8]* to i8*...
       %cast210 = getelementptr [13 x i8], [13 x i8]* @.str, i64 0, i64 0
 
       ; Call puts function to write out the string to stdout.
@@ -527,6 +527,24 @@
 For platforms without linker support of ELF TLS model, the -femulated-tls
 flag can be used to generate GCC compatible emulated TLS code.
 
+.. _runtime_preemption_model:
+
+Runtime Preemption Specifiers
+-----------------------------
+
+Global variables, functions and aliases may have an optional runtime preemption
+specifier. If a preemption specifier isn't given explicitly, then a
+symbol is assumed to be ``dso_preemptable``.
+
+``dso_preemptable``
+    Indicates that the function or variable may be replaced by a symbol from
+    outside the linkage unit at runtime.
+
+``dso_local``
+    The compiler may assume that a function or variable marked as ``dso_local``
+    will resolve to a symbol within the same linkage unit. Direct access will 
+    be generated even if the definition is not within this compilation unit.
+
 .. _namedtypes:
 
 Structure Types
@@ -579,7 +597,9 @@
 case they don't have an initializer.
 
 Either global variable definitions or declarations may have an explicit section
-to be placed in and may have an optional explicit alignment specified.
+to be placed in and may have an optional explicit alignment specified. If there 
+is a mismatch between the explicit or inferred section information for the 
+variable declaration and its definition, the resulting behavior is undefined.
 
 A variable may be defined as a global ``constant``, which indicates that
 the contents of the variable will **never** be modified (enabling better
@@ -622,6 +642,12 @@
 Additionally, the global can placed in a comdat if the target has the necessary
 support.
 
+External declarations may have an explicit section specified. Section 
+information is retained in LLVM IR for targets that make use of this 
+information. Attaching section information to an external declaration is an 
+assertion that its definition is located in the specified section. If the 
+definition is located in a different section, the behavior is undefined.   
+
 By default, global initializers are optimized by assuming that global
 variables defined within the module are not modified from their
 initial values before the start of the global initializer. This is
@@ -641,15 +667,18 @@
 iterate over them as an array, alignment padding would break this
 iteration. The maximum alignment is ``1 << 29``.
 
-Globals can also have a :ref:`DLL storage class <dllstorageclass>` and
-an optional list of attached :ref:`metadata <metadata>`,
+Globals can also have a :ref:`DLL storage class <dllstorageclass>`,
+an optional :ref:`runtime preemption specifier <runtime_preemption_model>`,
+optional :ref:`global attributes <glattrs>` and
+an optional list of attached :ref:`metadata <metadata>`.
 
 Variables and aliases can have a
 :ref:`Thread Local Storage Model <tls_model>`.
 
 Syntax::
 
-      @<GlobalVarName> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal]
+      @<GlobalVarName> = [Linkage] [PreemptionSpecifier] [Visibility]
+                         [DLLStorageClass] [ThreadLocal]
                          [(unnamed_addr|local_unnamed_addr)] [AddrSpace]
                          [ExternallyInitialized]
                          <global | constant> <Type> [<InitializerConstant>]
@@ -682,7 +711,8 @@
 ---------
 
 LLVM function definitions consist of the "``define``" keyword, an
-optional :ref:`linkage type <linkage>`, an optional :ref:`visibility
+optional :ref:`linkage type <linkage>`, an optional :ref:`runtime preemption
+specifier <runtime_preemption_model>`,  an optional :ref:`visibility
 style <visibility>`, an optional :ref:`DLL storage class <dllstorageclass>`,
 an optional :ref:`calling convention <callingconv>`,
 an optional ``unnamed_addr`` attribute, a return type, an optional
@@ -741,7 +771,7 @@
 
 Syntax::
 
-    define [linkage] [visibility] [DLLStorageClass]
+    define [linkage] [PreemptionSpecifier] [visibility] [DLLStorageClass]
            [cconv] [ret attrs]
            <ResultType> @<FunctionName> ([argument list])
            [(unnamed_addr|local_unnamed_addr)] [fn Attrs] [section "name"]
@@ -768,12 +798,13 @@
 constant expression.
 
 Aliases may have an optional :ref:`linkage type <linkage>`, an optional
+:ref:`runtime preemption specifier <runtime_preemption_model>`, an optional
 :ref:`visibility style <visibility>`, an optional :ref:`DLL storage class
 <dllstorageclass>` and an optional :ref:`tls model <tls_model>`.
 
 Syntax::
 
-    @<Name> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal] [(unnamed_addr|local_unnamed_addr)] alias <AliaseeTy>, <AliaseeTy>* @<Aliasee>
+    @<Name> = [Linkage] [PreemptionSpecifier] [Visibility] [DLLStorageClass] [ThreadLocal] [(unnamed_addr|local_unnamed_addr)] alias <AliaseeTy>, <AliaseeTy>* @<Aliasee>
 
 The linkage must be one of ``private``, ``internal``, ``linkonce``, ``weak``,
 ``linkonce_odr``, ``weak_odr``, ``external``. Note that some system linkers
@@ -1380,6 +1411,9 @@
 ``naked``
     This attribute disables prologue / epilogue emission for the
     function. This can have very system-specific consequences.
+``"no-jump-tables"``
+    When this attribute is set to true, the jump tables and lookup tables that
+    can be generated from a switch case lowering are disabled.
 ``nobuiltin``
     This indicates that the callee function at a call site is not recognized as
     a built-in function. LLVM will retain the original call and not replace it
@@ -1467,6 +1501,19 @@
     This attribute by itself does not imply restrictions on
     inter-procedural optimizations.  All of the semantic effects the
     patching may have to be separately conveyed via the linkage type.
+``"probe-stack"``
+    This attribute indicates that the function will trigger a guard region
+    at the end of the stack. It ensures that accesses to the stack must be
+    no further apart than the size of the guard region to a previous
+    access of the stack. It takes one required string value, the name of
+    the stack probing function that will be called.
+
+    If a function that has a ``"probe-stack"`` attribute is inlined into
+    a function with another ``"probe-stack"`` attribute, the resulting
+    function has the ``"probe-stack"`` attribute of the caller. If a
+    function that has a ``"probe-stack"`` attribute is inlined into a
+    function that has no ``"probe-stack"`` attribute at all, the resulting
+    function has the ``"probe-stack"`` attribute of the callee.
 ``readnone``
     On a function, this attribute indicates that the function computes its
     result (or decides to unwind an exception) based strictly on its arguments,
@@ -1497,6 +1544,21 @@
     On an argument, this attribute indicates that the function does not write
     through this pointer argument, even though it may write to the memory that
     the pointer points to.
+``"stack-probe-size"``
+    This attribute controls the behavior of stack probes: either
+    the ``"probe-stack"`` attribute, or ABI-required stack probes, if any.
+    It defines the size of the guard region. It ensures that if the function
+    may use more stack space than the size of the guard region, a stack probing
+    sequence will be emitted. It takes one required integer value, which
+    is 4096 by default.
+
+    If a function that has a ``"stack-probe-size"`` attribute is inlined into
+    a function with another ``"stack-probe-size"`` attribute, the resulting
+    function has the ``"stack-probe-size"`` attribute that has the lower
+    numeric value. If a function that has a ``"stack-probe-size"`` attribute is
+    inlined into a function that has no ``"stack-probe-size"`` attribute
+    at all, the resulting function has the ``"stack-probe-size"`` attribute
+    of the callee.
 ``writeonly``
     On a function, this attribute indicates that the function may write to but
     does not read from memory.
@@ -1535,6 +1597,17 @@
 ``sanitize_thread``
     This attribute indicates that ThreadSanitizer checks
     (dynamic thread safety analysis) are enabled for this function.
+``speculatable``
+    This function attribute indicates that the function does not have any
+    effects besides calculating its result and does not have undefined behavior.
+    Note that ``speculatable`` is not enough to conclude that along any
+    particular execution path the number of calls to this function will not be
+    externally observable. This attribute is only valid on functions
+    and declarations, not on individual call sites. If a function is
+    incorrectly marked as speculatable and really does exhibit
+    undefined behavior, the undefined behavior may be observed even
+    if the call site is dead code.
+
 ``ssp``
     This attribute indicates that the function should emit a stack
     smashing protector. It is in the form of a "canary" --- a random value
@@ -1601,6 +1674,12 @@
     If a function that has an ``sspstrong`` attribute is inlined into a
     function that doesn't have an ``sspstrong`` attribute, then the
     resulting function will have an ``sspstrong`` attribute.
+``strictfp``
+    This attribute indicates that the function was called from a scope that
+    requires strict floating point semantics.  LLVM will not attempt any
+    optimizations that require assumptions about the floating point rounding
+    mode or that might alter the state of floating point status flags that
+    might otherwise be set or cleared by calling this function.
 ``"thunk"``
     This attribute indicates that the function will delegate to some other
     function with a tail call. The prototype of a thunk should not be used for
@@ -1613,6 +1692,14 @@
     the ELF x86-64 abi, but it can be disabled for some compilation
     units.
 
+.. _glattrs:
+
+Global Attributes
+-----------------
+
+Attributes may be set to communicate additional information about a global variable.
+Unlike :ref:`function attributes <fnattrs>`, attributes on a global variable
+are grouped into a single :ref:`attribute group <attrgrp>`.
 
 .. _opbundles:
 
@@ -1812,6 +1899,9 @@
     must be a multiple of 8-bits. If omitted, the natural stack
     alignment defaults to "unspecified", which does not prevent any
     alignment promotions.
+``A<address space>``
+    Specifies the address space of  objects created by '``alloca``'.
+    Defaults to the default address space of 0.
 ``p[n]:<size>:<abi>:<pref>``
     This specifies the *size* of a pointer and its ``<abi>`` and
     ``<pref>``\erred alignments for address space ``n``. All sizes are in
@@ -1965,8 +2055,11 @@
 A pointer value is *based* on another pointer value according to the
 following rules:
 
--  A pointer value formed from a ``getelementptr`` operation is *based*
-   on the first value operand of the ``getelementptr``.
+-  A pointer value formed from a scalar ``getelementptr`` operation is *based* on
+   the pointer-typed operand of the ``getelementptr``.
+-  The pointer in lane *l* of the result of a vector ``getelementptr`` operation
+   is *based* on the pointer in lane *l* of the vector-of-pointers-typed operand
+   of the ``getelementptr``.
 -  The result value of a ``bitcast`` is *based* on the operand of the
    ``bitcast``.
 -  A pointer value formed by an ``inttoptr`` is *based* on all pointer
@@ -2158,12 +2251,21 @@
     same address in this global order. This corresponds to the C++0x/C1x
     ``memory_order_seq_cst`` and Java volatile.
 
-.. _singlethread:
+.. _syncscope:
 
-If an atomic operation is marked ``singlethread``, it only *synchronizes
-with* or participates in modification and seq\_cst total orderings with
-other operations running in the same thread (for example, in signal
-handlers).
+If an atomic operation is marked ``syncscope("singlethread")``, it only
+*synchronizes with* and only participates in the seq\_cst total orderings of
+other operations running in the same thread (for example, in signal handlers).
+
+If an atomic operation is marked ``syncscope("<target-scope>")``, where
+``<target-scope>`` is a target specific synchronization scope, then it is target
+dependent if it *synchronizes with* and participates in the seq\_cst total
+orderings of other operations.
+
+Otherwise, an atomic operation that is not marked ``syncscope("singlethread")``
+or ``syncscope("<target-scope>")`` *synchronizes with* and participates in the
+seq\_cst total orderings of other operations that are not marked
+``syncscope("singlethread")`` or ``syncscope("<target-scope>")``.
 
 .. _fastmath:
 
@@ -2194,6 +2296,10 @@
    Allow Reciprocal - Allow optimizations to use the reciprocal of an
    argument rather than perform division.
 
+``contract``
+   Allow floating-point contraction (e.g. fusing a multiply followed by an
+   addition into a fused multiply-and-add).
+
 ``fast``
    Fast - Allow algebraically equivalent transformations that may
    dramatically change results in floating point (e.g. reassociate). This
@@ -3078,14 +3184,11 @@
 The following is the syntax for constant expressions:
 
 ``trunc (CST to TYPE)``
-    Truncate a constant to another type. The bit size of CST must be
-    larger than the bit size of TYPE. Both types must be integers.
+    Perform the :ref:`trunc operation <i_trunc>` on constants.
 ``zext (CST to TYPE)``
-    Zero extend a constant to another type. The bit size of CST must be
-    smaller than the bit size of TYPE. Both types must be integers.
+    Perform the :ref:`zext operation <i_zext>` on constants.
 ``sext (CST to TYPE)``
-    Sign extend a constant to another type. The bit size of CST must be
-    smaller than the bit size of TYPE. Both types must be integers.
+    Perform the :ref:`sext operation <i_sext>` on constants.
 ``fptrunc (CST to TYPE)``
     Truncate a floating point constant to another floating point type.
     The size of CST must be larger than the size of TYPE. Both types
@@ -3119,19 +3222,14 @@
     be scalars, or vectors of the same number of elements. If the value
     won't fit in the floating point type, the results are undefined.
 ``ptrtoint (CST to TYPE)``
-    Convert a pointer typed constant to the corresponding integer
-    constant. ``TYPE`` must be an integer type. ``CST`` must be of
-    pointer type. The ``CST`` value is zero extended, truncated, or
-    unchanged to make it fit in ``TYPE``.
+    Perform the :ref:`ptrtoint operation <i_ptrtoint>` on constants.
 ``inttoptr (CST to TYPE)``
-    Convert an integer constant to a pointer constant. TYPE must be a
-    pointer type. CST must be of integer type. The CST value is zero
-    extended, truncated, or unchanged to make it fit in a pointer size.
+    Perform the :ref:`inttoptr operation <i_inttoptr>` on constants.
     This one is *really* dangerous!
 ``bitcast (CST to TYPE)``
-    Convert a constant, CST, to another TYPE. The constraints of the
-    operands are the same as those for the :ref:`bitcast
-    instruction <i_bitcast>`.
+    Convert a constant, CST, to another TYPE.
+    The constraints of the operands are the same as those for the
+    :ref:`bitcast instruction <i_bitcast>`.
 ``addrspacecast (CST to TYPE)``
     Convert a constant pointer or constant vector of pointer, CST, to another
     TYPE in a different address space. The constraints of the operands are the
@@ -3139,14 +3237,14 @@
 ``getelementptr (TY, CSTPTR, IDX0, IDX1, ...)``, ``getelementptr inbounds (TY, CSTPTR, IDX0, IDX1, ...)``
     Perform the :ref:`getelementptr operation <i_getelementptr>` on
     constants. As with the :ref:`getelementptr <i_getelementptr>`
-    instruction, the index list may have zero or more indexes, which are
+    instruction, the index list may have one or more indexes, which are
     required to make sense for the type of "pointer to TY".
 ``select (COND, VAL1, VAL2)``
     Perform the :ref:`select operation <i_select>` on constants.
 ``icmp COND (VAL1, VAL2)``
-    Performs the :ref:`icmp operation <i_icmp>` on constants.
+    Perform the :ref:`icmp operation <i_icmp>` on constants.
 ``fcmp COND (VAL1, VAL2)``
-    Performs the :ref:`fcmp operation <i_fcmp>` on constants.
+    Perform the :ref:`fcmp operation <i_fcmp>` on constants.
 ``extractelement (VAL, IDX)``
     Perform the :ref:`extractelement operation <i_extractelement>` on
     constants.
@@ -3646,6 +3744,9 @@
 
 - ``I``: An immediate 13-bit signed integer.
 - ``r``: A 32-bit integer register.
+- ``f``: Any floating-point register on SparcV8, or a floating point
+  register in the "low" half of the registers on SparcV9.
+- ``e``: Any floating point register. (Same as ``f`` on SparcV8.)
 
 SystemZ:
 
@@ -3946,12 +4047,12 @@
 
     !foo = !{!4, !3}
 
-Metadata can be used as function arguments. Here ``llvm.dbg.value``
-function is using two metadata arguments:
+Metadata can be used as function arguments. Here the ``llvm.dbg.value``
+intrinsic is using three metadata arguments:
 
 .. code-block:: llvm
 
-    call void @llvm.dbg.value(metadata !24, i64 0, metadata !25)
+    call void @llvm.dbg.value(metadata !24, metadata !25, metadata !26)
 
 Metadata can be attached to an instruction. Here metadata ``!21`` is attached
 to the ``add`` instruction using the ``!dbg`` identifier:
@@ -4003,26 +4104,26 @@
 """""""""""""
 
 ``DICompileUnit`` nodes represent a compile unit. The ``enums:``,
-``retainedTypes:``, ``subprograms:``, ``globals:``, ``imports:`` and ``macros:``
-fields are tuples containing the debug info to be emitted along with the compile
-unit, regardless of code optimizations (some nodes are only emitted if there are
-references to them from instructions). The ``debugInfoForProfiling:`` field is a
-boolean indicating whether or not line-table discriminators are updated to
-provide more-accurate debug info for profiling results.
+``retainedTypes:``, ``globals:``, ``imports:`` and ``macros:`` fields are tuples
+containing the debug info to be emitted along with the compile unit, regardless
+of code optimizations (some nodes are only emitted if there are references to
+them from instructions). The ``debugInfoForProfiling:`` field is a boolean
+indicating whether or not line-table discriminators are updated to provide
+more-accurate debug info for profiling results.
 
 .. code-block:: text
 
     !0 = !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang",
                         isOptimized: true, flags: "-O2", runtimeVersion: 2,
                         splitDebugFilename: "abc.debug", emissionKind: FullDebug,
-                        enums: !2, retainedTypes: !3, subprograms: !4,
-                        globals: !5, imports: !6, macros: !7, dwoId: 0x0abcd)
+                        enums: !2, retainedTypes: !3, globals: !4, imports: !5,
+                        macros: !6, dwoId: 0x0abcd)
 
 Compile unit descriptors provide the root scope for objects declared in a
-specific compilation unit. File descriptors are defined using this scope.
-These descriptors are collected by a named metadata ``!llvm.dbg.cu``. They
-keep track of subprograms, global variables, type information, and imported
-entities (declarations and namespaces).
+specific compilation unit. File descriptors are defined using this scope.  These
+descriptors are collected by a named metadata node ``!llvm.dbg.cu``. They keep
+track of global variables, type information, and imported entities (declarations
+and namespaces).
 
 .. _DIFile:
 
@@ -4296,8 +4397,8 @@
                                 containingType: !4,
                                 virtuality: DW_VIRTUALITY_pure_virtual,
                                 virtualIndex: 10, flags: DIFlagPrototyped,
-                                isOptimized: true, templateParams: !5,
-                                declaration: !6, variables: !7)
+                                isOptimized: true, unit: !5, templateParams: !6,
+                                declaration: !7, variables: !8, thrownTypes: !9)
 
 .. _DILexicalBlock:
 
@@ -4366,29 +4467,47 @@
 DIExpression
 """"""""""""
 
-``DIExpression`` nodes represent DWARF expression sequences. They are used in
-:ref:`debug intrinsics<dbg_intrinsics>` (such as ``llvm.dbg.declare``) to
-describe how the referenced LLVM variable relates to the source language
-variable.
+``DIExpression`` nodes represent expressions that are inspired by the DWARF
+expression language. They are used in :ref:`debug intrinsics<dbg_intrinsics>`
+(such as ``llvm.dbg.declare`` and ``llvm.dbg.value``) to describe how the
+referenced LLVM variable relates to the source language variable.
 
 The current supported vocabulary is limited:
 
-- ``DW_OP_deref`` dereferences the working expression.
-- ``DW_OP_plus, 93`` adds ``93`` to the working expression.
-- ``DW_OP_bit_piece, 16, 8`` specifies the offset and size (``16`` and ``8``
-  here, respectively) of the variable piece from the working expression.
+- ``DW_OP_deref`` dereferences the top of the expression stack.
+- ``DW_OP_plus`` pops the last two entries from the expression stack, adds
+  them together and appends the result to the expression stack.
+- ``DW_OP_minus`` pops the last two entries from the expression stack, subtracts
+  the last entry from the second last entry and appends the result to the
+  expression stack.
+- ``DW_OP_plus_uconst, 93`` adds ``93`` to the working expression.
+- ``DW_OP_LLVM_fragment, 16, 8`` specifies the offset and size (``16`` and ``8``
+  here, respectively) of the variable fragment from the working expression. Note
+  that contrary to DW_OP_bit_piece, the offset is describing the location
+  within the described source variable.
 - ``DW_OP_swap`` swaps top two stack entries.
 - ``DW_OP_xderef`` provides extended dereference mechanism. The entry at the top
   of the stack is treated as an address. The second stack entry is treated as an
   address space identifier.
+- ``DW_OP_stack_value`` marks a constant value.
 
-.. code-block:: text
+DWARF specifies three kinds of simple location descriptions: Register, memory,
+and implicit location descriptions. Register and memory location descriptions
+describe the *location* of a source variable (in the sense that a debugger might
+modify its value), whereas implicit locations describe merely the *value* of a
+source variable. DIExpressions also follow this model: A DIExpression that
+doesn't have a trailing ``DW_OP_stack_value`` will describe an *address* when
+combined with a concrete location.
+
+.. code-block:: llvm
 
     !0 = !DIExpression(DW_OP_deref)
-    !1 = !DIExpression(DW_OP_plus, 3)
+    !1 = !DIExpression(DW_OP_plus_uconst, 3)
+    !1 = !DIExpression(DW_OP_constu, 3, DW_OP_plus)
     !2 = !DIExpression(DW_OP_bit_piece, 3, 7)
-    !3 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_bit_piece, 3, 7)
+    !3 = !DIExpression(DW_OP_deref, DW_OP_constu, 3, DW_OP_plus, DW_OP_LLVM_fragment, 3, 7)
     !4 = !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)
+    !5 = !DIExpression(DW_OP_constu, 42, DW_OP_stack_value)
 
 DIObjCProperty
 """"""""""""""
@@ -4773,6 +4892,23 @@
     !0 = !{ i64 0, i64 256 }
     !1 = !{ i64 -1, i64 -1 }
 
+'``callees``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^
+
+``callees`` metadata may be attached to indirect call sites. If ``callees``
+metadata is attached to a call site, and any callee is not among the set of
+functions provided by the metadata, the behavior is undefined. The intent of
+this metadata is to facilitate optimizations such as indirect-call promotion.
+For example, in the code below, the call instruction may only target the
+``add`` or ``sub`` functions:
+
+.. code-block:: llvm
+
+    %result = call i64 %binop(i64 %x, i64 %y), !callees !0
+
+    ...
+    !0 = !{i64 (i64, i64)* @add, i64 (i64, i64)* @sub}
+
 '``unpredictable``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -4958,7 +5094,7 @@
 
 Loop distribution allows splitting a loop into multiple loops.  Currently,
 this is only performed if the entire loop cannot be vectorized due to unsafe
-memory dependencies.  The transformation will atempt to isolate the unsafe
+memory dependencies.  The transformation will attempt to isolate the unsafe
 dependencies into their own loop.
 
 This metadata can be used to selectively enable or disable distribution of the
@@ -5101,11 +5237,114 @@
    !0 = !{!"magic ptr"}
    !1 = !{!"other ptr"}
 
+The invariant.group metadata must be dropped when replacing one pointer by
+another based on aliasing information. This is because invariant.group is tied
+to the SSA value of the pointer operand.
+
+.. code-block:: llvm
+  
+  %v = load i8, i8* %x, !invariant.group !0
+  ; if %x mustalias %y then we can replace the above instruction with
+  %v = load i8, i8* %y
+
+
 '``type``' Metadata
 ^^^^^^^^^^^^^^^^^^^
 
 See :doc:`TypeMetadata`.
 
+'``associated``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``associated`` metadata may be attached to a global object
+declaration with a single argument that references another global object.
+
+This metadata prevents discarding of the global object in linker GC
+unless the referenced object is also discarded. The linker support for
+this feature is spotty. For best compatibility, globals carrying this
+metadata may also:
+
+- Be in a comdat with the referenced global.
+- Be in ``@llvm.compiler.used``.
+- Have an explicit section with a name which is a valid C identifier.
+
+It does not have any effect on non-ELF targets.
+
+Example:
+
+.. code-block:: llvm
+
+    $a = comdat any
+    @a = global i32 1, comdat $a
+    @b = internal global i32 2, comdat $a, section "abc", !associated !0
+    !0 = !{i32* @a}
+
+
+'``prof``' Metadata
+^^^^^^^^^^^^^^^^^^^
+
+The ``prof`` metadata is used to record profile data in the IR.
+The first operand of the metadata node indicates the profile metadata
+type. There are currently 3 types:
+:ref:`branch_weights<prof_node_branch_weights>`,
+:ref:`function_entry_count<prof_node_function_entry_count>`, and
+:ref:`VP<prof_node_VP>`.
+
+.. _prof_node_branch_weights:
+
+branch_weights
+""""""""""""""
+
+Branch weight metadata attached to a branch, select, switch or call instruction
+represents the likelihood of the associated branch being taken.
+For more information, see :doc:`BranchWeightMetadata`.
+
+.. _prof_node_function_entry_count:
+
+function_entry_count
+""""""""""""""""""""
+
+Function entry count metadata can be attached to function definitions
+to record the number of times the function is called. Used with BFI
+information, it is also used to derive the basic block profile count.
+For more information, see :doc:`BranchWeightMetadata`.
+
+.. _prof_node_VP:
+
+VP
+""
+
+VP (value profile) metadata can be attached to instructions that have
+value profile information. Currently this is indirect calls (where it
+records the hottest callees) and calls to memory intrinsics such as memcpy,
+memmove, and memset (where it records the hottest byte lengths).
+
+Each VP metadata node contains the "VP" string, then a uint32_t value for the
+value profiling kind, a uint64_t value for the total number of times the
+instruction is executed, followed by uint64_t value and execution count pairs.
+The value profiling kind is 0 for indirect call targets and 1 for memory
+operations. For indirect call targets, each profile value is a hash
+of the callee function name, and for memory operations each value is the
+byte length.
+
+Note that the value counts do not need to add up to the total count
+listed in the third operand (in practice only the top hottest values
+are tracked and reported).
+
+Indirect call example:
+
+.. code-block:: llvm
+
+    call void %f(), !prof !1
+    !1 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410}
+
+Note that the VP type is 0 (the second operand), which indicates that this is
+indirect call value profile data. The third operand indicates that the
+indirect call executed 1600 times. The 4th and 6th operands give the
+hashes of the 2 hottest target functions' names (this is the same hash used
+to represent function names in the profile database), and the 5th and 7th
+operands give the number of times that each of the respective prior target
+functions was called.
 
 Module Flags Metadata
 =====================
@@ -5181,6 +5420,10 @@
            nodes. However, duplicate entries in the second list are dropped
            during the append operation.
 
+   * - 7
+     - **Max**
+           Takes the max of the two values, which are required to be integers.
+
 It is an error for a particular unique flag ID to have multiple behaviors,
 except in the case of **Require** (which adds restrictions on another metadata
 value) or **Override**.
@@ -5273,40 +5516,6 @@
 -  A module with ``Objective-C Garbage Collection`` set to 0 cannot be
    merged with a module with ``Objective-C GC Only`` set to 6.
 
-Automatic Linker Flags Module Flags Metadata
---------------------------------------------
-
-Some targets support embedding flags to the linker inside individual object
-files. Typically this is used in conjunction with language extensions which
-allow source files to explicitly declare the libraries they depend on, and have
-these automatically be transmitted to the linker via object files.
-
-These flags are encoded in the IR using metadata in the module flags section,
-using the ``Linker Options`` key. The merge behavior for this flag is required
-to be ``AppendUnique``, and the value for the key is expected to be a metadata
-node which should be a list of other metadata nodes, each of which should be a
-list of metadata strings defining linker options.
-
-For example, the following metadata section specifies two separate sets of
-linker options, presumably to link against ``libz`` and the ``Cocoa``
-framework::
-
-    !0 = !{ i32 6, !"Linker Options",
-       !{
-          !{ !"-lz" },
-          !{ !"-framework", !"Cocoa" } } }
-    !llvm.module.flags = !{ !0 }
-
-The metadata encoding as lists of lists of options, as opposed to a collapsed
-list of options, is chosen so that the IR encoding can use multiple option
-strings to specify e.g., a single library, while still having that specifier be
-preserved as an atomic element that can be recognized by a target specific
-assembly writer or object file emitter.
-
-Each individual option is required to be either a valid option for the target's
-linker, or an option that is reserved by the target specific assembly writer or
-object file emitter. No other aspect of these options is defined by the IR.
-
 C type width Module Flags Metadata
 ----------------------------------
 
@@ -5343,6 +5552,37 @@
     !0 = !{i32 1, !"short_wchar", i32 1}
     !1 = !{i32 1, !"short_enum", i32 0}
 
+Automatic Linker Flags Named Metadata
+=====================================
+
+Some targets support embedding flags to the linker inside individual object
+files. Typically this is used in conjunction with language extensions which
+allow source files to explicitly declare the libraries they depend on, and have
+these automatically be transmitted to the linker via object files.
+
+These flags are encoded in the IR using named metadata with the name
+``!llvm.linker.options``. Each operand is expected to be a metadata node
+which should be a list of other metadata nodes, each of which should be a
+list of metadata strings defining linker options.
+
+For example, the following metadata section specifies two separate sets of
+linker options, presumably to link against ``libz`` and the ``Cocoa``
+framework::
+
+    !0 = !{ !"-lz" }
+    !1 = !{ !"-framework", !"Cocoa" }
+    !llvm.linker.options = !{ !0, !1 }
+
+The metadata encoding as lists of lists of options, as opposed to a collapsed
+list of options, is chosen so that the IR encoding can use multiple option
+strings to specify e.g., a single library, while still having that specifier be
+preserved as an atomic element that can be recognized by a target specific
+assembly writer or object file emitter.
+
+Each individual option is required to be either a valid option for the target's
+linker, or an option that is reserved by the target specific assembly writer or
+object file emitter. No other aspect of these options is defined by the IR.
+
 .. _intrinsicglobalvariables:
 
 Intrinsic Global Variables
@@ -5755,9 +5995,7 @@
 #. '``exception label``': the label reached when a callee returns via
    the :ref:`resume <i_resume>` instruction or other exception handling
    mechanism.
-#. The optional :ref:`function attributes <fnattrs>` list. Only
-   '``noreturn``', '``nounwind``', '``readonly``' and '``readnone``'
-   attributes are valid here.
+#. The optional :ref:`function attributes <fnattrs>` list.
 #. The optional :ref:`operand bundles <opbundles>` list.
 
 Semantics:
@@ -6614,15 +6852,14 @@
 The value produced is ``op1`` \* 2\ :sup:`op2` mod 2\ :sup:`n`,
 where ``n`` is the width of the result. If ``op2`` is (statically or
 dynamically) equal to or larger than the number of bits in
-``op1``, the result is undefined. If the arguments are vectors, each
-vector element of ``op1`` is shifted by the corresponding shift amount
-in ``op2``.
+``op1``, this instruction returns a :ref:`poison value <poisonvalues>`.
+If the arguments are vectors, each vector element of ``op1`` is shifted
+by the corresponding shift amount in ``op2``.
 
-If the ``nuw`` keyword is present, then the shift produces a :ref:`poison
-value <poisonvalues>` if it shifts out any non-zero bits. If the
-``nsw`` keyword is present, then the shift produces a :ref:`poison
-value <poisonvalues>` if it shifts out any bits that disagree with the
-resultant sign bit.
+If the ``nuw`` keyword is present, then the shift produces a poison
+value if it shifts out any non-zero bits.
+If the ``nsw`` keyword is present, then the shift produces a poison
+value if it shifts out any bits that disagree with the resultant sign bit.
 
 Example:
 """"""""
@@ -6665,13 +6902,12 @@
 This instruction always performs a logical shift right operation. The
 most significant bits of the result will be filled with zero bits after
 the shift. If ``op2`` is (statically or dynamically) equal to or larger
-than the number of bits in ``op1``, the result is undefined. If the
-arguments are vectors, each vector element of ``op1`` is shifted by the
-corresponding shift amount in ``op2``.
+than the number of bits in ``op1``, this instruction returns a :ref:`poison
+value <poisonvalues>`. If the arguments are vectors, each vector element
+of ``op1`` is shifted by the corresponding shift amount in ``op2``.
 
 If the ``exact`` keyword is present, the result value of the ``lshr`` is
-a :ref:`poison value <poisonvalues>` if any of the bits shifted out are
-non-zero.
+a poison value if any of the bits shifted out are non-zero.
 
 Example:
 """"""""
@@ -6716,13 +6952,12 @@
 This instruction always performs an arithmetic shift right operation,
 The most significant bits of the result will be filled with the sign bit
 of ``op1``. If ``op2`` is (statically or dynamically) equal to or larger
-than the number of bits in ``op1``, the result is undefined. If the
-arguments are vectors, each vector element of ``op1`` is shifted by the
-corresponding shift amount in ``op2``.
+than the number of bits in ``op1``, this instruction returns a :ref:`poison
+value <poisonvalues>`. If the arguments are vectors, each vector element
+of ``op1`` is shifted by the corresponding shift amount in ``op2``.
 
 If the ``exact`` keyword is present, the result value of the ``ashr`` is
-a :ref:`poison value <poisonvalues>` if any of the bits shifted out are
-non-zero.
+a poison value if any of the bits shifted out are non-zero.
 
 Example:
 """"""""
@@ -7014,9 +7249,10 @@
 The elements of the two input vectors are numbered from left to right
 across both of the vectors. The shuffle mask operand specifies, for each
 element of the result vector, which element of the two input vectors the
-result element gets. The element selector may be undef (meaning "don't
-care") and the second operand may be undef if performing a shuffle from
-only one vector.
+result element gets. If the shuffle mask is undef, the result vector is
+undef. If any element of the mask operand is undef, that element of the
+result is undef. If the shuffle mask selects an undef element from one
+of the input vectors, the resulting element is undef.
 
 Example:
 """"""""
@@ -7149,7 +7385,7 @@
 
 ::
 
-      <result> = alloca [inalloca] <type> [, <ty> <NumElements>] [, align <alignment>]     ; yields type*:result
+      <result> = alloca [inalloca] <type> [, <ty> <NumElements>] [, align <alignment>] [, addrspace(<num>)]     ; yields type addrspace(num)*:result
 
 Overview:
 """""""""
@@ -7157,7 +7393,7 @@
 The '``alloca``' instruction allocates memory on the stack frame of the
 currently executing function, to be automatically released when this
 function returns to its caller. The object is always allocated in the
-generic address space (address space zero).
+address space for allocas indicated in the datalayout.
 
 Arguments:
 """"""""""
@@ -7208,7 +7444,7 @@
 ::
 
       <result> = load [volatile] <ty>, <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>][, !invariant.load !<index>][, !invariant.group !<index>][, !nonnull !<index>][, !dereferenceable !<deref_bytes_node>][, !dereferenceable_or_null !<deref_bytes_node>][, !align !<align_node>]
-      <result> = load atomic [volatile] <ty>, <ty>* <pointer> [singlethread] <ordering>, align <alignment> [, !invariant.group !<index>]
+      <result> = load atomic [volatile] <ty>, <ty>* <pointer> [syncscope("<target-scope>")] <ordering>, align <alignment> [, !invariant.group !<index>]
       !<index> = !{ i32 1 }
       !<deref_bytes_node> = !{i64 <dereferenceable_bytes>}
       !<align_node> = !{ i64 <value_alignment> }
@@ -7229,14 +7465,14 @@
 :ref:`volatile operations <volatile>`.
 
 If the ``load`` is marked as ``atomic``, it takes an extra :ref:`ordering
-<ordering>` and optional ``singlethread`` argument. The ``release`` and
-``acq_rel`` orderings are not valid on ``load`` instructions. Atomic loads
-produce :ref:`defined <memmodel>` results when they may see multiple atomic
-stores. The type of the pointee must be an integer, pointer, or floating-point
-type whose bit width is a power of two greater than or equal to eight and less
-than or equal to a target-specific size limit.  ``align`` must be explicitly
-specified on atomic loads, and the load has undefined behavior if the alignment
-is not set to a value which is at least the size in bytes of the
+<ordering>` and optional ``syncscope("<target-scope>")`` argument. The
+``release`` and ``acq_rel`` orderings are not valid on ``load`` instructions.
+Atomic loads produce :ref:`defined <memmodel>` results when they may see
+multiple atomic stores. The type of the pointee must be an integer, pointer, or
+floating-point type whose bit width is a power of two greater than or equal to
+eight and less than or equal to a target-specific size limit.  ``align`` must be
+explicitly specified on atomic loads, and the load has undefined behavior if the
+alignment is not set to a value which is at least the size in bytes of the
 pointee. ``!nontemporal`` does not have any defined semantics for atomic loads.
 
 The optional constant ``align`` argument specifies the alignment of the
@@ -7337,7 +7573,7 @@
 ::
 
       store [volatile] <ty> <value>, <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>][, !invariant.group !<index>]        ; yields void
-      store atomic [volatile] <ty> <value>, <ty>* <pointer> [singlethread] <ordering>, align <alignment> [, !invariant.group !<index>] ; yields void
+      store atomic [volatile] <ty> <value>, <ty>* <pointer> [syncscope("<target-scope>")] <ordering>, align <alignment> [, !invariant.group !<index>] ; yields void
 
 Overview:
 """""""""
@@ -7357,14 +7593,14 @@
 structural type <t_opaque>`) can be stored.
 
 If the ``store`` is marked as ``atomic``, it takes an extra :ref:`ordering
-<ordering>` and optional ``singlethread`` argument. The ``acquire`` and
-``acq_rel`` orderings aren't valid on ``store`` instructions. Atomic loads
-produce :ref:`defined <memmodel>` results when they may see multiple atomic
-stores. The type of the pointee must be an integer, pointer, or floating-point
-type whose bit width is a power of two greater than or equal to eight and less
-than or equal to a target-specific size limit.  ``align`` must be explicitly
-specified on atomic stores, and the store has undefined behavior if the
-alignment is not set to a value which is at least the size in bytes of the
+<ordering>` and optional ``syncscope("<target-scope>")`` argument. The
+``acquire`` and ``acq_rel`` orderings aren't valid on ``store`` instructions.
+Atomic loads produce :ref:`defined <memmodel>` results when they may see
+multiple atomic stores. The type of the pointee must be an integer, pointer, or
+floating-point type whose bit width is a power of two greater than or equal to
+eight and less than or equal to a target-specific size limit.  ``align`` must be
+explicitly specified on atomic stores, and the store has undefined behavior if
+the alignment is not set to a value which is at least the size in bytes of the
 pointee. ``!nontemporal`` does not have any defined semantics for atomic stores.
 
 The optional constant ``align`` argument specifies the alignment of the
@@ -7425,7 +7661,7 @@
 
 ::
 
-      fence [singlethread] <ordering>                   ; yields void
+      fence [syncscope("<target-scope>")] <ordering>  ; yields void
 
 Overview:
 """""""""
@@ -7459,17 +7695,17 @@
 ``acquire`` and ``release`` semantics specified above, participates in
 the global program order of other ``seq_cst`` operations and/or fences.
 
-The optional ":ref:`singlethread <singlethread>`" argument specifies
-that the fence only synchronizes with other fences in the same thread.
-(This is useful for interacting with signal handlers.)
+A ``fence`` instruction can also take an optional
+":ref:`syncscope <syncscope>`" argument.
 
 Example:
 """"""""
 
 .. code-block:: llvm
 
-      fence acquire                          ; yields void
-      fence singlethread seq_cst             ; yields void
+      fence acquire                                        ; yields void
+      fence syncscope("singlethread") seq_cst              ; yields void
+      fence syncscope("agent") seq_cst                     ; yields void
 
 .. _i_cmpxchg:
 
@@ -7481,7 +7717,7 @@
 
 ::
 
-      cmpxchg [weak] [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [singlethread] <success ordering> <failure ordering> ; yields  { ty, i1 }
+      cmpxchg [weak] [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [syncscope("<target-scope>")] <success ordering> <failure ordering> ; yields  { ty, i1 }
 
 Overview:
 """""""""
@@ -7510,10 +7746,8 @@
 stronger than that on success, and the failure ordering cannot be either
 ``release`` or ``acq_rel``.
 
-The optional "``singlethread``" argument declares that the ``cmpxchg``
-is only atomic with respect to code (usually signal handlers) running in
-the same thread as the ``cmpxchg``. Otherwise the cmpxchg is atomic with
-respect to all other code in the system.
+A ``cmpxchg`` instruction can also take an optional
+":ref:`syncscope <syncscope>`" argument.
 
 The pointer passed into cmpxchg must have alignment greater than or
 equal to the size in memory of the operand.
@@ -7522,9 +7756,9 @@
 """"""""""
 
 The contents of memory at the location specified by the '``<pointer>``' operand
-is read and compared to '``<cmp>``'; if the read value is the equal, the
-'``<new>``' is written. The original value at the location is returned, together
-with a flag indicating success (true) or failure (false).
+is read and compared to '``<cmp>``'; if the values are equal, '``<new>``' is
+written to the location. The original value at the location is returned,
+together with a flag indicating success (true) or failure (false).
 
 If the cmpxchg operation is marked as ``weak`` then a spurious failure is
 permitted: the operation may not write ``<new>`` even if the comparison
@@ -7567,7 +7801,7 @@
 
 ::
 
-      atomicrmw [volatile] <operation> <ty>* <pointer>, <ty> <value> [singlethread] <ordering>                   ; yields ty
+      atomicrmw [volatile] <operation> <ty>* <pointer>, <ty> <value> [syncscope("<target-scope>")] <ordering>                   ; yields ty
 
 Overview:
 """""""""
@@ -7601,6 +7835,9 @@
 order of execution of this ``atomicrmw`` with other :ref:`volatile
 operations <volatile>`.
 
+An ``atomicrmw`` instruction can also take an optional
+":ref:`syncscope <syncscope>`" argument.
+
 Semantics:
 """"""""""
 
@@ -7661,7 +7898,7 @@
 that indicate which of the elements of the aggregate object are indexed.
 The interpretation of each index is dependent on the type being indexed
 into. The first index always indexes the pointer value given as the
-first argument, the second index indexes a value of the type pointed to
+second argument, the second index indexes a value of the type pointed to
 (not necessarily the value directly pointed to, since the first index
 can be non-zero), etc. The first type indexed into must be a pointer
 value, subsequent types can be arrays, vectors, and structs. Note that
@@ -7843,7 +8080,7 @@
     ; get pointers for 8 elements from array B
     %ptrs = getelementptr double, double* %B, <8 x i32> %C
     ; load 8 elements from array B into A
-    %A = call <8 x double> @llvm.masked.gather.v8f64(<8 x double*> %ptrs,
+    %A = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs,
          i32 8, <8 x i1> %mask, <8 x double> %passthru)
 
 Conversion Operations
@@ -7853,6 +8090,8 @@
 (casting) which all take a single operand and a type. They perform
 various bit conversions on the operand.
 
+.. _i_trunc:
+
 '``trunc .. to``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -7895,6 +8134,8 @@
       %Z = trunc i32 122 to i1                        ; yields i1:false
       %W = trunc <2 x i16> <i16 8, i16 7> to <2 x i8> ; yields <i8 8, i8 7>
 
+.. _i_zext:
+
 '``zext .. to``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -7935,6 +8176,8 @@
       %Y = zext i1 true to i32              ; yields i32:1
       %Z = zext <2 x i16> <i16 8, i16 7> to <2 x i32> ; yields <i32 8, i32 7>
 
+.. _i_sext:
+
 '``sext .. to``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -8815,9 +9058,7 @@
    be of :ref:`first class <t_firstclass>` type. If the function signature
    indicates the function accepts a variable number of arguments, the
    extra arguments can be specified.
-#. The optional :ref:`function attributes <fnattrs>` list. Only
-   '``noreturn``', '``nounwind``', '``readonly``' , '``readnone``',
-   and '``convergent``' attributes are valid here.
+#. The optional :ref:`function attributes <fnattrs>` list.
 #. The optional :ref:`operand bundles <opbundles>` list.
 
 Semantics:
@@ -9466,7 +9707,7 @@
 
 ::
 
-      declare i8  *@llvm.returnaddress(i32 <level>)
+      declare i8* @llvm.returnaddress(i32 <level>)
 
 Overview:
 """""""""
@@ -9504,7 +9745,7 @@
 
 ::
 
-      declare i8  *@llvm.addressofreturnaddress()
+      declare i8* @llvm.addressofreturnaddress()
 
 Overview:
 """""""""
@@ -9752,7 +9993,7 @@
       compile-time-known constant value.
 
       The return value type of :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
-      must match the target's generic address space's (address space 0) pointer type.
+      must match the target's default address space's (address space 0) pointer type.
 
 '``llvm.prefetch``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -9954,7 +10195,7 @@
 Arguments:
 """"""""""
 The first four arguments are the same as '``llvm.instrprof_increment``'
-instrinsic.
+intrinsic.
 
 The last argument specifies the value of the increment of the counter variable.
 
@@ -10102,6 +10343,8 @@
 to be aligned to some boundary, this can be specified as the fourth
 argument, otherwise it should be set to 0 or 1 (both meaning no alignment).
 
+.. _int_memmove:
+
 '``llvm.memmove``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -10157,6 +10400,8 @@
 aligned to some boundary, this can be specified as the fourth argument,
 otherwise it should be set to 0 or 1 (both meaning no alignment).
 
+.. _int_memset:
+
 '``llvm.memset.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -10229,21 +10474,20 @@
 Overview:
 """""""""
 
-The '``llvm.sqrt``' intrinsics return the sqrt of the specified operand,
+The '``llvm.sqrt``' intrinsics return the square root of the specified value,
 returning the same value as the libm '``sqrt``' functions would, but without
 trapping or setting ``errno``.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
 
-This function returns the sqrt of the specified operand if it is a
-nonnegative floating point number.
+This function returns the square root of the operand if it is a nonnegative
+floating point number.
 
 '``llvm.powi.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10309,8 +10553,7 @@
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10345,8 +10588,7 @@
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10413,13 +10655,13 @@
 Overview:
 """""""""
 
-The '``llvm.exp.*``' intrinsics perform the exp function.
+The '``llvm.exp.*``' intrinsics compute the base-e exponential of the specified
+value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10448,13 +10690,13 @@
 Overview:
 """""""""
 
-The '``llvm.exp2.*``' intrinsics perform the exp2 function.
+The '``llvm.exp2.*``' intrinsics compute the base-2 exponential of the
+specified value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10483,13 +10725,13 @@
 Overview:
 """""""""
 
-The '``llvm.log.*``' intrinsics perform the log function.
+The '``llvm.log.*``' intrinsics compute the base-e logarithm of the specified
+value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10518,13 +10760,13 @@
 Overview:
 """""""""
 
-The '``llvm.log10.*``' intrinsics perform the log10 function.
+The '``llvm.log10.*``' intrinsics compute the base-10 logarithm of the
+specified value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10553,13 +10795,13 @@
 Overview:
 """""""""
 
-The '``llvm.log2.*``' intrinsics perform the log2 function.
+The '``llvm.log2.*``' intrinsics compute the base-2 logarithm of the specified
+value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -11620,6 +11862,338 @@
 
       %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
 
+
+Experimental Vector Reduction Intrinsics
+----------------------------------------
+
+Horizontal reductions of vectors can be expressed using the following
+intrinsics. Each one takes a vector operand as an input and applies its
+respective operation across all elements of the vector, returning a single
+scalar result of the same element type.
+
+
+'``llvm.experimental.vector.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a)
+      declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating point
+``ADD`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+      %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+
+'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
+      declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating point
+``MUL`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+      %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+'``llvm.experimental.vector.reduce.and.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.or.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.xor.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.smax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer
+``MAX`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.smin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer
+``MIN`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.umax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umax.*``' intrinsics do an unsigned
+integer ``MAX`` reduction of a vector, returning the result as a scalar. The
+return type matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.umin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umin.*``' intrinsics do an unsigned
+integer ``MIN`` reduction of a vector, returning the result as a scalar. The
+return type matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating point
+``MAX`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has the ``nnan`` fast-math flag then the operation can
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating point values.
+
+'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating point
+``MIN`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has the ``nnan`` fast-math flag then the operation can
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating point values.
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
@@ -11725,7 +12299,7 @@
 
 The LLVM debugger intrinsics (which all start with ``llvm.dbg.``
 prefix), are described in the `LLVM Source Level
-Debugging <SourceLevelDebugging.html#format_common_intrinsics>`_
+Debugging <SourceLevelDebugging.html#format-common-intrinsics>`_
 document.
 
 Exception Handling Intrinsics
@@ -11733,7 +12307,7 @@
 
 The LLVM exception handling intrinsics (which all start with
 ``llvm.eh.`` prefix), are described in the `LLVM Exception
-Handling <ExceptionHandling.html#format_common_intrinsics>`_ document.
+Handling <ExceptionHandling.html#format-common-intrinsics>`_ document.
 
 .. _int_trampoline:
 
@@ -11957,9 +12531,9 @@
 
 ::
 
-      declare <16 x float> @llvm.masked.gather.v16f32   (<16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
-      declare <2 x double> @llvm.masked.gather.v2f64    (<2 x double*> <ptrs>, i32 <alignment>, <2 x i1>  <mask>, <2 x double> <passthru>)
-      declare <8 x float*> @llvm.masked.gather.v8p0f32  (<8 x float**> <ptrs>, i32 <alignment>, <8 x i1>  <mask>, <8 x float*> <passthru>)
+      declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32   (<16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
+      declare <2 x double> @llvm.masked.gather.v2f64.v2p1f64     (<2 x double addrspace(1)*> <ptrs>, i32 <alignment>, <2 x i1>  <mask>, <2 x double> <passthru>)
+      declare <8 x float*> @llvm.masked.gather.v8p0f32.v8p0p0f32 (<8 x float**> <ptrs>, i32 <alignment>, <8 x i1>  <mask>, <8 x float*> <passthru>)
 
 Overview:
 """""""""
@@ -11982,7 +12556,7 @@
 
 ::
 
-       %res = call <4 x double> @llvm.masked.gather.v4f64 (<4 x double*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+       %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64 (<4 x double*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
 
        ;; The gather with all-true mask is equivalent to the following instruction sequence
        %ptr0 = extractelement <4 x double*> %ptrs, i32 0
@@ -12011,9 +12585,9 @@
 
 ::
 
-       declare void @llvm.masked.scatter.v8i32   (<8 x i32>     <value>, <8 x i32*>     <ptrs>, i32 <alignment>, <8 x i1>  <mask>)
-       declare void @llvm.masked.scatter.v16f32  (<16 x float>  <value>, <16 x float*>  <ptrs>, i32 <alignment>, <16 x i1> <mask>)
-       declare void @llvm.masked.scatter.v4p0f64 (<4 x double*> <value>, <4 x double**> <ptrs>, i32 <alignment>, <4 x i1>  <mask>)
+       declare void @llvm.masked.scatter.v8i32.v8p0i32     (<8 x i32>     <value>, <8 x i32*>     <ptrs>, i32 <alignment>, <8 x i1>  <mask>)
+       declare void @llvm.masked.scatter.v16f32.v16p1f32   (<16 x float>  <value>, <16 x float addrspace(1)*>  <ptrs>, i32 <alignment>, <16 x i1> <mask>)
+       declare void @llvm.masked.scatter.v4p0f64.v4p0p0f64 (<4 x double*> <value>, <4 x double**> <ptrs>, i32 <alignment>, <4 x i1>  <mask>)
 
 Overview:
 """""""""
@@ -12034,7 +12608,7 @@
 ::
 
        ;; This instruction unconditionally stores data vector in multiple addresses
-       call @llvm.masked.scatter.v8i32 (<8 x i32> %value, <8 x i32*> %ptrs, i32 4,  <8 x i1>  <true, true, .. true>)
+       call @llvm.masked.scatter.v8i32.v8p0i32 (<8 x i32> %value, <8 x i32*> %ptrs, i32 4,  <8 x i1>  <true, true, .. true>)
 
        ;; It is equivalent to a list of scalar stores
        %val0 = extractelement <8 x i32> %value, i32 0
@@ -12234,6 +12808,7 @@
 assumed. This argument must be one of the following strings:
 
 ::
+
       "round.dynamic"
       "round.tonearest"
       "round.downward"
@@ -12265,6 +12840,7 @@
 strings:
 
 ::
+
       "fpexcept.ignore"
       "fpexcept.maytrap"
       "fpexcept.strict"
@@ -12309,7 +12885,7 @@
       declare <type> 
       @llvm.experimental.constrained.fadd(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
-                                          metadata  <exception behavior>)
+                                          metadata <exception behavior>)
 
 Overview:
 """""""""
@@ -12346,7 +12922,7 @@
       declare <type> 
       @llvm.experimental.constrained.fsub(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
-                                          metadata  <exception behavior>)
+                                          metadata <exception behavior>)
 
 Overview:
 """""""""
@@ -12383,7 +12959,7 @@
       declare <type> 
       @llvm.experimental.constrained.fmul(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
-                                          metadata  <exception behavior>)
+                                          metadata <exception behavior>)
 
 Overview:
 """""""""
@@ -12420,7 +12996,7 @@
       declare <type> 
       @llvm.experimental.constrained.fdiv(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
-                                          metadata  <exception behavior>)
+                                          metadata <exception behavior>)
 
 Overview:
 """""""""
@@ -12457,7 +13033,7 @@
       declare <type> 
       @llvm.experimental.constrained.frem(<type> <op1>, <type> <op2>,
                                           metadata <rounding mode>,
-                                          metadata  <exception behavior>)
+                                          metadata <exception behavior>)
 
 Overview:
 """""""""
@@ -12485,6 +13061,496 @@
 value operands and has the same type as the operands.  The remainder has the
 same sign as the dividend. 
 
+'``llvm.experimental.constrained.fma``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.fma(<type> <op1>, <type> <op2>, <type> <op3>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.fma``' intrinsic returns the result of a
+fused-multiply-add operation on its operands.
+
+Arguments:
+""""""""""
+
+The first three arguments to the '``llvm.experimental.constrained.fma``'
+intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector
+<t_vector>` of floating point values. All arguments must have identical types.
+
+The fourth and fifth arguments specify the rounding mode and exception behavior
+as described above.
+
+Semantics:
+""""""""""
+
+The result produced is the product of the first two operands added to the third
+operand computed with infinite precision, and then rounded to the target
+precision.
+
+Constrained libm-equivalent Intrinsics
+--------------------------------------
+
+In addition to the basic floating point operations for which constrained
+intrinsics are described above, there are constrained versions of various
+operations which provide equivalent behavior to a corresponding libm function.
+These intrinsics allow the precise behavior of these operations with respect to
+rounding mode and exception behavior to be controlled.
+
+As with the basic constrained floating point intrinsics, the rounding mode
+and exception behavior arguments only control the behavior of the optimizer.
+They do not change the runtime floating point environment.
+
+
+'``llvm.experimental.constrained.sqrt``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.sqrt(<type> <op1>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.sqrt``' intrinsic returns the square root
+of the specified value, returning the same value as the libm '``sqrt``'
+functions would, but without setting ``errno``.
+
+Arguments:
+""""""""""
+
+The first argument and the return type are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the nonnegative square root of the specified value.
+If the value is less than negative zero, a floating point exception occurs
+and the return value is architecture specific.
+
+
+'``llvm.experimental.constrained.pow``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.pow(<type> <op1>, <type> <op2>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.pow``' intrinsic returns the first operand
+raised to the (positive or negative) power specified by the second operand.
+
+Arguments:
+""""""""""
+
+The first two arguments and the return value are floating point numbers of the
+same type.  The second argument specifies the power to which the first argument
+should be raised.
+
+The third and fourth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the first value raised to the second power,
+returning the same values as the libm ``pow`` functions would, and
+handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.powi``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.powi(<type> <op1>, i32 <op2>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand
+raised to the (positive or negative) power specified by the second operand. The
+order of evaluation of multiplications is not defined. When a vector of floating
+point type is used, the second argument remains a scalar integer value.
+
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.  The second argument is a 32-bit signed integer specifying the power to
+which the first argument should be raised.
+
+The third and fourth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the first value raised to the second power with an
+unspecified sequence of rounding operations.
+
+
+'``llvm.experimental.constrained.sin``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.sin(<type> <op1>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.sin``' intrinsic returns the sine of the
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return type are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the sine of the specified operand, returning the
+same values as the libm ``sin`` functions would, and handles error
+conditions in the same way.
+
+
+'``llvm.experimental.constrained.cos``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.cos(<type> <op1>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.cos``' intrinsic returns the cosine of the
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return type are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the cosine of the specified operand, returning the
+same values as the libm ``cos`` functions would, and handles error
+conditions in the same way.
+
+
+'``llvm.experimental.constrained.exp``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.exp(<type> <op1>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.exp``' intrinsic computes the base-e
+exponential of the specified value.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``exp`` functions
+would, and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.exp2``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.exp2(<type> <op1>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.exp2``' intrinsic computes the base-2
+exponential of the specified value.
+
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``exp2`` functions
+would, and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.log``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.log(<type> <op1>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.log``' intrinsic computes the base-e
+logarithm of the specified value.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``log`` functions
+would, and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.log10``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.log10(<type> <op1>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.log10``' intrinsic computes the base-10
+logarithm of the specified value.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``log10`` functions
+would, and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.log2``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.log2(<type> <op1>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.log2``' intrinsic computes the base-2
+logarithm of the specified value.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``log2`` functions
+would, and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.rint``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.rint(<type> <op1>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.rint``' intrinsic returns the first
+operand rounded to the nearest integer. It may raise an inexact floating point
+exception if the operand is not an integer.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``rint`` functions
+would, and handles error conditions in the same way.  The rounding mode is
+described, not determined, by the rounding mode argument.  The actual rounding
+mode is determined by the runtime floating point environment.  The rounding
+mode argument is only intended as information to the compiler.
+
+
+'``llvm.experimental.constrained.nearbyint``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type> 
+      @llvm.experimental.constrained.nearbyint(<type> <op1>,
+                                               metadata <rounding mode>,
+                                               metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first
+operand rounded to the nearest integer. It will not raise an inexact floating
+point exception if the operand is not an integer.
+
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``nearbyint`` functions
+would, and handles error conditions in the same way.  The rounding mode is
+described, not determined, by the rounding mode argument.  The actual rounding
+mode is determined by the runtime floating point environment.  The rounding
+mode argument is only intended as information to the compiler.
+
 
 General Intrinsics
 ------------------
@@ -12600,6 +13666,27 @@
 optimizations that want to look for these annotations. These have no
 other defined use; they are ignored by code generation and optimization.
 
+'``llvm.codeview.annotation``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This annotation emits a label at its program point and an associated
+``S_ANNOTATION`` codeview record with some additional string metadata. This is
+used to implement MSVC's ``__annotation`` intrinsic. It is marked
+``noduplicate``, so calls to this intrinsic prevent inlining and should be
+considered expensive.
+
+::
+
+      declare void @llvm.codeview.annotation(metadata)
+
+Arguments:
+""""""""""
+
+The argument should be an MDTuple containing any number of MDStrings.
+
 '``llvm.trap``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -12733,8 +13820,8 @@
 
 ::
 
-      declare i32 @llvm.objectsize.i32(i8* <object>, i1 <min>)
-      declare i64 @llvm.objectsize.i64(i8* <object>, i1 <min>)
+      declare i32 @llvm.objectsize.i32(i8* <object>, i1 <min>, i1 <nullunknown>)
+      declare i64 @llvm.objectsize.i64(i8* <object>, i1 <min>, i1 <nullunknown>)
 
 Overview:
 """""""""
@@ -12749,11 +13836,16 @@
 Arguments:
 """"""""""
 
-The ``llvm.objectsize`` intrinsic takes two arguments. The first
-argument is a pointer to or into the ``object``. The second argument is
-a boolean and determines whether ``llvm.objectsize`` returns 0 (if true)
-or -1 (if false) when the object size is unknown. The second argument
-only accepts constants.
+The ``llvm.objectsize`` intrinsic takes three arguments. The first argument is
+a pointer to or into the ``object``. The second argument determines whether
+``llvm.objectsize`` returns 0 (if true) or -1 (if false) when the object size
+is unknown. The third argument controls how ``llvm.objectsize`` acts when
+``null`` is used as its pointer argument. If it's true and the pointer is in
+address space 0, ``null`` is treated as an opaque value with an unknown number
+of bytes. Otherwise, ``llvm.objectsize`` reports 0 bytes available when given
+``null``.
+
+The second and third arguments only accept constants.
 
 Semantics:
 """"""""""
@@ -13135,62 +14227,66 @@
 These intrinsics are similar to the standard library memory intrinsics except
 that they perform memory transfer as a sequence of atomic memory accesses.
 
-.. _int_memcpy_element_atomic:
+.. _int_memcpy_element_unordered_atomic:
 
-'``llvm.memcpy.element.atomic``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.memcpy.element.unordered.atomic``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
-This is an overloaded intrinsic. You can use ``llvm.memcpy.element.atomic`` on
+This is an overloaded intrinsic. You can use ``llvm.memcpy.element.unordered.atomic`` on
 any integer bit width and for different address spaces. Not all targets
 support all bit widths however.
 
 ::
 
-      declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* <dest>, i8* <src>,
-                                              i64 <num_elements>, i32 <element_size>)
+      declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* <dest>,
+                                                                       i8* <src>,
+                                                                       i32 <len>,
+                                                                       i32 <element_size>)
+      declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* <dest>,
+                                                                       i8* <src>,
+                                                                       i64 <len>,
+                                                                       i32 <element_size>)
 
 Overview:
 """""""""
 
-The '``llvm.memcpy.element.atomic.*``' intrinsic performs copy of a block of 
-memory from the source location to the destination location as a sequence of
-unordered atomic memory accesses where each access is a multiple of
-``element_size`` bytes wide and aligned at an element size boundary. For example
-each element is accessed atomically in source and destination buffers.
+The '``llvm.memcpy.element.unordered.atomic.*``' intrinsic is a specialization of the
+'``llvm.memcpy.*``' intrinsic. It differs in that the ``dest`` and ``src`` are treated
+as arrays with elements that are exactly ``element_size`` bytes, and the copy between
+buffers uses a sequence of :ref:`unordered atomic <ordering>` load/store operations
+that are a positive integer multiple of the ``element_size`` in size.
 
 Arguments:
 """"""""""
 
-The first argument is a pointer to the destination, the second is a
-pointer to the source. The third argument is an integer argument
-specifying the number of elements to copy, the fourth argument is size of
-the single element in bytes.
+The first three arguments are the same as they are in the :ref:`@llvm.memcpy <int_memcpy>`
+intrinsic, with the added constraint that ``len`` is required to be a positive integer
+multiple of the ``element_size``. If ``len`` is not a positive integer multiple of
+``element_size``, then the behaviour of the intrinsic is undefined.
 
-``element_size`` should be a power of two, greater than zero and less than
-a target-specific atomic access size limit.
+``element_size`` must be a compile-time constant positive power of two no greater than
+a target-specific atomic access size limit.
 
-For each of the input pointers ``align`` parameter attribute must be specified.
-It must be a power of two and greater than or equal to the ``element_size``.
-Caller guarantees that both the source and destination pointers are aligned to
-that boundary.
+For each of the input pointers the ``align`` parameter attribute must be specified. It
+must be a power of two no less than the ``element_size``. Caller guarantees that
+both the source and destination pointers are aligned to that boundary.
 
 Semantics:
 """"""""""
 
-The '``llvm.memcpy.element.atomic.*``' intrinsic copies
-'``num_elements`` * ``element_size``' bytes of memory from the source location to
-the destination location. These locations are not allowed to overlap. Memory copy
-is performed as a sequence of unordered atomic memory accesses where each access
-is guaranteed to be a multiple of ``element_size`` bytes wide and aligned at an
-element size boundary.
+The '``llvm.memcpy.element.unordered.atomic.*``' intrinsic copies ``len`` bytes of
+memory from the source location to the destination location. These locations are not
+allowed to overlap. The memory copy is performed as a sequence of load/store operations
+where each access is guaranteed to be a multiple of ``element_size`` bytes wide and
+aligned at an ``element_size`` boundary. 
 
 The order of the copy is unspecified. The same value may be read from the source
 buffer many times, but only one write is issued to the destination buffer per
-element. It is well defined to have concurrent reads and writes to both source
-and destination provided those reads and writes are at least unordered atomic.
+element. It is well defined to have concurrent reads and writes to both source and
+destination provided those reads and writes are unordered atomic when specified.
 
 This intrinsic does not provide any additional ordering guarantees over those
 provided by a set of unordered loads from the source location and stores to the
@@ -13199,8 +14295,158 @@
 Lowering:
 """""""""
 
-In the most general case call to the '``llvm.memcpy.element.atomic.*``' is lowered
-to a call to the symbol ``__llvm_memcpy_element_atomic_*``. Where '*' is replaced
-with an actual element size.
+In the most general case, a call to the '``llvm.memcpy.element.unordered.atomic.*``' is
+lowered to a call to the symbol ``__llvm_memcpy_element_unordered_atomic_*``, where '*'
+is replaced with the actual element size.
 
 Optimizer is allowed to inline memory copy when it's profitable to do so.
+
+'``llvm.memmove.element.unordered.atomic``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use
+``llvm.memmove.element.unordered.atomic`` on any integer bit width and for
+different address spaces. Not all targets support all bit widths however.
+
+::
+
+      declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* <dest>,
+                                                                        i8* <src>,
+                                                                        i32 <len>,
+                                                                        i32 <element_size>)
+      declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* <dest>,
+                                                                        i8* <src>,
+                                                                        i64 <len>,
+                                                                        i32 <element_size>)
+
+Overview:
+"""""""""
+
+The '``llvm.memmove.element.unordered.atomic.*``' intrinsic is a specialization
+of the '``llvm.memmove.*``' intrinsic. It differs in that the ``dest`` and
+``src`` are treated as arrays with elements that are exactly ``element_size``
+bytes, and the copy between buffers uses a sequence of
+:ref:`unordered atomic <ordering>` load/store operations that are a positive
+integer multiple of the ``element_size`` in size.
+
+Arguments:
+""""""""""
+
+The first three arguments are the same as they are in the
+:ref:`@llvm.memmove <int_memmove>` intrinsic, with the added constraint that
+``len`` is required to be a positive integer multiple of the ``element_size``.
+If ``len`` is not a positive integer multiple of ``element_size``, then the
+behaviour of the intrinsic is undefined.
+
+``element_size`` must be a compile-time constant positive power of two no
+greater than a target-specific atomic access size limit.
+
+For each of the input pointers the ``align`` parameter attribute must be
+specified. It must be a power of two no less than the ``element_size``. Caller
+guarantees that both the source and destination pointers are aligned to that
+boundary.
+
+Semantics:
+""""""""""
+
+The '``llvm.memmove.element.unordered.atomic.*``' intrinsic copies ``len`` bytes
+of memory from the source location to the destination location. These locations
+are allowed to overlap. The memory copy is performed as a sequence of load/store
+operations where each access is guaranteed to be a multiple of ``element_size``
+bytes wide and aligned at an ``element_size`` boundary. 
+
+The order of the copy is unspecified. The same value may be read from the source
+buffer many times, but only one write is issued to the destination buffer per
+element. It is well defined to have concurrent reads and writes to both source
+and destination provided those reads and writes are unordered atomic when
+specified.
+
+This intrinsic does not provide any additional ordering guarantees over those
+provided by a set of unordered loads from the source location and stores to the
+destination.
+
+Lowering:
+"""""""""
+
+In the most general case, a call to the
+'``llvm.memmove.element.unordered.atomic.*``' is lowered to a call to the symbol
+``__llvm_memmove_element_unordered_atomic_*``, where '*' is replaced with the
+actual element size.
+
+The optimizer is allowed to inline the memory copy when it's profitable to do so.
+
+.. _int_memset_element_unordered_atomic:
+
+'``llvm.memset.element.unordered.atomic``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset.element.unordered.atomic`` on
+any integer bit width and for different address spaces. Not all targets
+support all bit widths however.
+
+::
+
+      declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* <dest>,
+                                                                  i8 <value>,
+                                                                  i32 <len>,
+                                                                  i32 <element_size>)
+      declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* <dest>,
+                                                                  i8 <value>,
+                                                                  i64 <len>,
+                                                                  i32 <element_size>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset.element.unordered.atomic.*``' intrinsic is a specialization of the
+'``llvm.memset.*``' intrinsic. It differs in that the ``dest`` is treated as an array
+with elements that are exactly ``element_size`` bytes, and the assignment to that array
+uses a sequence of :ref:`unordered atomic <ordering>` store operations
+that are a positive integer multiple of the ``element_size`` in size.
+
+Arguments:
+""""""""""
+
+The first three arguments are the same as they are in the :ref:`@llvm.memset <int_memset>`
+intrinsic, with the added constraint that ``len`` is required to be a positive integer
+multiple of the ``element_size``. If ``len`` is not a positive integer multiple of
+``element_size``, then the behaviour of the intrinsic is undefined.
+
+``element_size`` must be a compile-time constant positive power of two no greater than
+a target-specific atomic access size limit.
+
+The ``dest`` input pointer must have the ``align`` parameter attribute specified. It
+must be a power of two no less than the ``element_size``. Caller guarantees that
+the destination pointer is aligned to that boundary.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset.element.unordered.atomic.*``' intrinsic sets the ``len`` bytes of
+memory starting at the destination location to the given ``value``. The memory is
+set with a sequence of store operations where each access is guaranteed to be a
+multiple of ``element_size`` bytes wide and aligned at an ``element_size`` boundary. 
+
+The order of the assignment is unspecified. Only one write is issued to the
+destination buffer per element. It is well defined to have concurrent reads and
+writes to the destination provided those reads and writes are unordered atomic
+when specified.
+
+This intrinsic does not provide any additional ordering guarantees over those
+provided by a set of unordered stores to the destination.
+
+Lowering:
+"""""""""
+
+In the most general case, a call to the '``llvm.memset.element.unordered.atomic.*``' is
+lowered to a call to the symbol ``__llvm_memset_element_unordered_atomic_*``, where '*'
+is replaced with the actual element size.
+
+The optimizer is allowed to inline the memory assignment when it's profitable to do so.
+
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index 5d16091..0021bf8 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -38,6 +38,13 @@
 **BB Vectorization**
     Basic-Block Vectorization
 
+**BDCE**
+    Bit-tracking dead code elimination. Some bit-wise instructions (shifts,
+    ands, ors, etc.) "kill" some of their input bits -- that is, they make it
+    such that those bits can be either zero or one without affecting control or
+    data flow of a program. The BDCE pass removes instructions that only
+    compute these dead bits.
+
 **BURS**
     Bottom Up Rewriting System --- A method of instruction selection for code
     generation.  An example is the `BURG
@@ -102,6 +109,18 @@
     Garbage Collection. The practice of using reachability analysis instead of
     explicit memory management to reclaim unused memory.
 
+**GEP**
+    ``GetElementPtr``. An LLVM IR instruction that is used to get the address
+    of a subelement of an aggregate data structure. It is documented in detail
+    `here <http://llvm.org/docs/GetElementPtr.html>`_.
+
+**GVN**
+    Global Value Numbering. GVN is a pass that partitions values computed by a
+    function into congruence classes. Values ending up in the same congruence
+    class are guaranteed to be the same for every execution of the program.
+    In that respect, congruency is a compile-time approximation of equivalence
+    of values at runtime.
+
 H
 -
 
@@ -242,6 +261,14 @@
     Superword-Level Parallelism, same as :ref:`Basic-Block Vectorization
     <lexicon-bb-vectorization>`.
 
+**Splat**
+    Splat refers to a vector of identical scalar elements.
+
+    The term is based on the PowerPC Altivec instructions that provided
+    this functionality in hardware. For example, "vsplth" and the corresponding
+    software intrinsic "vec_splat()". Examples of other hardware names for this
+    action include "duplicate" (ARM) and "broadcast" (x86).
+
 **SRoA**
     Scalar Replacement of Aggregates
 
diff --git a/docs/LibFuzzer.rst b/docs/LibFuzzer.rst
index a75dd38..2ae84af 100644
--- a/docs/LibFuzzer.rst
+++ b/docs/LibFuzzer.rst
@@ -42,10 +42,10 @@
 ``./third_party/llvm-build/Release+Asserts/bin/clang``)
 
 The libFuzzer code resides in the LLVM repository, and requires a recent Clang
-compiler to build (and is used to `fuzz various parts of LLVM itself`_).
-However the fuzzer itself does not (and should not) depend on any part of LLVM
-infrastructure and can be used for other projects without requiring the rest
-of LLVM.
+compiler to build (and is used to :doc:`fuzz various parts of LLVM itself
+<FuzzingLLVM>`).  However the fuzzer itself does not (and should not) depend on
+any part of LLVM infrastructure and can be used for other projects without
+requiring the rest of LLVM.
 
 
 Getting Started
@@ -87,10 +87,28 @@
 * Usually, the narrower the target the better. E.g. if your target can parse several data formats, split it into several targets, one per format.
 
 
-Building
---------
+Fuzzer Usage
+------------
 
-Next, build the libFuzzer library as a static archive, without any sanitizer
+Very recent versions of Clang (after April 20 2017) include libFuzzer,
+and no installation is necessary.
+In order to fuzz your binary, use the ``-fsanitize=fuzzer`` flag during the compilation::
+
+   clang -fsanitize=fuzzer,address mytarget.c
+
+This will perform the necessary instrumentation, as well as linking in the
+libFuzzer library.
+Note that linking in libFuzzer defines the ``main`` symbol.
+If modifying ``CFLAGS`` of a large project, which also compiles executables
+requiring their own ``main`` symbol, it may be desirable to request just the
+instrumentation without linking::
+
+   clang -fsanitize=fuzzer-no-link mytarget.c
+
+Then libFuzzer can be linked to the desired driver by passing in
+``-fsanitize=fuzzer`` during the linking stage.
+
+Otherwise, build the libFuzzer library as a static archive, without any sanitizer
 options. Note that the libFuzzer library contains the ``main()`` function:
 
 .. code-block:: console
@@ -119,6 +137,8 @@
 
   clang -fsanitize-coverage=trace-pc-guard -fsanitize=address your_lib.cc fuzz_target.cc libFuzzer.a -o my_fuzzer
 
+.. _libfuzzer-corpus:
+  
 Corpus
 ------
 
@@ -335,6 +355,9 @@
 ``NEW``
   The fuzzer has created a test input that covers new areas of the code
   under test.  This input will be saved to the primary corpus directory.
+``REDUCE``
+  The fuzzer has found a better (smaller) input that triggers previously
+  discovered features (set ``-reduce_inputs=0`` to disable).
 ``pulse``
   The fuzzer has generated 2\ :sup:`n` inputs (generated periodically to reassure
   the user that the fuzzer is still working).
@@ -533,21 +556,12 @@
 Once you implement your target function ``LLVMFuzzerTestOneInput`` and fuzz it to death,
 you will want to know whether the function or the corpus can be improved further.
 One easy to use metric is, of course, code coverage.
-You can get the coverage for your corpus like this:
 
-.. code-block:: console
+We recommend using
+`Clang Coverage <http://clang.llvm.org/docs/SourceBasedCodeCoverage.html>`_
+to visualize and study your code coverage
+(`example <https://github.com/google/fuzzer-test-suite/blob/master/tutorial/libFuzzerTutorial.md#visualizing-coverage>`_).
 
-  ASAN_OPTIONS=coverage=1 ./fuzzer CORPUS_DIR -runs=0
-
-This will run all tests in the CORPUS_DIR but will not perform any fuzzing.
-At the end of the process it will dump a single ``.sancov`` file with coverage 
-information.  See SanitizerCoverage_ for details on querying the file using the
-``sancov`` tool.
-
-You may also use other ways to visualize coverage,
-e.g. using `Clang coverage <http://clang.llvm.org/docs/SourceBasedCodeCoverage.html>`_,
-but those will require
-you to rebuild the code with different compiler flags.
 
 User-supplied mutators
 ----------------------
@@ -570,7 +584,7 @@
 
 Alternatively, you may define an optional init function and it will receive
 the program arguments that you can read and modify. Do this **only** if you
-realy need to access ``argv``/``argc``.
+really need to access ``argv``/``argc``.
 
 .. code-block:: c++
 
@@ -604,75 +618,17 @@
 Developing libFuzzer
 ====================
 
-Building libFuzzer as a part of LLVM project and running its test requires
-fresh clang as the host compiler and special CMake configuration:
+LibFuzzer is built as a part of the LLVM project by default on macOS and Linux.
+Users of other operating systems can explicitly request compilation using
+the ``-DLIBFUZZER_ENABLE=YES`` flag.
+Tests are run using the ``check-fuzzer`` target from the build directory
+which was configured with the ``-DLIBFUZZER_ENABLE_TESTS=ON`` flag.
 
 .. code-block:: console
 
-    cmake -GNinja  -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLLVM_USE_SANITIZER=Address -DLLVM_USE_SANITIZE_COVERAGE=YES -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON /path/to/llvm
     ninja check-fuzzer
 
 
-Fuzzing components of LLVM
-==========================
-.. contents::
-   :local:
-   :depth: 1
-
-To build any of the LLVM fuzz targets use the build instructions above.
-
-clang-format-fuzzer
--------------------
-The inputs are random pieces of C++-like text.
-
-.. code-block:: console
-
-    ninja clang-format-fuzzer
-    mkdir CORPUS_DIR
-    ./bin/clang-format-fuzzer CORPUS_DIR
-
-Optionally build other kinds of binaries (ASan+Debug, MSan, UBSan, etc).
-
-Tracking bug: https://llvm.org/bugs/show_bug.cgi?id=23052
-
-clang-fuzzer
-------------
-
-The behavior is very similar to ``clang-format-fuzzer``.
-
-Tracking bug: https://llvm.org/bugs/show_bug.cgi?id=23057
-
-llvm-as-fuzzer
---------------
-
-Tracking bug: https://llvm.org/bugs/show_bug.cgi?id=24639
-
-llvm-mc-fuzzer
---------------
-
-This tool fuzzes the MC layer. Currently it is only able to fuzz the
-disassembler but it is hoped that assembly, and round-trip verification will be
-added in future.
-
-When run in dissassembly mode, the inputs are opcodes to be disassembled. The
-fuzzer will consume as many instructions as possible and will stop when it
-finds an invalid instruction or runs out of data.
-
-Please note that the command line interface differs slightly from that of other
-fuzzers. The fuzzer arguments should follow ``--fuzzer-args`` and should have
-a single dash, while other arguments control the operation mode and target in a
-similar manner to ``llvm-mc`` and should have two dashes. For example:
-
-.. code-block:: console
-
-  llvm-mc-fuzzer --triple=aarch64-linux-gnu --disassemble --fuzzer-args -max_len=4 -jobs=10
-
-Buildbot
---------
-
-A buildbot continuously runs the above fuzzers for LLVM components, with results
-shown at http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-fuzzer .
-
 FAQ
 =========================
 
@@ -728,6 +684,7 @@
 Examples: regular expression matchers, text or binary format parsers, compression,
 network, crypto.
 
+
 Trophies
 ========
 * GLIBC: https://sourceware.org/glibc/wiki/FuzzingLibc
@@ -772,6 +729,10 @@
 
 * Ffmpeg: `[1] <https://github.com/FFmpeg/FFmpeg/commit/c92f55847a3d9cd12db60bfcd0831ff7f089c37c>`__  `[2] <https://github.com/FFmpeg/FFmpeg/commit/25ab1a65f3acb5ec67b53fb7a2463a7368f1ad16>`__  `[3] <https://github.com/FFmpeg/FFmpeg/commit/85d23e5cbc9ad6835eef870a5b4247de78febe56>`__ `[4] <https://github.com/FFmpeg/FFmpeg/commit/04bd1b38ee6b8df410d0ab8d4949546b6c4af26a>`__
 
+* `Wireshark <https://bugs.wireshark.org/bugzilla/buglist.cgi?bug_status=UNCONFIRMED&bug_status=CONFIRMED&bug_status=IN_PROGRESS&bug_status=INCOMPLETE&bug_status=RESOLVED&bug_status=VERIFIED&f0=OP&f1=OP&f2=product&f3=component&f4=alias&f5=short_desc&f7=content&f8=CP&f9=CP&j1=OR&o2=substring&o3=substring&o4=substring&o5=substring&o6=substring&o7=matches&order=bug_id%20DESC&query_format=advanced&v2=libfuzzer&v3=libfuzzer&v4=libfuzzer&v5=libfuzzer&v6=libfuzzer&v7=%22libfuzzer%22>`_
+
+* `QEMU <https://researchcenter.paloaltonetworks.com/2017/09/unit42-palo-alto-networks-discovers-new-qemu-vulnerability/>`_
+
 .. _pcre2: http://www.pcre.org/
 .. _AFL: http://lcamtuf.coredump.cx/afl/
 .. _Radamsa: https://github.com/aoh/radamsa
@@ -789,4 +750,4 @@
 .. _`value profile`: #value-profile
 .. _`caller-callee pairs`: http://clang.llvm.org/docs/SanitizerCoverage.html#caller-callee-coverage
 .. _BoringSSL: https://boringssl.googlesource.com/boringssl/
-.. _`fuzz various parts of LLVM itself`: `Fuzzing components of LLVM`_
+
diff --git a/docs/MIRLangRef.rst b/docs/MIRLangRef.rst
index f6ee6cc..b4ca8f2 100644
--- a/docs/MIRLangRef.rst
+++ b/docs/MIRLangRef.rst
@@ -39,37 +39,87 @@
 You can use the MIR format for testing in two different ways:
 
 - You can write MIR tests that invoke a single code generation pass using the
-  ``run-pass`` option in llc.
+  ``-run-pass`` option in llc.
 
-- You can use llc's ``stop-after`` option with existing or new LLVM assembly
+- You can use llc's ``-stop-after`` option with existing or new LLVM assembly
   tests and check the MIR output of a specific code generation pass.
 
 Testing Individual Code Generation Passes
 -----------------------------------------
 
-The ``run-pass`` option in llc allows you to create MIR tests that invoke
-just a single code generation pass. When this option is used, llc will parse
-an input MIR file, run the specified code generation pass, and print the
-resulting MIR to the standard output stream.
+The ``-run-pass`` option in llc allows you to create MIR tests that invoke just
+a single code generation pass. When this option is used, llc will parse an
+input MIR file, run the specified code generation pass(es), and output the
+resulting MIR code.
 
-You can generate an input MIR file for the test by using the ``stop-after``
-option in llc. For example, if you would like to write a test for the
-post register allocation pseudo instruction expansion pass, you can specify
-the machine copy propagation pass in the ``stop-after`` option, as it runs
-just before the pass that we are trying to test:
+You can generate an input MIR file for the test by using the ``-stop-after`` or
+``-stop-before`` option in llc. For example, if you would like to write a test
+for the post register allocation pseudo instruction expansion pass, you can
+specify the machine copy propagation pass in the ``-stop-after`` option, as it
+runs just before the pass that we are trying to test:
 
-   ``llc -stop-after machine-cp bug-trigger.ll > test.mir``
+   ``llc -stop-after=machine-cp bug-trigger.ll > test.mir``
 
 After generating the input MIR file, you'll have to add a run line that uses
 the ``-run-pass`` option to it. In order to test the post register allocation
 pseudo instruction expansion pass on X86-64, a run line like the one shown
 below can be used:
 
-    ``# RUN: llc -run-pass postrapseudos -march=x86-64 %s -o /dev/null | FileCheck %s``
+    ``# RUN: llc -o - %s -mtriple=x86_64-- -run-pass=postrapseudos | FileCheck %s``
 
 The MIR files are target dependent, so they have to be placed in the target
-specific test directories. They also need to specify a target triple or a
-target architecture either in the run line or in the embedded LLVM IR module.
+specific test directories (``test/CodeGen/TARGETNAME``). They also need to
+specify a target triple or a target architecture either in the run line or in
+the embedded LLVM IR module.
+
+Simplifying MIR files
+^^^^^^^^^^^^^^^^^^^^^
+
+The MIR code coming out of ``-stop-after``/``-stop-before`` is very verbose;
+tests are more accessible and future-proof when simplified:
+
+- Use the ``-simplify-mir`` option with llc.
+
+- Machine function attributes often have default values or the test works just
+  as well with default values. Typical candidates for this are: `alignment:`,
+  `exposesReturnsTwice`, `legalized`, `regBankSelected`, `selected`.
+  The whole `frameInfo` section is often unnecessary if there is no special
+  frame usage in the function. `tracksRegLiveness` on the other hand is often
+  necessary for some passes that care about block livein lists.
+
+- The (global) `liveins:` list is typically only interesting for early
+  instruction selection passes and can be removed when testing later passes.
+  The per-block `liveins:` on the other hand are necessary if
+  `tracksRegLiveness` is true.
+
+- Branch probability data in block `successors:` lists can be dropped if the
+  test doesn't depend on it. Example:
+  `successors: %bb.1(0x40000000), %bb.2(0x40000000)` can be replaced with
+  `successors: %bb.1, %bb.2`.
+
+- MIR code contains a whole IR module. This is necessary because there are
+  no equivalents in MIR for global variables, references to external functions,
+  function attributes, metadata, debug info. Instead some MIR data references
+  the IR constructs. You can often remove them if the test doesn't depend on
+  them.
+
+- Alias Analysis is performed on IR values. These are referenced by memory
+  operands in MIR. Example: `:: (load 8 from %ir.foobar, !alias.scope !9)`.
+  If the test doesn't depend on (good) alias analysis the references can be
+  dropped: `:: (load 8)`
+
+- MIR blocks can reference IR blocks for debug printing, profile information
+  or debug locations. Example: `bb.42.myblock` in MIR references the IR block
+  `myblock`. It is usually possible to drop the `.myblock` reference and simply
+  use `bb.42`.
+
+- If there are no memory operands or blocks referencing the IR then the
+  IR function can be replaced by a parameterless dummy function like
+  `define void @func() { ret void }`.
+
+- It is possible to drop the whole IR section of the MIR file if it only
+  contains dummy functions (see above). The .mir loader will create the
+  IR functions automatically in this case.
 
 Limitations
 -----------
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 8d1984b..cc8484c 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -54,7 +54,8 @@
 To get a full diff, use one of the following commands (or just use Arcanist
 to upload your patch):
 
-* ``git diff -U999999 other-branch``
+* ``git show HEAD -U999999 > mypatch.patch``
+* ``git format-patch -U999999 @{u}``
 * ``svn diff --diff-cmd=diff -x -U999999``
 
 To upload a new patch:
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index decac60..719d399 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -32,7 +32,7 @@
 Core LLVM classes.  In the future this manual will be extended with information
 describing how to use extension libraries, such as dominator information, CFG
 traversal routines, and useful utilities like the ``InstVisitor`` (`doxygen
-<http://llvm.org/doxygen/InstVisitor_8h-source.html>`__) template.
+<http://llvm.org/doxygen/InstVisitor_8h_source.html>`__) template.
 
 .. _general:
 
@@ -108,7 +108,7 @@
 ``dynamic_cast<>`` only works on classes that have a v-table).  Because they are
 used so often, you must know what they do and how they work.  All of these
 templates are defined in the ``llvm/Support/Casting.h`` (`doxygen
-<http://llvm.org/doxygen/Casting_8h-source.html>`__) file (note that you very
+<http://llvm.org/doxygen/Casting_8h_source.html>`__) file (note that you very
 rarely have to include this file directly).
 
 ``isa<>``:
@@ -225,7 +225,7 @@
 Similarly, APIs which need to return a string may return a ``StringRef``
 instance, which can be used directly or converted to an ``std::string`` using
 the ``str`` member function.  See ``llvm/ADT/StringRef.h`` (`doxygen
-<http://llvm.org/doxygen/classllvm_1_1StringRef_8h-source.html>`__) for more
+<http://llvm.org/doxygen/StringRef_8h_source.html>`__) for more
 information.
 
 You should rarely use the ``StringRef`` class directly, because it contains
@@ -441,6 +441,15 @@
 as simple as reporting the issue to the user, or it may involve attempts at
 recovery.
 
+.. note::
+
+   While it would be ideal to use this error handling scheme throughout
+   LLVM, there are places where this hasn't been practical to apply. In
+   situations where you absolutely must emit a non-programmatic error and
+   the ``Error`` model isn't workable you can call ``report_fatal_error``,
+   which will call installed error handlers, print a message, and exit the
+   program.
+
 Recoverable errors are modeled using LLVM's ``Error`` scheme. This scheme
 represents errors using function return values, similar to classic C integer
 error codes, or C++'s ``std::error_code``. However, the ``Error`` class is
@@ -486,7 +495,7 @@
 
   Error printFormattedFile(StringRef Path) {
     if (<check for valid format>)
-      return make_error<InvalidObjectFile>(Path);
+      return make_error<BadFileFormat>(Path);
     // print file contents.
     return Error::success();
   }
@@ -776,22 +785,21 @@
 Using cantFail to simplify safe callsites
 """""""""""""""""""""""""""""""""""""""""
 
-Some functions may only fail for a subset of their inputs. For such functions
-call-sites using known-safe inputs can assume that the result will be a success
-value.
+Some functions may only fail for a subset of their inputs, so calls using known
+safe inputs can be assumed to succeed.
 
 The cantFail functions encapsulate this by wrapping an assertion that their
 argument is a success value and, in the case of Expected<T>, unwrapping the
-T value from the Expected<T> argument:
+T value:
 
 .. code-block:: c++
 
-  Error mayFail(int X);
-  Expected<int> mayFail2(int X);
+  Error onlyFailsForSomeXValues(int X);
+  Expected<int> onlyFailsForSomeXValues2(int X);
 
   void foo() {
-    cantFail(mayFail(KnownSafeValue));
-    int Y = cantFail(mayFail2(KnownSafeValue));
+    cantFail(onlyFailsForSomeXValues(KnownSafeValue));
+    int Y = cantFail(onlyFailsForSomeXValues2(KnownSafeValue));
     ...
   }
 
@@ -801,8 +809,8 @@
 is success. In debug builds this will result in an assertion failure if an error
 is encountered. In release builds the behavior of cantFail for failure values is
 undefined. As such, care must be taken in the use of cantFail: clients must be
-certain that a cantFail wrapped call really can not fail under any
-circumstances.
+certain that a cantFail-wrapped call really cannot fail with the given
+arguments.
 
 Use of the cantFail functions should be rare in library code, but they are
 likely to be of more use in tool and unit-test code where inputs and/or
@@ -974,7 +982,7 @@
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The ``function_ref``
-(`doxygen <http://llvm.org/docs/doxygen/html/classllvm_1_1function__ref_3_01Ret_07Params_8_8_8_08_4.html>`__) class
+(`doxygen <http://llvm.org/doxygen/classllvm_1_1function__ref_3_01Ret_07Params_8_8_8_08_4.html>`__) class
 template represents a reference to a callable object, templated over the type
 of the callable. This is a good choice for passing a callback to a function,
 if you don't need to hold onto the callback after the function returns. In this
@@ -1024,7 +1032,7 @@
 them out, allowing you to enable them if you need them in the future.
 
 The ``llvm/Support/Debug.h`` (`doxygen
-<http://llvm.org/doxygen/Debug_8h-source.html>`__) file provides a macro named
+<http://llvm.org/doxygen/Debug_8h_source.html>`__) file provides a macro named
 ``DEBUG()`` that is a much nicer solution to this problem.  Basically, you can
 put arbitrary code into the argument of the ``DEBUG`` macro, and it is only
 executed if '``opt``' (or any other tool) is run with the '``-debug``' command
@@ -1121,7 +1129,7 @@
 -------------------------------------------
 
 The ``llvm/ADT/Statistic.h`` (`doxygen
-<http://llvm.org/doxygen/Statistic_8h-source.html>`__) file provides a class
+<http://llvm.org/doxygen/Statistic_8h_source.html>`__) file provides a class
 named ``Statistic`` that is used as a unified way to keep track of what the LLVM
 compiler is doing and how effective various optimizations are.  It is useful to
 see what optimizations are contributing to making a particular program run
@@ -1225,7 +1233,7 @@
 .. code-block:: c++
 
   DEBUG_COUNTER(DeleteAnInstruction, "passname-delete-instruction",
-		"Controls which instructions get delete").
+		"Controls which instructions get delete");
 
 The ``DEBUG_COUNTER`` macro defines a static variable, whose name
 is specified by the first argument.  The name of the counter
@@ -2106,7 +2114,7 @@
 StringMap also provides query methods that take byte ranges, so it only ever
 copies a string if a value is inserted into the table.
 
-StringMap iteratation order, however, is not guaranteed to be deterministic, so
+StringMap iteration order, however, is not guaranteed to be deterministic, so
 any uses which require that should instead use a std::map.
 
 .. _dss_indexmap:
@@ -2819,7 +2827,7 @@
 """""""""""""""""""""""""""""""""
 
 Including "`llvm/Transforms/Utils/BasicBlockUtils.h
-<http://llvm.org/doxygen/BasicBlockUtils_8h-source.html>`_" permits use of two
+<http://llvm.org/doxygen/BasicBlockUtils_8h_source.html>`_" permits use of two
 very useful replace functions: ``ReplaceInstWithValue`` and
 ``ReplaceInstWithInst``.
 
@@ -2915,7 +2923,7 @@
   FunctionType *ft = FunctionType::get(Type::Int8Ty, params, false);
 
 See the `class comment
-<http://llvm.org/doxygen/TypeBuilder_8h-source.html#l00001>`_ for more details.
+<http://llvm.org/doxygen/TypeBuilder_8h_source.html#l00001>`_ for more details.
 
 .. _threading:
 
@@ -3336,7 +3344,7 @@
 
 ``#include "llvm/IR/Type.h"``
 
-header source: `Type.h <http://llvm.org/doxygen/Type_8h-source.html>`_
+header source: `Type.h <http://llvm.org/doxygen/Type_8h_source.html>`_
 
 doxygen info: `Type Clases <http://llvm.org/doxygen/classllvm_1_1Type.html>`_
 
@@ -3440,7 +3448,7 @@
 
 ``#include "llvm/IR/Module.h"``
 
-header source: `Module.h <http://llvm.org/doxygen/Module_8h-source.html>`_
+header source: `Module.h <http://llvm.org/doxygen/Module_8h_source.html>`_
 
 doxygen info: `Module Class <http://llvm.org/doxygen/classllvm_1_1Module.html>`_
 
@@ -3527,7 +3535,7 @@
 
 ``#include "llvm/IR/Value.h"``
 
-header source: `Value.h <http://llvm.org/doxygen/Value_8h-source.html>`_
+header source: `Value.h <http://llvm.org/doxygen/Value_8h_source.html>`_
 
 doxygen info: `Value Class <http://llvm.org/doxygen/classllvm_1_1Value.html>`_
 
@@ -3618,7 +3626,7 @@
 
 ``#include "llvm/IR/User.h"``
 
-header source: `User.h <http://llvm.org/doxygen/User_8h-source.html>`_
+header source: `User.h <http://llvm.org/doxygen/User_8h_source.html>`_
 
 doxygen info: `User Class <http://llvm.org/doxygen/classllvm_1_1User.html>`_
 
@@ -3665,7 +3673,7 @@
 ``#include "llvm/IR/Instruction.h"``
 
 header source: `Instruction.h
-<http://llvm.org/doxygen/Instruction_8h-source.html>`_
+<http://llvm.org/doxygen/Instruction_8h_source.html>`_
 
 doxygen info: `Instruction Class
 <http://llvm.org/doxygen/classllvm_1_1Instruction.html>`_
@@ -3813,7 +3821,7 @@
 ``#include "llvm/IR/GlobalValue.h"``
 
 header source: `GlobalValue.h
-<http://llvm.org/doxygen/GlobalValue_8h-source.html>`_
+<http://llvm.org/doxygen/GlobalValue_8h_source.html>`_
 
 doxygen info: `GlobalValue Class
 <http://llvm.org/doxygen/classllvm_1_1GlobalValue.html>`_
@@ -3871,7 +3879,7 @@
 
 ``#include "llvm/IR/Function.h"``
 
-header source: `Function.h <http://llvm.org/doxygen/Function_8h-source.html>`_
+header source: `Function.h <http://llvm.org/doxygen/Function_8h_source.html>`_
 
 doxygen info: `Function Class
 <http://llvm.org/doxygen/classllvm_1_1Function.html>`_
@@ -3980,7 +3988,7 @@
 ``#include "llvm/IR/GlobalVariable.h"``
 
 header source: `GlobalVariable.h
-<http://llvm.org/doxygen/GlobalVariable_8h-source.html>`_
+<http://llvm.org/doxygen/GlobalVariable_8h_source.html>`_
 
 doxygen info: `GlobalVariable Class
 <http://llvm.org/doxygen/classllvm_1_1GlobalVariable.html>`_
@@ -4038,7 +4046,7 @@
 ``#include "llvm/IR/BasicBlock.h"``
 
 header source: `BasicBlock.h
-<http://llvm.org/doxygen/BasicBlock_8h-source.html>`_
+<http://llvm.org/doxygen/BasicBlock_8h_source.html>`_
 
 doxygen info: `BasicBlock Class
 <http://llvm.org/doxygen/classllvm_1_1BasicBlock.html>`_
diff --git a/docs/Proposals/VectorizationPlan.rst b/docs/Proposals/VectorizationPlan.rst
new file mode 100644
index 0000000..aed8e3d
--- /dev/null
+++ b/docs/Proposals/VectorizationPlan.rst
@@ -0,0 +1,182 @@
+==================
+Vectorization Plan
+==================
+
+.. contents::
+   :local:
+
+Abstract
+========
+The vectorization transformation can be rather complicated, involving several
+potential alternatives, especially for outer-loops [1]_ but also possibly for
+innermost loops. These alternatives may have significant performance impact,
+both positive and negative. A cost model is therefore employed to identify the
+best alternative, including the alternative of avoiding any transformation
+altogether.
+
+The Vectorization Plan is an explicit model for describing vectorization
+candidates. It serves both for optimizing candidates, including estimating
+their cost reliably, and for performing their final translation into IR. This
+facilitates dealing with multiple vectorization candidates.
+
+High-level Design
+=================
+
+Vectorization Workflow
+----------------------
+VPlan-based vectorization involves three major steps, taking a "scenario-based
+approach" to vectorization planning:
+
+1. Legal Step: check if a loop can be legally vectorized; encode constraints and
+   artifacts if so.
+2. Plan Step:
+
+   a. Build initial VPlans following the constraints and decisions taken by
+      Legal Step 1, and compute their cost.
+   b. Apply optimizations to the VPlans, possibly forking additional VPlans.
+      Prune sub-optimal VPlans having relatively high cost.
+3. Execute Step: materialize the best VPlan. Note that this is the only step
+   that modifies the IR.
+
+Design Guidelines
+-----------------
+In what follows, the term "input IR" refers to code that is fed into the
+vectorizer whereas the term "output IR" refers to code that is generated by the
+vectorizer. The output IR contains code that has been vectorized or "widened"
+according to a loop Vectorization Factor (VF), and/or loop unroll-and-jammed
+according to an Unroll Factor (UF).
+The design of VPlan follows several high-level guidelines:
+
+1. Analysis-like: building and manipulating VPlans must not modify the input IR.
+   In particular, if the best option is not to vectorize at all, the
+   vectorization process terminates before reaching Step 3, and compilation
+   should proceed as if VPlans had not been built.
+
+2. Align Cost & Execute: each VPlan must support both estimating the cost and
+   generating the output IR code, such that the cost estimation evaluates the
+   to-be-generated code reliably.
+
+3. Support vectorizing additional constructs:
+
+   a. Outer-loop vectorization. In particular, VPlan must be able to model the
+      control-flow of the output IR which may include multiple basic-blocks and
+      nested loops.
+   b. SLP vectorization.
+   c. Combinations of the above, including nested vectorization: vectorizing
+      both an inner loop and an outer-loop at the same time (each with its own
+      VF and UF), mixed vectorization: vectorizing a loop with SLP patterns
+      inside [4]_, (re)vectorizing input IR containing vector code.
+   d. Function vectorization [2]_.
+
+4. Support multiple candidates efficiently. In particular, similar candidates
+   related to a range of possible VF's and UF's must be represented efficiently.
+   Potential versioning needs to be supported efficiently.
+
+5. Support vectorizing idioms, such as interleaved groups of strided loads or
+   stores. This is achieved by modeling a sequence of output instructions using
+   a "Recipe", which is responsible for computing its cost and generating its
+   code.
+
+6. Encapsulate Single-Entry Single-Exit regions (SESE). During vectorization
+   such regions may need to be, for example, predicated and linearized, or
+   replicated VF*UF times to handle scalarized and predicated instructions.
+   Inner loops are also modelled as SESE regions.
+
+Low-level Design
+================
+The low-level design of VPlan comprises the following classes.
+
+:LoopVectorizationPlanner:
+  A LoopVectorizationPlanner is designed to handle the vectorization of a loop
+  or a loop nest. It can construct, optimize and discard one or more VPlans,
+  each VPlan modelling a distinct way to vectorize the loop or the loop nest.
+  Once the best VPlan is determined, including the best VF and UF, this VPlan
+  drives the generation of output IR.
+
+:VPlan:
+  A model of a vectorized candidate for a given input IR loop or loop nest. This
+  candidate is represented using a Hierarchical CFG. VPlan supports estimating
+  the cost and driving the generation of the output IR code it represents.
+
+:Hierarchical CFG:
+  A control-flow graph whose nodes are basic-blocks or Hierarchical CFG's. The
+  Hierarchical CFG data structure is similar to the Tile Tree [5]_, where
+  cross-Tile edges are lifted to connect Tiles instead of the original
+  basic-blocks as in Sharir [6]_, promoting the Tile encapsulation. The terms
+  Region and Block are used rather than Tile [5]_ to avoid confusion with loop
+  tiling.
+
+:VPBlockBase:
+  The building block of the Hierarchical CFG. A pure-virtual base-class of
+  VPBasicBlock and VPRegionBlock, see below. VPBlockBase models the hierarchical
+  control-flow relations with other VPBlocks. Note that in contrast to the IR
+  BasicBlock, a VPBlockBase models its control-flow successors and predecessors
+  directly, rather than through a Terminator branch or through predecessor
+  branches that "use" the VPBlockBase.
+
+:VPBasicBlock:
+  VPBasicBlock is a subclass of VPBlockBase, and serves as the leaves of the
+  Hierarchical CFG. It represents a sequence of output IR instructions that will
+  appear consecutively in an output IR basic-block. The instructions of this
+  basic-block originate from one or more VPBasicBlocks. VPBasicBlock holds a
+  sequence of zero or more VPRecipes that model the cost and generation of the
+  output IR instructions.
+
+:VPRegionBlock:
+  VPRegionBlock is a subclass of VPBlockBase. It models a collection of
+  VPBasicBlocks and VPRegionBlocks which form a SESE subgraph of the output IR
+  CFG. A VPRegionBlock may indicate that its contents are to be replicated a
+  constant number of times when output IR is generated, effectively representing
+  a loop with constant trip-count that will be completely unrolled. This is used
+  to support scalarized and predicated instructions with a single model for
+  multiple candidate VF's and UF's.
+
+:VPRecipeBase:
+  A pure-virtual base class modeling a sequence of one or more output IR
+  instructions, possibly based on one or more input IR instructions. These
+  input IR instructions are referred to as "Ingredients" of the Recipe. A Recipe
+  may specify how its ingredients are to be transformed to produce the output IR
+  instructions; e.g., cloned once, replicated multiple times or widened
+  according to selected VF.
+
+:VPTransformState:
+  Stores information used for generating output IR, passed from
+  LoopVectorizationPlanner to its selected VPlan for execution, and used to pass
+  additional information down to VPBlocks and VPRecipes.
+
+Related LLVM components
+-----------------------
+1. SLP Vectorizer: one can compare the VPlan model with LLVM's existing SLP
+   tree, where TSLP [3]_ adds Plan Step 2.b.
+
+2. RegionInfo: one can compare VPlan's H-CFG with the Region Analysis as used by
+   Polly [7]_.
+
+References
+----------
+.. [1] "Outer-loop vectorization: revisited for short SIMD architectures", Dorit
+    Nuzman and Ayal Zaks, PACT 2008.
+
+.. [2] "Proposal for function vectorization and loop vec