AMDGPU: Add pass to lower kernel arguments to loads

This replaces most argument uses with loads, though not yet
all of them.

The code in SelectionDAG for calling convention lowering
is actively harmful for amdgpu_kernel. It attempts to
split the argument types into register-legal types, which
results in low-quality code for arbitrary types. Since
all kernel arguments are passed in memory, we just want the
raw types.
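
As a sketch of the intended output (the offsets, alignment, and
attributes here are illustrative and depend on the subtarget ABI,
so this is not exact pass output), a trivial kernel like

  define amdgpu_kernel void @f(i32 %x) {
    store i32 %x, i32 addrspace(1)* undef
    ret void
  }

becomes roughly

  define amdgpu_kernel void @f(i32 %x) {
    %f.kernarg.segment = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    %x.kernarg.offset = bitcast i8 addrspace(4)* %f.kernarg.segment to i32 addrspace(4)*
    %x.load = load i32, i32 addrspace(4)* %x.kernarg.offset, align 16, !invariant.load !0
    store i32 %x.load, i32 addrspace(1)* undef
    ret void
  }
  !0 = !{}

with the original argument left in place but now unused.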

I've tried a couple of ways of mitigating this in SelectionDAG,
but it's easier to just bypass the problem altogether. It's
possible to hack around it in the initial lowering, but the
real problem is that the DAG then expects to be able to use
CopyToReg/CopyFromReg for uses of the arguments outside the block.

Exposing the argument loads in the IR also has the advantage
that the LoadStoreVectorizer can merge them.
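
For example, two dword arguments whose loads end up adjacent
can be combined (a sketch; the merge is done by the vectorizer,
not by this pass):

  %a.load = load i32, i32 addrspace(4)* %a.kernarg.offset, align 16, !invariant.load !0
  %b.load = load i32, i32 addrspace(4)* %b.kernarg.offset, align 4, !invariant.load !0

can become a single load <2 x i32> from the base offset, which
selects to one s_load_dwordx2 instead of two s_load_dword.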

I'm not sure what the best approach to dealing with the IR
argument list is. The patch as-is just leaves the IR arguments
in place, so all the existing code will still compute the same
kernarg size and pointlessly lower the arguments.

Arguably the frontend should emit kernels with an empty argument
list in the first place. Alternatively, a dummy array could be
inserted as a single argument just to reserve space.

This does have some disadvantages. Local pointer kernel arguments can
no longer have AssertZext placed on them, as the equivalent !range
metadata is not valid on pointer-typed loads. This is mostly bad
for SI, which needs to know the known bits in order to use the
DS instruction offset, so in that case the lowering is skipped.
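
For reference, !range is only defined for loads of integer (or
integer vector) type, e.g.:

  %x = load i32, i32 addrspace(4)* %p, align 4, !range !0
  !0 = !{i32 0, i32 65536}

and there is no way to attach the same known-bits fact to a load
of an i32 addrspace(3)* argument, hence the SI exception.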

More importantly, this skips noalias arguments, since this pass
does not yet convert them to the equivalent !alias.scope and !noalias
metadata. Producing this metadata correctly seems to be tricky,
although logically it is the same as inlining into a function that
doesn't exist. Additionally, exposing these loads to the vectorizer
may result in degraded aliasing information if a pointer load is
merged with another argument load.
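
The metadata this would need to produce is the same form the
inliner emits for noalias parameters, roughly:

  !0 = !{!0}      ; self-referential alias domain
  !1 = !{!1, !0}  ; scope for the noalias argument
  ; loads/stores through the argument get !alias.scope !2, !2 = !{!1}
  ; every other memory access gets !noalias !2

but deciding which accesses get which scope lists is the tricky part.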

I'm also not entirely sure this preserves the current clover
ABI, although I would greatly prefer it stop widening arguments
and match the HSA ABI instead. As-is, I think it extends
sub-4-byte arguments to 4 bytes but doesn't align them to 4 bytes.
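
Concretely, the pass pads a clover ext argument of type iN out to
32 bits by giving it a [32/N x iN] slot in the kernarg struct, so a
kernel taking (i8, i8) gets the layout

  { [4 x i8], [4 x i8] }

where each argument occupies 4 bytes (offsets 0 and 4) but only has
the element type's 1-byte alignment.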

llvm-svn: 335650
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ef7b897..796766d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -73,6 +73,10 @@
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+FunctionPass *createAMDGPULowerKernelArgumentsPass();
+void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPULowerKernelArgumentsID;
+
 ModulePass *createAMDGPULowerKernelAttributesPass();
 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
 extern char &AMDGPULowerKernelAttributesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
new file mode 100644
index 0000000..e37e6bc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -0,0 +1,267 @@
+//===-- AMDGPULowerKernelArguments.cpp -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass replaces accesses to kernel arguments with loads from
+/// offsets from the kernarg base pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerKernelArguments : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.setPreservesAll();
+  }
+};
+
+} // end anonymous namespace
+
+bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+    return false;
+
+  auto &TPC = getAnalysis<TargetPassConfig>();
+
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  const SISubtarget &ST = TM.getSubtarget<SISubtarget>(F);
+  LLVMContext &Ctx = F.getParent()->getContext();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  BasicBlock &EntryBlock = *F.begin();
+  IRBuilder<> Builder(&*EntryBlock.begin());
+
+  SmallVector<Type *, 16> ArgTypes;
+  for (Argument &Arg : F.args()) {
+    Type *ArgTy = Arg.getType();
+    unsigned Size = DL.getTypeStoreSizeInBits(ArgTy);
+    bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
+                    !ST.isAmdHsaOS();
+
+    // Clover seems to always pad i8/i16 to i32, but doesn't properly align
+    // them?
+    // Make sure the struct elements have correct size and alignment for ext
+    // args. These seem to be padded up to 4 bytes but not correctly aligned.
+    ArgTypes.push_back(
+      IsExtArg ? ArrayType::get(ArgTy, 32 / Size) : Arg.getType());
+  }
+
+  StructType *ArgStructTy = StructType::create(Ctx, ArgTypes, F.getName());
+  const StructLayout *Layout = DL.getStructLayout(ArgStructTy);
+
+  // Minimum alignment for the kernarg segment is 16.
+  unsigned KernArgBaseAlign = std::max(16u, DL.getABITypeAlignment(ArgStructTy));
+  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
+
+  // FIXME: Alignment is broken with an explicit arg offset.
+  const uint64_t TotalKernArgSize = BaseOffset +
+    ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy));
+
+  CallInst *KernArgSegment =
+    Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
+                            F.getName() + ".kernarg.segment");
+
+  KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+  KernArgSegment->addAttribute(AttributeList::ReturnIndex,
+    Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
+  KernArgSegment->addAttribute(AttributeList::ReturnIndex,
+    Attribute::getWithAlignment(Ctx, KernArgBaseAlign));
+
+  Value *KernArgBase = KernArgSegment;
+  if (BaseOffset != 0) {
+    KernArgBase = Builder.CreateConstInBoundsGEP1_64(KernArgBase, BaseOffset);
+    KernArgBaseAlign = MinAlign(KernArgBaseAlign, BaseOffset);
+  }
+
+  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
+  Value *CastStruct = Builder.CreateBitCast(KernArgBase,
+                                            ArgStructTy->getPointerTo(AS));
+  for (Argument &Arg : F.args()) {
+    if (Arg.use_empty())
+      continue;
+
+    Type *ArgTy = Arg.getType();
+    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
+      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
+      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
+      // can't represent this with range metadata because it's only allowed for
+      // integer types.
+      if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+          ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+        continue;
+
+      // FIXME: We can replace this with equivalent alias.scope/noalias
+      // metadata, but this appears to be a lot of work.
+      if (Arg.hasNoAliasAttr())
+        continue;
+    }
+
+    VectorType *VT = dyn_cast<VectorType>(ArgTy);
+    bool IsV3 = VT && VT->getNumElements() == 3;
+    VectorType *V4Ty = nullptr;
+
+    unsigned Size = DL.getTypeSizeInBits(ArgTy);
+    bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
+                    !ST.isAmdHsaOS();
+    int64_t EltOffset = Layout->getElementOffset(Arg.getArgNo());
+    int64_t AlignDownOffset = alignDown(EltOffset, 4);
+    int64_t OffsetDiff = EltOffset - AlignDownOffset;
+    unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+
+    Value *ArgPtr;
+    if (Size < 32) {
+      // Since we don't have sub-dword scalar loads, avoid doing an extload by
+      // loading earlier than the argument address, and extracting the relevant
+      // bits.
+      //
+      // Additionally widen any sub-dword load to i32 even if suitably aligned,
+      // so that CSE between different argument loads works easily.
+
+      ArgPtr = Builder.CreateConstGEP1_64(KernArgBase, AlignDownOffset);
+      ArgPtr = Builder.CreateBitCast(
+        ArgPtr,
+        Builder.getInt32Ty()->getPointerTo(AS),
+        Arg.getName() + ".kernarg.offset.align.down");
+    } else {
+      ArgPtr = Builder.CreateStructGEP(CastStruct, Arg.getArgNo(),
+                                       Arg.getName() + ".kernarg.offset");
+    }
+
+    assert((!IsExtArg || !IsV3) && "incompatible situation");
+
+
+    if (IsV3 && Size >= 32) {
+      V4Ty = VectorType::get(VT->getVectorElementType(), 4);
+      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
+      ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+    }
+
+    LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
+
+    MDBuilder MDB(Ctx);
+
+    if (isa<PointerType>(ArgTy)) {
+      if (Arg.hasNonNullAttr())
+        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+
+      uint64_t DerefBytes = Arg.getDereferenceableBytes();
+      if (DerefBytes != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_dereferenceable,
+          MDNode::get(Ctx,
+                      MDB.createConstant(
+                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
+      }
+
+      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
+      if (DerefOrNullBytes != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_dereferenceable_or_null,
+          MDNode::get(Ctx,
+                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+                                                          DerefOrNullBytes))));
+      }
+
+      unsigned ParamAlign = Arg.getParamAlignment();
+      if (ParamAlign != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_align,
+          MDNode::get(Ctx,
+                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+                                                          ParamAlign))));
+      }
+    }
+
+    // TODO: Convert noalias arg to !noalias
+
+    if (Size < 32) {
+      if (IsExtArg && OffsetDiff == 0) {
+        Type *I32Ty = Builder.getInt32Ty();
+        bool IsSext = Arg.hasSExtAttr();
+        Metadata *LowAndHigh[] = {
+          ConstantAsMetadata::get(
+            ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
+          ConstantAsMetadata::get(
+            ConstantInt::get(I32Ty,
+                             IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
+        };
+
+        Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
+      }
+
+      Value *ExtractBits = OffsetDiff == 0 ?
+        Load : Builder.CreateLShr(Load, OffsetDiff * 8);
+
+      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
+      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
+      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
+                                            Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(NewVal);
+    } else if (IsV3) {
+      Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
+                                                {0, 1, 2},
+                                                Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(Shuf);
+    } else {
+      Load->setName(Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(Load);
+    }
+  }
+
+  return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
+                      "AMDGPU Lower Kernel Arguments", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
+                    "AMDGPU Lower Kernel Arguments", false, false)
+
+char AMDGPULowerKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
+  return new AMDGPULowerKernelArguments();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d361f51..6643c1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -130,6 +130,12 @@
   cl::init(true),
   cl::Hidden);
 
+static cl::opt<bool> EnableLowerKernelArguments(
+  "amdgpu-ir-lower-kernel-arguments",
+  cl::desc("Lower kernel argument loads in IR pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -155,6 +161,7 @@
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
+  initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@@ -669,6 +676,10 @@
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
+      EnableLowerKernelArguments)
+    addPass(createAMDGPULowerKernelArgumentsPass());
+
   TargetPassConfig::addCodeGenPrepare();
 
   if (EnableLoadStoreVectorizer)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index fce3c8d..f2ea3a1 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -40,6 +40,7 @@
   AMDGPULibCalls.cpp
   AMDGPULibFunc.cpp
   AMDGPULowerIntrinsics.cpp
+  AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 2097d51..ff33a6e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -9,11 +9,11 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(4)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
   %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, i32 addrspace(1)* undef
   ret void
 }
 
@@ -21,11 +21,11 @@
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(4)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
   %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, i32 addrspace(1)* undef
   ret void
 }
 
@@ -36,11 +36,11 @@
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(4)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
   %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, i32 addrspace(1)* undef
   ret void
 }
 
@@ -51,11 +51,11 @@
 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; XGCN: s_endpgm
-define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(4)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32
   %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, i32 addrspace(1)* undef
   ret void
 }
 
@@ -65,11 +65,11 @@
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(4)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
   %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, i32 addrspace(1)* undef
   ret void
 }
 
@@ -79,11 +79,11 @@
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(4)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
   %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, i32 addrspace(1)* undef
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index f673d91..894e9c7 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -76,7 +76,7 @@
 ; SI-NOT: addc
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i32, i64 %a, i32, i64 %b) {
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 21bb651..78d99d8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
 ; TRAP-HANDLER-ENABLE:  NumSgprs: 60
-; TRAP-HANDLER-DISABLE: NumSgprs: 76
+; TRAP-HANDLER-DISABLE: NumSgprs: 78
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     i32 addrspace(1)* %out0, i32 %in0,
     i32 addrspace(1)* %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index b5b3e78..739e6c1 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -217,7 +217,7 @@
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i32, i64 %a) {
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -235,7 +235,7 @@
 ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
   %shl.a = shl i64 %a, 1
   %shl.b = shl i64 %b, 1
   %and0 = and i64 %shl.a, 62
@@ -381,7 +381,7 @@
 ; SI-NOT: and
 ; SI: s_add_u32
 ; SI-NEXT: s_addc_u32
-define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i32, i64 %b) {
   %shl = shl i64 %a, 1
   %and = and i64 %shl, 64
   %add = add i64 %and, %b
diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index 14327e3..40e3015 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -12,25 +12,17 @@
 ; CIVI: s_load_dword [[LHS:s[0-9]+]]
 ; CIVI: s_load_dword [[RHS:s[0-9]+]]
 
-; VI: s_ashr_i32
-; VI: s_ashr_i32
-; VI: s_sext_i32_i16
-; VI: s_sext_i32_i16
-; VI: s_ashr_i32
-; VI: s_ashr_i32
-; VI: s_lshl_b32
-; VI: s_and_b32
-; VI: s_or_b32
+; CIVI-DAG: s_ashr_i32
+; CIVI-DAG: s_ashr_i32
+; CIVI-DAG: s_sext_i32_i16
+; CIVI-DAG: s_sext_i32_i16
+; CIVI-DAG: s_ashr_i32
+; CIVI-DAG: s_ashr_i32
+; CIVI-DAG: s_lshl_b32
+; CIVI: s_and_b32
+; CIVI: s_or_b32
 
-; CI: s_ashr_i32
-; CI: s_and_b32
-; CI: s_lshr_b32
-; CI: s_sext_i32_i16
-; CI: s_ashr_i32
-; CI: s_ashr_i32
-; CI: s_lshl_b32
-; CI: s_and_b32
-define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x i16> %lhs, i32, <2 x i16> %rhs) #0 {
   %result = ashr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 836ba76..943ff9e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -7,16 +7,16 @@
 ; GFX9-NOT: m0
 ; SICIVI-DAG: s_mov_b32 m0
 
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
+; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -70,15 +70,15 @@
 
 
 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
+; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll
new file mode 100644
index 0000000..c05f296
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
+
+; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
+
+; ALL-LABEL: {{^}}max_9_sgprs:
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 9
+define amdgpu_kernel void @max_9_sgprs() #0 {
+  %one = load volatile i32, i32 addrspace(4)* undef
+  %two = load volatile i32, i32 addrspace(4)* undef
+  %three = load volatile i32, i32 addrspace(4)* undef
+  %four = load volatile i32, i32 addrspace(4)* undef
+  %five = load volatile i32, i32 addrspace(4)* undef
+  %six = load volatile i32, i32 addrspace(4)* undef
+  %seven = load volatile i32, i32 addrspace(4)* undef
+  %eight = load volatile i32, i32 addrspace(4)* undef
+  %nine = load volatile i32, i32 addrspace(4)* undef
+  %ten = load volatile i32, i32 addrspace(4)* undef
+  call void asm sideeffect "", "s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight)
+  store volatile i32 %one, i32 addrspace(1)* undef
+  store volatile i32 %two, i32 addrspace(1)* undef
+  store volatile i32 %three, i32 addrspace(1)* undef
+  store volatile i32 %four, i32 addrspace(1)* undef
+  store volatile i32 %five, i32 addrspace(1)* undef
+  store volatile i32 %six, i32 addrspace(1)* undef
+  store volatile i32 %seven, i32 addrspace(1)* undef
+  store volatile i32 %eight, i32 addrspace(1)* undef
+  store volatile i32 %nine, i32 addrspace(1)* undef
+  store volatile i32 %ten, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 822ea80..b98b48d 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -1,25 +1,37 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
 
 ; If spilling to smem, additional registers are used for the resource
 ; descriptor.
 
+; FIXME: Vectorization can increase required SGPR count beyond limit.
+; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
+
 ; ALL-LABEL: {{^}}max_9_sgprs:
 
 ; ALL: SGPRBlocks: 1
 ; ALL: NumSGPRsForWavesPerEU: 9
-define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
-
-                          i32 addrspace(1)* %out2,
-                          i32 addrspace(1)* %out3,
-                          i32 addrspace(1)* %out4,
-                          i32 addrspace(1)* %out5,
-                          i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
-  store i32 %one, i32 addrspace(1)* %out1
-  store i32 %two, i32 addrspace(1)* %out2
-  store i32 %three, i32 addrspace(1)* %out3
-  store i32 %four, i32 addrspace(1)* %out4
-  store i32 %five, i32 addrspace(1)* %out5
+define amdgpu_kernel void @max_9_sgprs() #0 {
+  %one = load volatile i32, i32 addrspace(4)* undef
+  %two = load volatile i32, i32 addrspace(4)* undef
+  %three = load volatile i32, i32 addrspace(4)* undef
+  %four = load volatile i32, i32 addrspace(4)* undef
+  %five = load volatile i32, i32 addrspace(4)* undef
+  %six = load volatile i32, i32 addrspace(4)* undef
+  %seven = load volatile i32, i32 addrspace(4)* undef
+  %eight = load volatile i32, i32 addrspace(4)* undef
+  %nine = load volatile i32, i32 addrspace(4)* undef
+  %ten = load volatile i32, i32 addrspace(4)* undef
+  call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine)
+  store volatile i32 %one, i32 addrspace(1)* undef
+  store volatile i32 %two, i32 addrspace(1)* undef
+  store volatile i32 %three, i32 addrspace(1)* undef
+  store volatile i32 %four, i32 addrspace(1)* undef
+  store volatile i32 %five, i32 addrspace(1)* undef
+  store volatile i32 %six, i32 addrspace(1)* undef
+  store volatile i32 %seven, i32 addrspace(1)* undef
+  store volatile i32 %eight, i32 addrspace(1)* undef
+  store volatile i32 %nine, i32 addrspace(1)* undef
+  store volatile i32 %ten, i32 addrspace(1)* undef
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 0607eca..057742f 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -29,7 +29,8 @@
 
 ; GCN-LABEL: {{^}}test_brcc_i1:
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
-; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]]
+; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}}
+; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]]
 ; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
 ; GCN: s_cmp_eq_u32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index a33ab4e..69237cf 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}v_ubfe_sub_i32:
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
@@ -48,10 +48,9 @@
 }
 
 ; GCN-LABEL: {{^}}s_ubfe_sub_i32:
-; GCN: s_load_dword [[SRC:s[0-9]+]]
-; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
-; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
+; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
+; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -63,11 +62,10 @@
 }
 
 ; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
-; GCN: s_load_dword [[SRC:s[0-9]+]]
-; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
-; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
-; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
+; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
+; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
+; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
 define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -126,10 +124,9 @@
 }
 
 ; GCN-LABEL: {{^}}s_sbfe_sub_i32:
-; GCN: s_load_dword [[SRC:s[0-9]+]]
-; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
-; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
+; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
+; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -141,11 +138,10 @@
 }
 
 ; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
-; GCN: s_load_dword [[SRC:s[0-9]+]]
-; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
-; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
-; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
+; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
+; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
+; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
 define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 1e74f06..50e72cd 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck -check-prefixes=R600,FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
 
 ; BFI_INT Definition pattern from ISA docs
 ; (y & x) | (z & ~x)
@@ -119,10 +119,10 @@
 ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0:
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN: v_bfi_b32
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN: v_bfi_b32
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN-DAG: v_bfi_b32
+; GCN-DAG: v_bfi_b32
 define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
   %and0 = and i64 %a, %b
   %not.a = xor i64 %a, -1
@@ -136,10 +136,10 @@
 ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1:
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN-DAG: v_bfi_b32
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN: v_bfi_b32
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN-DAG: v_bfi_b32
+; GCN-DAG: v_bfi_b32
 define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
   %xor.0 = xor i64 %a, %mask
   %and = and i64 %xor.0, %b
@@ -155,8 +155,8 @@
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
 ; GCN-DAG: v_bfi_b32
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
-; GCN: v_bfi_b32
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN-DAG: v_bfi_b32
 define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
   %xor.0 = xor i64 %a, %mask
   %and = and i64 %xor.0, %b
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index f145869..b24f4f1 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -11,22 +11,32 @@
 ; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
 ; GCN: s_cbranch_vccnz
 
-; GCN: one{{$}}
-; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
-; GCN: buffer_store_short
-; GCN: s_endpgm
+; SI: one{{$}}
+; SI: v_cvt_f16_f32_e32 v[[CVT:[0-9]+]], v[[A_F32]]
 
-; GCN: two{{$}}
-; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
-; GCN: buffer_store_short v[[B_F16]]
-; GCN: s_endpgm
+; SI: two{{$}}
+; SI:  v_cvt_f16_f32_e32 v[[CVT]], v[[B_F32]]
+
+; SI: one{{$}}
+; SI: buffer_store_short v[[CVT]]
+; SI: s_endpgm
+
+
+
+; VI: one{{$}}
+; VI: buffer_store_short v[[A_F16]]
+; VI: s_endpgm
+
+; VI: two{{$}}
+; VI: buffer_store_short v[[B_F16]]
+; VI: s_endpgm
 define amdgpu_kernel void @br_cc_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %fcmp = fcmp olt half %a.val, %b.val
   br i1 %fcmp, label %one, label %two
 
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index b0ee7f3..d4284a3 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -490,7 +490,7 @@
 
 ; GCN-LABEL: {{^}}long_branch_hang:
 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
+; GCN: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
 ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
 
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 8d74011..2291527 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -9,7 +9,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 6
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -23,7 +23,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 6
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 40548ba..612f31f 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -123,7 +123,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_i64:
-; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
 ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
@@ -133,7 +133,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 9796bc7..70a81e1 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -98,7 +98,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
-; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
 ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
@@ -108,7 +108,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll
index 2d16d40..def3533 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll
@@ -305,14 +305,14 @@
 ; but there are some cases when the should be allowed.
 
 ; FUNC-LABEL: {{^}}ctpop_i32_in_br:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
-; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x16
+; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x58
 ; GCN: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 ; EG: BCNT_INT
-define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 8236ac0..3a7e905 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -308,7 +308,9 @@
 ; FUNC-LABEL: {{^}}ctpop_i16_in_br:
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
 ; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
-; GCN: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
+
+; GCN: s_and_b32 [[CTPOP_ARG:s[0-9]+]], [[VAL]], 0xffff
+; GCN: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[CTPOP_ARG]]
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_short [[RESULT]],
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 298280f..00011cd 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -13,13 +13,13 @@
 declare i128 @llvm.ctpop.i128(i128) nounwind readnone
 
 ; FUNC-LABEL: {{^}}s_ctpop_i64:
-; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index c84e53d..2bd69b8 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -58,11 +58,8 @@
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
-; SI: s_load_dword s
-; SI: s_load_dword s
-
-; GFX89: s_load_dwordx2
-; GFX89: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
 
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
@@ -78,8 +75,8 @@
 ; FIXME: Why sometimes vector shift?
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
 ; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
+; SI: s_load_dwordx2 s
+; SI: s_load_dwordx2 s
 
 ; GFX89: s_load_dwordx2 s
 ; GFX89: s_load_dwordx2 s
@@ -87,9 +84,7 @@
 
 
 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
-; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
-
-; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 
 ; GCN: {{buffer|global}}_store_short
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index d6363b1..d533750 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -27,7 +27,7 @@
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
   %elt = extractelement <2 x i16> %vec, i32 %idx
   store i16 %elt, i16 addrspace(1)* %out, align 2
@@ -58,12 +58,8 @@
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
-; SI: s_load_dword s
-; SI: s_load_dwordx2 s
-; SI: s_load_dword s
-
-; GFX89: s_load_dwordx2
-; GFX89: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
 
 ; GCN-NOT: {{buffer|flat|global}}_load
 
@@ -79,8 +75,7 @@
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
-; SI: s_load_dword s
-; SI: s_load_dword s
+; SI: s_load_dwordx2
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 
@@ -100,12 +95,12 @@
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
 ; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
+; SI: s_load_dwordx2 s
+; SI: s_load_dwordx2 s
 
-; GFX89-DAG: s_load_dwordx2
-; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c
-; GFX89-DAG: s_load_dword s
+; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x24
+; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x4c
+; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54
 
 ; GCN-NOT: {{buffer|flat|global}}
 
@@ -113,17 +108,13 @@
 ; SICI: buffer_store_short
 ; SICI: buffer_store_short
 
-; SICI: buffer_load_ushort
-; SICI: buffer_store_short
-
 ; GFX9-NOT: s_pack_ll_b32_b16
 ; GFX9-NOT: s_pack_lh_b32_b16
 
 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
-; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}}
-
+; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
 ; GCN: {{buffer|global}}_store_short
-define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p0, i16 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 7f961f1..bf403bf 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
 ; GCN: s_load_dword [[LOAD:s[0-9]+]]
@@ -14,7 +14,8 @@
 ; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
 ; GCN: s_load_dword s
 ; GCN-NOT: {{flat|buffer|global}}
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}}
 ; GCN-NOT: {{flat|buffer|global}}
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
@@ -22,8 +23,8 @@
   %p0 = extractelement <2 x i8> %foo, i32 0
   %p1 = extractelement <2 x i8> %foo, i32 1
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p1, i8 addrspace(1)* %out
-  store i8 %p0, i8 addrspace(1)* %out1
+  store volatile i8 %p1, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out1
   ret void
 }
 
@@ -38,8 +39,8 @@
   %p0 = extractelement <3 x i8> %foo, i32 0
   %p1 = extractelement <3 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p1, i8 addrspace(1)* %out
-  store i8 %p0, i8 addrspace(1)* %out1
+  store volatile i8 %p1, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out1
   ret void
 }
 
@@ -54,24 +55,24 @@
   %p0 = extractelement <4 x i8> %foo, i32 0
   %p1 = extractelement <4 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p1, i8 addrspace(1)* %out
-  store i8 %p0, i8 addrspace(1)* %out1
+  store volatile i8 %p1, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
+; GCN-NOT: {{s|flat|buffer|global}}_load
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
-; GCN-NOT: {{flat|buffer|global}}
+; GCN-NOT: {{s|flat|buffer|global}}_load
 ; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
-; GCN-NOT: {{flat|buffer|global}}
+; GCN-NOT: {{s|flat|buffer|global}}_load
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
   %p0 = extractelement <8 x i8> %foo, i32 0
   %p1 = extractelement <8 x i8> %foo, i32 2
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p1, i8 addrspace(1)* %out
-  store i8 %p0, i8 addrspace(1)* %out1
+  store volatile i8 %p1, i8 addrspace(1)* null
+  store volatile i8 %p0, i8 addrspace(1)* null
   ret void
 }
 
@@ -87,25 +88,25 @@
   %p0 = extractelement <16 x i8> %foo, i32 0
   %p1 = extractelement <16 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p1, i8 addrspace(1)* %out
-  store i8 %p0, i8 addrspace(1)* %out1
+  store volatile i8 %p1, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
-; GCN: s_load_dword [[LOAD0:s[0-9]+]]
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
-; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
+; GCN-NOT: {{s|flat|buffer|global}}_load
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN-NOT: {{s|flat|buffer|global}}_load
+; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
+; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
 ; GCN: buffer_store_byte [[V_ELT2]]
 ; GCN: buffer_store_byte [[V_LOAD0]]
-define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
   %p0 = extractelement <32 x i8> %foo, i32 0
   %p1 = extractelement <32 x i8> %foo, i32 2
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p1, i8 addrspace(1)* %out
-  store i8 %p0, i8 addrspace(1)* %out1
+  store volatile i8 %p1, i8 addrspace(1)* null
+  store volatile i8 %p0, i8 addrspace(1)* null
   ret void
 }
 
@@ -121,8 +122,8 @@
   %p0 = extractelement <64 x i8> %foo, i32 0
   %p1 = extractelement <64 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p1, i8 addrspace(1)* %out
-  store i8 %p0, i8 addrspace(1)* %out1
+  store volatile i8 %p1, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out1
   ret void
 }
 
@@ -132,42 +133,36 @@
 ; isTypeDesirableForOp in SimplifyDemandedBits
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
-; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
-; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
 ; VI-NOT: {{flat|buffer|global}}
-; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
-; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
-; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
-; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]]
+; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]]
-; VI: buffer_store_byte [[EXTRACT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
+; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
+; VI: buffer_store_byte [[ELT]]
+define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
   %elt = extractelement <2 x i8> %foo, i32 %idx
-  store i8 %elt, i8 addrspace(1)* %out
+  store volatile i8 %elt, i8 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
-; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
-; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
 ; VI-NOT: {{flat|buffer|global}}
-; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8
-; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]]
-; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
-; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]]
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
-; VI: buffer_store_byte [[EXTRACT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
+; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
+; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
+; VI: buffer_store_byte [[V_ELT]]
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p0, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
-; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
+; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30
 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
@@ -175,16 +170,16 @@
 
 ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
 ; VI: buffer_store_byte [[V_EXTRACT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
   %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
   %p0 = extractelement <4 x i8> %vec, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p0, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
-; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
+; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10
 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
@@ -195,7 +190,7 @@
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
   %p0 = extractelement <8 x i8> %vec, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store i8 %p0, i8 addrspace(1)* %out
+  store volatile i8 %p0, i8 addrspace(1)* %out
   ret void
 }
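
Note on the [8 x i32] dummy arguments appearing in the updated kernel signatures: each one pads the kernarg segment by 32 bytes, keeping every real argument in its own load so each check stays focused on a single scalar load per value. A minimal sketch of the resulting layout, assuming the HSA ABI (names are illustrative, not from the patch):

  ; %out -> kernarg offset 0x0 (8-byte pointer)
  ; [8 x i32] -> 0x8..0x27, never read
  ; %foo -> kernarg offset 0x28
  ; [8 x i32] -> 0x2c..0x4b, never read
  ; %idx -> kernarg offset 0x4c
  define amdgpu_kernel void @pad_sketch(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) {
    ret void
  }

This matches the VI checks above, which now expect the vector at 0x28 and the index at 0x4c rather than at adjacent offsets.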
 
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 721b4f1..6b8eb91 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -39,9 +39,9 @@
 }
 
 ; GCN-LABEL: {{^}}s_fabs_v4f16:
-; CI: s_load_dword s[[LO:[0-9]+]]
-; CI: s_load_dword s[[HI:[0-9]+]]
+; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2
 ; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
+
 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
 ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
 ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
@@ -54,7 +54,7 @@
 
 ; GCN-LABEL: {{^}}fabs_fold_f16:
 ; GCN: s_load_dword [[IN0:s[0-9]+]]
-; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
+; GCN-DAG: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
 
 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
 ; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
@@ -62,6 +62,7 @@
 ; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
 
+; GFX89-NOT: and
 ; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
 ; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
index 718176b..d20e74a 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -53,11 +53,11 @@
 }
 
 ; SI-LABEL: {{^}}fabs_fold_f64:
-; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
 ; SI: s_endpgm
-define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
   %fabs = call double @llvm.fabs.f64(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
@@ -65,11 +65,11 @@
 }
 
 ; SI-LABEL: {{^}}fabs_fn_fold_f64:
-; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
 ; SI: s_endpgm
-define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
   %fabs = call double @fabs(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
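
The updated s_load offsets are the old ones shifted by the 32-byte pad, with the wrinkle that SI SMRD instructions encode the immediate offset in dwords while VI encodes it in bytes. A worked check for the fabs cases above:

  SI (dword units): 0xb  + 32/4 = 0xb  + 0x8  = 0x13
  VI (byte units):  0x2c + 32   = 0x2c + 0x20 = 0x4c

Both still address the same kernarg byte before the pad (0xb dwords = 0x2c bytes = 44).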
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 3c6bdfa..ba72969 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -70,10 +70,11 @@
 }
 
 ; GCN-LABEL: {{^}}fabs_fn_fold:
-; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
+; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
 define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @fabs(float %in0)
   %fmul = fmul float %fabs, %in1
@@ -82,10 +83,11 @@
 }
 
 ; FUNC-LABEL: {{^}}fabs_fold:
-; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
+; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
 define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @llvm.fabs.f32(float %in0)
   %fmul = fmul float %fabs, %in1
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 447c253..30fdb1d 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -16,8 +16,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fadd half %a.val, %b.val
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -65,10 +65,10 @@
 ; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
 ; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@@ -102,13 +102,13 @@
 
 ; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
 ; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
@@ -133,13 +133,13 @@
 
 ; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
 ; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index aef898b..a9211f9 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -16,8 +16,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp olt half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -42,8 +42,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %a.abs = call half @llvm.fabs.f16(half %a.val)
   %b.abs = call half @llvm.fabs.f16(half %b.val)
   %r.val = fcmp olt half %a.abs, %b.abs
@@ -67,8 +67,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp oeq half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -90,8 +90,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp ole half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -113,8 +113,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp ogt half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -136,8 +136,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp one half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -159,8 +159,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp oge half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -182,8 +182,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp ord half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -205,8 +205,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp uno half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -228,8 +228,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp ult half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -251,8 +251,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp ueq half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -274,8 +274,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp ule half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -297,8 +297,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp ugt half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -320,8 +320,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp une half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
@@ -343,8 +343,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fcmp uge half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
   store i32 %r.val.sext, i32 addrspace(1)* %r
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 15d4d2a..6f3b3eb 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -30,8 +30,8 @@
   half addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
 entry:
-  %mag = load half, half addrspace(1)* %arg_mag
-  %sign = load half, half addrspace(1)* %arg_sign
+  %mag = load volatile half, half addrspace(1)* %arg_mag
+  %sign = load volatile half, half addrspace(1)* %arg_sign
   %out = call half @llvm.copysign.f16(half %mag, half %sign)
   store half %out, half addrspace(1)* %arg_out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index e5893e5..196da93 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -8,12 +8,11 @@
 
 ; Try to identify arg based on higher address.
 ; FUNC-LABEL: {{^}}test_copysign_f32:
-; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
-; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
-; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c
-; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30
-; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
-; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
+; SI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0xb
+; VI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0x2c
+
+; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]]
+; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]]
 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
 ; GCN: buffer_store_dword [[RESULT]],
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 161b7ad..b681e4a 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -6,10 +6,10 @@
 declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}test_copysign_f64:
-; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x1d
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x74
 ; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
@@ -17,15 +17,15 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_f64_f32:
-; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
@@ -33,7 +33,7 @@
 ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind {
   %c = fpext float %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %c)
   store double %result, double addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index 68e25ee..3cb6090 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -64,9 +64,9 @@
 ; SI: v_fma_f32
 ; SI: v_fma_f32
 ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
 ; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}}
-; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
-; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
 ; EG-DAG: FMA {{\*? *}}[[RES]].X
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
index 0494295..199934e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -25,14 +25,12 @@
 }
 
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 
-; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
-; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
 
-; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
+; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
 
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 4002712..4d08651 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -1,6 +1,6 @@
-; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
+; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
 
 
 ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
@@ -44,7 +44,7 @@
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %mad = fadd fast float %mul2, %y
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index e1d33b6..c75c500 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -17,8 +17,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fmul half %a.val, %b.val
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -36,7 +36,7 @@
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
-  %b.val = load half, half addrspace(1)* %b
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fmul half 3.0, %b.val
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -55,24 +55,24 @@
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
+  %a.val = load volatile half, half addrspace(1)* %a
   %r.val = fmul half %a.val, 4.0
   store half %r.val, half addrspace(1)* %r
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v2f16:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
@@ -82,6 +82,8 @@
 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
 
+; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
@@ -100,13 +102,13 @@
 
 ; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 
 
 ; VI-DAG:  v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
@@ -133,13 +135,13 @@
 
 ; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 
 ; VI-DAG:  v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
 ; VI-DAG:  v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -164,13 +166,15 @@
 }
 
 ; GCN-LABEL: {{^}}fmul_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
+; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
+; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
 
 ; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
 ; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}}
 
+; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
+; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
 ; VI: v_mul_f16_sdwa
 ; VI: v_mul_f16_e32
 ; VI: v_mul_f16_sdwa
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d5c7c8a..f43c2f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -109,8 +109,9 @@
 }
 
 ; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
-; CI: s_load_dword s
-; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
+; CI: s_load_dword [[IN:s[0-9]+]]
+; CI: s_or_b32 [[FNEG_FABS:s[0-9]+]], [[IN]], 0x80008000
+; CI: s_lshr_b32
 ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index bc0e599..45fd0f7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -55,14 +55,13 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_f64:
-; GCN-DAG: s_load_dwordx2
 ; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
-; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
-; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
+; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c
 ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
 ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
-define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) {
   %fabs = call double @llvm.fabs.f64(double %in)
   %fsub = fsub double -0.000000e+00, %fabs
   store double %fsub, double addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 3f20ca7..c72dab0 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
 ; SI-NOT: and
-; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
+; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}|
 define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -15,7 +15,7 @@
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
 ; SI-NOT: and
-; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
+; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}|
 ; SI-NOT: and
 define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
index 9b4b4d6..508eb0b 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -48,11 +48,11 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fold_f64:
-; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN-NOT: xor
 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, [8 x i32], double %in) {
   %fsub = fsub double -0.0, %in
   %fmul = fmul double %fsub, %in
   store double %fmul, double addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll b/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
index e2b1950..8863b94 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
@@ -13,18 +13,17 @@
 define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
 entry:
 ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CHECK: s_load_dword s2, s[0:1], 0xb
-; CHECK: s_load_dword s0, s[0:1], 0xc
+; CHECK: s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; CHECK: s_mov_b32 s10, -1
-; CHECK: s_waitcnt lgkmcnt(0)
-; CHECK: s_lshl_b32 s1, s2, 2
 ; CHECK: v_mov_b32_e32 v0, 4
-; CHECK: s_mov_b32 s11, 0xe8f000
-; CHECK: v_add_i32_e32 v1, vcc, s1, v0
-; CHECK: v_mov_b32_e32 v2, 7
+; CHECK: s_waitcnt lgkmcnt(0)
 ; CHECK: s_lshl_b32 s0, s0, 2
+; CHECK: v_add_i32_e32 v1, vcc, s0, v0
+; CHECK: s_lshl_b32 s0, s1, 2
+; CHECK: s_mov_b32 s11, 0xe8f000
+; CHECK: v_mov_b32_e32 v2, 7
 ; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
 ; CHECK: v_add_i32_e32 v0, vcc, s0, v0
 ; CHECK: s_mov_b32 s7, 0xf000
@@ -35,14 +34,14 @@
 ; CHECK: s_endpgm
 
   %x = alloca [100 x i32], align 4, addrspace(5)
-  %0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
+  %alloca.bc = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
+  call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
   %arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i
   store i32 7, i32 addrspace(5)* %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j
-  %1 = load i32, i32 addrspace(5)* %arrayidx2, align 4
-  store i32 %1, i32 addrspace(1)* %a, align 4
-  call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
+  %ld = load i32, i32 addrspace(5)* %arrayidx2, align 4
+  store i32 %ld, i32 addrspace(1)* %a, align 4
+  call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index fc055a5..066e49d 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -17,8 +17,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fsub half %a.val, %b.val
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -36,7 +36,7 @@
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
-  %b.val = load half, half addrspace(1)* %b
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fsub half 1.0, %b.val
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -54,33 +54,41 @@
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
+  %a.val = load volatile half, half addrspace(1)* %a
   %r.val = fsub half %a.val, 2.0
   store half %r.val, half addrspace(1)* %r
   ret void
 }
 
 ; GCN-LABEL: {{^}}fsub_v2f16:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI:  v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
+; SI-DAG:  v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG:  v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
+; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+
 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
+
+; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+
 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
@@ -101,13 +109,13 @@
 ; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
@@ -135,13 +143,13 @@
 ; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
index 2089089..59c2f1b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
@@ -5,7 +5,7 @@
 ; CHECK: s_load_dwordx4
 ; CHECK-NOT: flat_load_dword
 
-define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)*  %arg1) {
+define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) {
 bb:
   %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
   %tmp3 = fadd float %tmp2, 0.000000e+00
@@ -28,7 +28,7 @@
 ; CHECK: flat_load_dword
 ; CHECK-NOT: s_load_dwordx4
 
-define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 {
+define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) #0 {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
@@ -59,7 +59,7 @@
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
 
-define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
   store i32 0, i32 addrspace(1)* %out0
   %val = load i32, i32 addrspace(1)* %in
   store i32 %val, i32 addrspace(1)* %out1
@@ -71,7 +71,7 @@
 ; CHECK: flat_store_dword
 ; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
-define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
   store i32 0, i32 addrspace(1)* %out0
   %val = load i32, i32 addrspace(1)* %in
   store i32 %val, i32 addrspace(1)* %out1
@@ -80,19 +80,20 @@
 
 ; uniform load from global array
 ; CHECK-LABEL:  @global_array
-; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]]
+; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]]
+; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0
+; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0
 ; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
-
 @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
 
 define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
 entry:
-  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
-  %1 = load i32, i32 addrspace(1)* %0, align 4
-  store i32 %1, i32 addrspace(1)* %out, align 4
+  %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
+  %load1 = load i32, i32 addrspace(1)* %load0, align 4
+  store i32 %load1, i32 addrspace(1)* %out, align 4
   ret void
 }
 
@@ -105,13 +106,13 @@
 ; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
 ; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
-define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) {
+define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
   store i32 12, i32 addrspace(1) * %gep
-  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
-  %1 = load i32, i32 addrspace(1)* %0, align 4
-  store i32 %1, i32 addrspace(1)* %out, align 4
+  %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
+  %load1 = load i32, i32 addrspace(1)* %load0, align 4
+  store i32 %load1, i32 addrspace(1)* %out, align 4
   ret void
 }
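
Several of the half.ll updates below collapse what used to be two s_load_dword checks into a single s_load_dwordx2 or s_load_dwordx4, since adjacent kernarg loads now get merged. Roughly the IR the pass emits for a <4 x half> argument at byte offset 8, as a sketch with hypothetical value names:

  declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()

  define amdgpu_kernel void @lowered_sketch(<4 x half> addrspace(1)* %out, <4 x half> %arg) {
    %kernarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
    %arg.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernarg.ptr, i64 8
    %arg.cast = bitcast i8 addrspace(4)* %arg.gep to <4 x half> addrspace(4)*
    %arg.load = load <4 x half>, <4 x half> addrspace(4)* %arg.cast, align 8
    store <4 x half> %arg.load, <4 x half> addrspace(1)* %out
    ret void
  }

A single aligned 8-byte load like this selects to one s_load_dwordx2 instead of two s_load_dword.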
 
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index e2c6e61..107a912 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -22,13 +22,8 @@
 }
 
 ; GCN-LABEL: {{^}}load_v3f16_arg:
-; SI: s_load_dwordx2
-; SI: s_load_dword s
-; SI: s_load_dword s
-
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
-
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
 ; GCN-NOT: {{buffer|flat|global}}_load_
 
 
@@ -45,11 +40,7 @@
 
 ; FIXME: Why not one load?
 ; GCN-LABEL: {{^}}load_v4f16_arg:
-; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2
-; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3
-
-; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
-
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}}
 ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]]
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
@@ -86,14 +77,8 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
-; SI: s_load_dwordx2 s
-; SI: s_load_dword s
-; SI: s_load_dword s
-
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-
+; GCN: s_load_dwordx2 s
+; GCN: s_load_dwordx2 s
 ; GCN-NOT: _load
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
@@ -116,14 +101,7 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-
-; VI: s_load_dwordx2 s
-; VI: s_load_dwordx2 s
-; VI: s_load_dwordx2 s
+; GCN: s_load_dwordx4
 
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
@@ -154,7 +132,7 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
-; GCN: s_load_dword
+; GCN-DAG: s_load_dword s
 ; GCN: s_lshr_b32
 
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -169,14 +147,8 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
-; SI: s_load_dword
-; SI: s_load_dword
-
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
-
-; GCN: s_lshr_b32
-
+; GCN: s_load_dwordx2 s
+; GCN: s_load_dwordx2 s
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -191,19 +163,17 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
-; SI: s_load_dword s
-; SI: s_load_dword s
+; GCN: s_load_dwordx2 s
+; GCN: s_load_dwordx2 s
 
-; VI: s_load_dwordx2 s
-
-; GCN-DAG: v_cvt_f32_f16_e32
-; GCN-DAG: v_cvt_f32_f16_e32
-; GCN-DAG: v_cvt_f32_f16_e32
-; GCN-DAG: v_cvt_f32_f16_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
 ; GCN: s_endpgm
 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
   %ext = fpext <4 x half> %arg to <4 x double>
@@ -212,14 +182,8 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
-; SI: s_load_dword s
-; SI-NEXT: s_load_dword s
-; SI-NEXT: s_load_dword s
-; SI-NEXT: s_load_dword s
-; SI-NOT: _load_
-
-; VI: s_load_dwordx2 s
-; VI: s_load_dwordx2 s
+; GCN: s_load_dwordx2 s
+; GCN: s_load_dwordx4 s
 
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -299,12 +263,13 @@
 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
 ; GCN: flat_load_dword [[LOAD:v[0-9]+]],
 
-; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
-; SI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+
 ; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
 
-; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
 ; GCN: s_endpgm
@@ -348,6 +313,7 @@
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
 
 ; GCN: flat_store_dwordx4
 
@@ -361,7 +327,6 @@
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
 
 ; VI: v_cvt_f32_f16_e32
 ; VI: v_cvt_f32_f16_sdwa
@@ -430,19 +395,19 @@
 ; XVI-NOT: v_cvt_f32_f16
 
 ; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
-; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
-; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
-; SI-DAG:      v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; SI-DAG:  v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
-; VI-DAG:  v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
+; GCN-NOT: v_cvt_f32_f16
 
-; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
-; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
-; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
 ; GCN-NOT: v_cvt_f64_f32_e32
 
-; GCN-DAG: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[XLO]]:[[YHI]]{{\]}}
-; GCN-DAG: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[Z]]
+; GCN-DAG: flat_store_dwordx4
+; GCN-DAG: flat_store_dwordx2
 ; GCN: s_endpgm
 define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
   %val = load <3 x half>, <3 x half> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 379dd33..3dc3f32 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -8,7 +8,7 @@
 ; CHECK:  Version: [ 1, 0 ]
 ; CHECK:  Kernels:
 
-; CHECK: - Name:       test
+; CHECK-LABEL: - Name:       test
 ; CHECK:   SymbolName: 'test@kd'
 ; CHECK:   CodeProps:
 ; CHECK:     KernargSegmentSize:      24
@@ -16,8 +16,8 @@
 ; CHECK:     PrivateSegmentFixedSize: 0
 ; CHECK:     KernargSegmentAlign:     8
 ; CHECK:     WavefrontSize:           64
-; CHECK:     NumSGPRs:                6
-; CHECK:     NumVGPRs:                3
+; CHECK:     NumSGPRs:                8
+; CHECK:     NumVGPRs:                6
 ; CHECK:     MaxFlatWorkGroupSize:    256
 define amdgpu_kernel void @test(
     half addrspace(1)* %r,
@@ -31,18 +31,24 @@
   ret void
 }
 
-; CHECK: - Name:       num_spilled_sgprs
+; CHECK-LABEL: - Name:       num_spilled_sgprs
 ; CHECK:   SymbolName: 'num_spilled_sgprs@kd'
 ; CHECK:   CodeProps:
-; CHECK:     NumSpilledSGPRs: 41
+; GFX700:     NumSpilledSGPRs: 40
+; GFX803:     NumSpilledSGPRs: 24
+; GFX900:     NumSpilledSGPRs: 24
 define amdgpu_kernel void @num_spilled_sgprs(
-    i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %out2,
-    i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 addrspace(1)* %out5,
-    i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, i32 addrspace(1)* %out8,
-    i32 addrspace(1)* %out9, i32 addrspace(1)* %outa, i32 addrspace(1)* %outb,
-    i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, i32 addrspace(1)* %oute,
-    i32 addrspace(1)* %outf, i32 %in0, i32 %in1, i32 %in2, i32 %in3, i32 %in4,
-    i32 %in5, i32 %in6, i32 %in7, i32 %in8, i32 %in9, i32 %ina, i32 %inb,
+    i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
+    i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
+    i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32],
+    i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32],
+    i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32],
+    i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32],
+    i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32],
+    i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32],
+    i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32],
+    i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32],
+    i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32],
     i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
 entry:
   store i32 %in0, i32 addrspace(1)* %out0
@@ -64,7 +70,7 @@
   ret void
 }
 
-; CHECK: - Name:       num_spilled_vgprs
+; CHECK-LABEL: - Name:       num_spilled_vgprs
 ; CHECK:   SymbolName: 'num_spilled_vgprs@kd'
 ; CHECK:   CodeProps:
 ; CHECK:     NumSpilledVGPRs: 14
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index c66d834..bb9da71 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -344,114 +344,114 @@
 
 
 ; GCN-LABEL: {{^}}add_inline_imm_0.0_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_0.5_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0.5
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, -0.5
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_1.0_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 1.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, -1.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_2.0_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 2.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, -2.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_4.0_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 4.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, -4.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_inv_2pi_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
 ; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; SI-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
 ; SI: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
 
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
 ; VI: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0x3fc45f306dc9c882
   store double %y, double addrspace(1)* %out
   ret void
@@ -461,40 +461,40 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0xbfc45f306dc9c882
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_1_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0x0000000000000001
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_2_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0x0000000000000002
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_16_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0x0000000000000010
   store double %y, double addrspace(1)* %out
   ret void
@@ -504,7 +504,7 @@
 ; GCN: v_mov_b32_e32 v0, -1
 ; GCN: v_mov_b32_e32 v1, v0
 ; GCN: buffer_store_dwordx2 v[0:1]
-define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0xffffffffffffffff
   store double %y, double addrspace(1)* %out
   ret void
@@ -514,7 +514,7 @@
 ; GCN: v_mov_b32_e32 v0, -2
 ; GCN: v_mov_b32_e32 v1, -1
 ; GCN: buffer_store_dwordx2 v[0:1]
-define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0xfffffffffffffffe
   store double %y, double addrspace(1)* %out
   ret void
@@ -524,29 +524,29 @@
 ; GCN: v_mov_b32_e32 v0, -16
 ; GCN: v_mov_b32_e32 v1, -1
 ; GCN: buffer_store_dwordx2 v[0:1]
-define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0xfffffffffffffff0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_63_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0x000000000000003F
   store double %y, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_inline_imm_64_f64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
 ; GCN: buffer_store_dwordx2 [[REG]]
-define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) {
   %y = fadd double %x, 0x0000000000000040
   store double %y, double addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index 3227633..84be1b8 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -310,9 +310,9 @@
 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
 ; GFX9: buffer_store_dword [[REG]]
 
-; VI: buffer_load_dword
+; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
+; VI-DAG: buffer_load_dword
 ; VI-NOT: and
-; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
 ; VI: v_or_b32
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 59cf494..a62ad82 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
 
 ; FIXME: Broken on evergreen
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
@@ -75,8 +75,9 @@
 
 ; GCN-LABEL: {{^}}insertelement_to_sgpr:
 ; GCN-NOT: v_readfirstlane
-define amdgpu_ps <4 x float> @insertelement_to_sgpr(<4 x i32> inreg %samp) nounwind {
-  %tmp1 = insertelement <4 x i32> %samp, i32 0, i32 0
+define <4 x float> @insertelement_to_sgpr() nounwind {
+  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
+  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
   %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
   ret <4 x float> %tmp2
 }
@@ -154,11 +155,11 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
-; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
+; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -201,23 +202,17 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
-; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
 ; VI-NOT: _load
-; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]]
-; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
 ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
-
-; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]]
-; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]]
-
-; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]]
-; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
-; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]]
+; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
+; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
+; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
+; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
 ; VI: buffer_store_short [[OR]]
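+; The [8 x i32] spacers below add 32 bytes of padding before %a and between
+; %a and %b, presumably so each scalar lands in its own dword and is fetched
+; by a separate s_load_dword (0x28 and 0x4c above).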
-define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
   store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
   ret void
@@ -227,68 +222,51 @@
 ; isTypeDesirableForOp in SimplifyDemandedBits
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
-; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
 ; VI-NOT: _load
 
-; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8
-; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]]
-; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
-; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]]
-; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}}
-
-; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}}
+; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]]
+; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
+; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
+; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
+; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
 
-; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
-; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
-; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]]
-; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]]
-; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]]
-; VI: buffer_store_short [[BFI]]
-; VI: buffer_store_byte [[HI2]]
-define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
+; VI-DAG: buffer_store_short [[BFI]]
+; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
+; VI: buffer_store_byte [[V_HI2]]
+define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
   %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
   store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
-; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
 ; VI-NOT: _load
 
-; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8
-; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
-; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}}
-
-
-; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24
-; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16
-; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]]
-; VI: v_or_b32_e32
-; VI: v_or_b32_sdwa
+; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_or_b32_sdwa
-; VI: s_lshl_b32
-; VI: v_bfi_b32
-define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
+; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
+; VI: buffer_store_dword [[BFI]]
+define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
-; VI-NOT: {{buffer|flat|global}}
-; VI: s_load_dword [[IDX:s[0-9]]]
-; VI-NOT: {{buffer|flat|global}}
-; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
-; VI-NOT: {{buffer|flat|global}}
+; VI-NOT: {{buffer|flat|global}}_load
+; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; VI-DAG: s_load_dword [[IDX:s[0-9]]], s[4:5], 0x10
+; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}}
+; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
 ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
 ; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
@@ -307,13 +285,8 @@
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
 ; GCN: s_load_dwordx2
+; GCN: s_load_dwordx4
 ; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN-NOT: _load_
-
 
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
@@ -368,7 +341,7 @@
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
-; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
+; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}
 
 ; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
 
@@ -390,7 +363,7 @@
 
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
   store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -420,19 +393,18 @@
 ; space is also 2x what should be required.
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
-; GCN: SCRATCH_RSRC_DWORD
 
 ; Stack store
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
 
 ; Write element
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
 
 ; Stack reload
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
 
 ; Store result
 ; GCN: buffer_store_dwordx4
@@ -447,19 +419,17 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
-; GCN-DAG: SCRATCH_RSRC_DWORD
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
 
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
 
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index cb2753b..b6eda9f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1,9 +1,9 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
-; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
@@ -18,17 +18,17 @@
 }
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg:
-; GCN: s_load_dword [[ELT0:s[0-9]+]]
-; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
+; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
-; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
+; CIVI-DAG: s_and_b32 [[ELT0:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
 
 ; GFX9-NOT: [[ELT0]]
 ; GFX9-NOT: [[VEC]]
-; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
+; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[VEC]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -36,29 +36,29 @@
 }
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
-; GCN: s_load_dword [[ELT0:s[0-9]+]]
-; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
+; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
-; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
+; CI-DAG: s_and_b32 [[ELT0_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
 ; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
 ; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
-; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0_MASKED]], [[ELT1]]
 ; CI-DAG: ; use [[SHR]]
 
 
 ; FIXME: Should be able to avoid masking of upper bits
-; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
+; VI-DAG: s_and_b32 [[ELT_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
 ; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
-; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]]
-; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
+; VI-DAG: s_or_b32 [[OR:s[0-9]+]], [[ELT_MASKED]], [[VEC_HIMASK]]
+; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
 
 ; VI-DAG: ; use [[SHR]]
 
 
 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
-; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[ELT1]]
 ; GFX9-DAG: ; use [[ELT1]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
   %elt1 = extractelement <2 x i16> %vec, i32 1
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
@@ -69,16 +69,17 @@
 }
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
-; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
-; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN-DAG: s_load_dword [[ELT_ARG:s[0-9]+]], s[4:5],
+; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
+; CIVI: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
-; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[ELT1]]
 
 ; GFX9-NOT: [[ELT0]]
 ; GFX9-NOT: [[VEC]]
 ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
-define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
@@ -88,7 +89,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
-; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
 ; GCN: s_load_dword [[VEC:s[0-9]+]],
 
 ; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
@@ -110,7 +111,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
-; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
 ; GCN: s_load_dword [[VEC:s[0-9]+]],
 
 ; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
@@ -161,15 +162,16 @@
 }
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg:
-; GCN: s_load_dword [[ELT1:s[0-9]+]]
-; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN-DAG: s_load_dword [[ELT1_LOAD:s[0-9]+]], s[4:5],
+; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 
+; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[ELT1_LOAD]], 16
 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
 
 ; GCN-NOT: lshr
-; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
-define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1_LOAD]]
+define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -444,12 +446,11 @@
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
-; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
@@ -473,12 +474,11 @@
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
-; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
@@ -501,17 +501,18 @@
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v4f16_0:
-; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
+; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
 ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
 
 ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
 ; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]
 
+; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
 ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
-; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]]
+; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]]
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
-define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
@@ -531,12 +532,13 @@
 ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
 ; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]]
 
-; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
-; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
+; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
+; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
 ; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 
+; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
-; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]]
+; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL_HI]], [[AND]]
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}}
 define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
@@ -553,17 +555,18 @@
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v4f16_2:
-; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
+; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
 ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
 
 ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
 ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
 
+; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
 ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
-; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
+; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
-define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
@@ -583,12 +586,13 @@
 ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
 ; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]]
 
-; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
-; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
+; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
+; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
 ; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 
+; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
-; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
+; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_HI]], [[AND]]
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
 define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
@@ -611,8 +615,9 @@
 ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
 ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
 
+; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
 ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
-; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
+; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
 
 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
 define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 2315825..da8c994 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -210,8 +210,10 @@
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
 
-; GCN: s_load_dword s
-; GCN-NOT: {{buffer|flat|global}}_load_
+; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+
+; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
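+; On HSA the kernarg segment pointer is preloaded into s[4:5] and arguments
+; start at byte offset 0, while the MESA paths address the arguments relative
+; to s[0:1] starting 36 bytes in (0x9 in dwords on SI, 0x24 in bytes on VI).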
 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
@@ -226,8 +228,7 @@
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
 
-; SI: s_load_dword s
-; SI: s_load_dword s
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 
 ; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
 ; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
@@ -236,6 +237,7 @@
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
   ret void
 }
+
 ; FUNC-LABEL: {{^}}v3i32_arg:
 ; HSA-VI: kernarg_segment_byte_size = 32
 ; HSA-VI: kernarg_segment_alignment = 4
@@ -274,8 +276,8 @@
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 
-; GCN: s_load_dword s
-; GCN-NOT: {{buffer|flat|global}}_load_
+; GCN-DAG: s_load_dwordx2 s
+; GCN-DAG: s_load_dword s
 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@@ -290,12 +292,18 @@
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
 
-; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c
-; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
+
+; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -348,23 +356,16 @@
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 
-
-; SI: s_load_dword s
-; SI: s_load_dword s
+; SI-NOT: {{buffer|flat|global}}_load
 ; SI: s_load_dwordx2 s
+; SI-NEXT: s_load_dwordx2 s
 ; SI-NOT: {{buffer|flat|global}}_load
 
-; VI: s_load_dword s
-; VI: s_load_dword s
-
-; VI: v_lshlrev_b16
-; VI: v_or_b32_e32
-; VI: v_or_b32_sdwa
-; VI: v_or_b32_sdwa
-; VI: v_lshlrev_b16
-; VI: s_lshr_b32
-; VI: v_or_b32_sdwa
-; VI: v_or_b32_sdwa
+; VI: s_load_dwordx2 s
+; VI-NEXT: s_load_dwordx2 s
+; VI-NOT: lshl
+; VI-NOT: _or
+; VI-NOT: _sdwa
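+; With the whole <8 x i8> fetched as a single s_load_dwordx2, the byte-wise
+; shift/or/sdwa repacking sequence should no longer appear; the -NOT lines
+; above guard against it coming back.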
 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@@ -383,19 +384,14 @@
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
+; SI-NEXT: s_load_dwordx2
 ; SI-NOT: {{buffer|flat|global}}_load
 
 
-; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
-; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c
+; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
 
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
@@ -413,6 +409,7 @@
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+
 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
@@ -462,33 +459,16 @@
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4 s
+; SI-NEXT: s_load_dwordx2 s
 ; SI-NOT: {{buffer|flat|global}}_load
 
 
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-
-; VI: s_lshr_b32
-; VI: v_lshlrev_b16
-; VI: s_lshr_b32
-; VI: s_lshr_b32
-; VI: v_or_b32_sdwa
-; VI: v_or_b32_sdwa
-; VI: v_lshlrev_b16
-; VI: v_lshlrev_b16
-; VI: v_or_b32_sdwa
-; VI: v_or_b32_sdwa
-; VI: v_lshlrev_b16
-; VI: v_lshlrev_b16
-; VI: v_or_b32_sdwa
-; VI: v_or_b32_sdwa
+; VI: s_load_dwordx4 s
+; VI-NOT: shr
+; VI-NOT: shl
+; VI-NOT: _sdwa
+; VI-NOT: _or_
 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@@ -516,27 +496,14 @@
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-; SI: s_load_dword s
-
+; SI: s_load_dwordx8 s
+; SI-NEXT: s_load_dwordx2 s
 ; SI-NOT: {{buffer|flat|global}}_load
 
 
-; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
-; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c
-; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54
-; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c
+; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
 
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38
+; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
@@ -600,22 +567,21 @@
 }
 
 ; FUNC-LABEL: {{^}}kernel_arg_i64:
-; MESA-GCN: s_load_dwordx2
-; MESA-GCN: s_load_dwordx2
+; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
+; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
+
 ; MESA-GCN: buffer_store_dwordx2
-; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f64_kernel_arg:
-; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
-; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
-; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
-; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
+; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
+; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
 ; MESA-GCN: buffer_store_dwordx2
-; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
+
+; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
   store double %in, double addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
index 8e0cf31..5ece33f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
@@ -6,7 +6,7 @@
 ; GCN: s_load_dword s[[LO:[0-9]+]]
 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
 ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) {
+define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
 main_body:
   call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index ad5d3f5..af0e4e9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -4,8 +4,8 @@
 declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
 
 ; GCN-LABEL: {{^}}class_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
+; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[B_I32:[0-9]+]]
 ; VI:  v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]]
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
@@ -33,7 +33,9 @@
 ; GCN: s_endpgm
 define amdgpu_kernel void @class_f16_fabs(
   i32 addrspace(1)* %r,
+  [8 x i32],
   half %a.val,
+  [8 x i32],
   i32 %b.val) {
 entry:
   %a.val.fabs = call half @llvm.fabs.f16(half %a.val)
@@ -53,7 +55,9 @@
 ; GCN: s_endpgm
 define amdgpu_kernel void @class_f16_fneg(
   i32 addrspace(1)* %r,
+  [8 x i32],
   half %a.val,
+  [8 x i32],
   i32 %b.val) {
 entry:
   %a.val.fneg = fsub half -0.0, %a.val
@@ -73,7 +77,9 @@
 ; GCN: s_endpgm
 define amdgpu_kernel void @class_f16_fabs_fneg(
   i32 addrspace(1)* %r,
+  [8 x i32],
   half %a.val,
+  [8 x i32],
   i32 %b.val) {
 entry:
   %a.val.fabs = call half @llvm.fabs.f16(half %a.val)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index f71b975..ed4e6d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,4 +1,4 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
 
 declare i1 @llvm.amdgcn.class.f32(float, i32) #1
 declare i1 @llvm.amdgcn.class.f64(double, i32) #1
@@ -7,14 +7,14 @@
 declare double @llvm.fabs.f64(double) #1
 
 ; SI-LABEL: {{^}}test_class_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -22,14 +22,14 @@
 }
 
 ; SI-LABEL: {{^}}test_class_fabs_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -38,14 +38,14 @@
 }
 
 ; SI-LABEL: {{^}}test_class_fneg_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
   %a.fneg = fsub float -0.0, %a
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -54,14 +54,14 @@
 }
 
 ; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
   %a.fneg.fabs = fsub float -0.0, %a.fabs
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
@@ -183,14 +183,14 @@
 }
 
 ; SI-LABEL: {{^}}test_class_f64:
-; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -198,14 +198,14 @@
 }
 
 ; SI-LABEL: {{^}}test_class_fabs_f64:
-; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -214,14 +214,14 @@
 }
 
 ; SI-LABEL: {{^}}test_class_fneg_f64:
-; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
   %a.fneg = fsub double -0.0, %a
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -230,14 +230,14 @@
 }
 
 ; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
-; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
   %a.fneg.fabs = fsub double -0.0, %a.fabs
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
@@ -268,14 +268,14 @@
 
 ; Set all 9 bits of mask
 ; SI-LABEL: {{^}}test_class_full_mask_f64:
-; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
 ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
 ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
 ; SI-NOT: vcc
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
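
(A note on the rewritten SI offsets, since they recur throughout these
tests: SMRD offsets on SI are dword-scaled, and each [8 x i32] filler
argument occupies 8 dwords. Assuming the first argument sits at its usual
dword 9, the kernarg layout for test_class_fabs_f64 works out to:

  dwords  9-10   i32 addrspace(1)* %out
  dwords 11-18   [8 x i32] filler
  dwords 19-20   double %a     ; s_load_dwordx2 ..., 0x13
  dwords 21-28   [8 x i32] filler
  dword  29      i32 %b        ; s_load_dword ..., 0x1d
)
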
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
index 241b708..4badd8b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
@@ -4,11 +4,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
-; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
-; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
-; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
-; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
+; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[SX]], [[VY]]
 define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
   %r = bitcast <2 x i16> %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
index 8d5c9aa..c264dc7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
@@ -4,11 +4,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
-; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
-; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
-; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
-; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
+; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
+; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[SX]], [[VY]]
 define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
   %r = bitcast <2 x i16> %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
index 822e8c2..414fe15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
@@ -4,11 +4,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
-; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
-; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
-; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
-; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
+; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
+; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
 define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
   %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
   %r = bitcast <2 x i16> %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
index c2b8f3c..cab6c8c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
@@ -4,11 +4,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
-; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
-; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
-; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
-; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
+; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
+; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
 define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
   %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
   %r = bitcast <2 x i16> %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 9cbbdc3f..06e209a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -3,11 +3,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
 
 ; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
-; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
-; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
-; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
-; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
+; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
 define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
   store <2 x half> %result, <2 x half> addrspace(1)* %out
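
(FileCheck aside: the s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}} pattern in
the cvt.pk checks above captures the two register numbers of the
s_load_dwordx2 pair individually, so later lines can name each half on its
own:

  ; GCN: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0xb
  ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
  ; GCN: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]

With -enable-var-scope, variables not prefixed with $ are discarded at each
CHECK-LABEL, so these short names cannot leak between functions.)
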
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index 9018af3..dcb4c63 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -15,9 +15,9 @@
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
-  %c.val = load half, half addrspace(1)* %c
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
+  %c.val = load volatile half, half addrspace(1)* %c
   %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val)
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -35,8 +35,8 @@
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
 entry:
-  %b.val = load half, half addrspace(1)* %b
-  %c.val = load half, half addrspace(1)* %c
+  %b.val = load volatile half, half addrspace(1)* %b
+  %c.val = load volatile half, half addrspace(1)* %c
   %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val)
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -54,8 +54,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %c.val = load half, half addrspace(1)* %c
+  %a.val = load volatile half, half addrspace(1)* %a
+  %c.val = load volatile half, half addrspace(1)* %c
   %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val)
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -73,8 +73,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0)
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -90,7 +90,7 @@
     half addrspace(1)* %r,
     half addrspace(1)* %c) {
 entry:
-  %c.val = load half, half addrspace(1)* %c
+  %c.val = load volatile half, half addrspace(1)* %c
   %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val)
   store half %r.val, half addrspace(1)* %r
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index b8fcacf4..4d9921f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -5,18 +5,20 @@
 declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
 
 ; GCN-LABEL: {{^}}test_div_fixup_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
+
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
+
 ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
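
(On VI the scalar-load offsets are byte-scaled rather than dword-scaled, so
the same [8 x i32] fillers appear as 0x20-byte steps. For
test_div_fixup_f32 the arithmetic is:

  %a: 0x2c (44) + 32       = 0x4c (76)
  %b: 0x4c (76) + 4 + 32   = 0x70 (112)
  %c: 0x70 (112) + 4 + 32  = 0x94 (148)
)
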
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 25b0ad5..34b842d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
 
 ; FIXME: Enable for VI.
 
@@ -8,33 +8,36 @@
 declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
+
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
+
+; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}}
+; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1
+
 ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
 ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
 ; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
 ; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -43,26 +46,32 @@
 ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
+
+; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
+
+; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -77,8 +86,8 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
-; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
   %cmp = icmp eq i32 %i, 0
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
@@ -87,8 +96,8 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
-; SI: s_mov_b64 vcc, 0
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: s_mov_b64 vcc, 0
+; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
@@ -96,8 +105,8 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
-; SI: s_mov_b64 vcc, -1
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: s_mov_b64 vcc, -1
+; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
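
(The s_and_b32/v_cmp_eq_u32 checks added to test_div_fmas_f32 cover the i1
argument: once %d is fetched from the kernarg segment, only bit 0 of the
loaded value is meaningful, so it is masked and compared before feeding
vcc. A minimal sketch of the pattern, with hypothetical register numbers
and offset:

  s_load_dword s4, s[0:1], 0x2e   ; dword containing i1 %d
  s_and_b32 s4, 1, s4             ; isolate bit 0
  v_cmp_eq_u32_e64 vcc, s4, 1     ; i1 -> vcc for v_div_fmas_f32
)
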
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 70b01d6..db7a91e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -230,13 +230,13 @@
 }
 
 ; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -244,13 +244,13 @@
 }
 
 ; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -258,14 +258,14 @@
 }
 
 ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
-; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d
 ; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
 ; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
@@ -273,14 +273,14 @@
 }
 
 ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
-; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x1d
 ; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
 ; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
index 1515a58..85aaee3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
@@ -7,7 +7,7 @@
 ; GCN: s_load_dword s[[S_LO:[0-9]+]]
 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
 ; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}],  dfmt:1,  nfmt:2, 0 idxen
-define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
+define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) {
 main_body:
   call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
   ret void
@@ -41,7 +41,6 @@
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
 ; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}],  dfmt:1,  nfmt:2, 0 idxen
 
-
 ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
 ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
 ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}],  dfmt:1,  nfmt:2, 0 idxen
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 0f3cf79..49f100f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -24,14 +24,14 @@
 
 ; GCN-LABEL: {{^}}ceil_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI-NOT: and
 ; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 16f167a..7c7569c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -24,26 +24,31 @@
 }
 
 ; GCN-LABEL: {{^}}cos_v2f16
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG:  v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
+
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG:  v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
-; SI-DAG:  v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
+; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
+; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
 
 ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
 ; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
 
-; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
-; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
-; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
 
-; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+
+; VI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; VI-DAG:  v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GCN-NOT: and
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index 631bb50..be1920f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -2,11 +2,9 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s
 
 ; GCN-LABEL: {{^}}test_debug_value:
-; NOOPT: s_load_dwordx2 s[4:5]
-
-; FIXME: Why is the SGPR4_SGPR5 reference being removed from DBG_VALUE?
-; NOOPT: ; kill: def $sgpr8_sgpr9 killed $sgpr4_sgpr5
-; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef
+; NOOPT: .loc	1 1 42 prologue_end     ; /tmp/test_debug_value.cl:1:42
+; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5
 
 ; GCN: flat_store_dword
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 907fd48..081dc61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -24,13 +24,13 @@
 
 ; GCN-LABEL: {{^}}floor_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI-NOT: and
 ; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index f597414..846494f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -105,16 +105,17 @@
 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-
-; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+
+
 ; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
@@ -145,8 +146,9 @@
 }
 
 ; GCN-LABEL: {{^}}fma_v2f16_imm_a:
-; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+
 
 ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
@@ -157,13 +159,14 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
 
-; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
+; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+
+; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
+; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 
 ; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
@@ -186,8 +189,8 @@
 }
 
 ; GCN-LABEL: {{^}}fma_v2f16_imm_b:
-; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
 ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
@@ -195,10 +198,10 @@
 ; SI:  v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
 
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
@@ -229,8 +232,8 @@
 }
 
 ; GCN-LABEL: {{^}}fma_v2f16_imm_c:
-; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
 ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
@@ -238,26 +241,31 @@
 ; SI:  v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
 ; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+
+; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
+; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
 ; VI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; VI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
 ; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
-
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16_imm_c(
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 478658c..2d1bdaf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -58,8 +58,8 @@
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
-  %b.val = load half, half addrspace(1)* %b
-  %c.val = load half, half addrspace(1)* %c
+  %b.val = load volatile half, half addrspace(1)* %b
+  %c.val = load volatile half, half addrspace(1)* %c
   %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -87,56 +87,64 @@
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
-  %a.val = load half, half addrspace(1)* %a
-  %c.val = load half, half addrspace(1)* %c
+  %a.val = load volatile half, half addrspace(1)* %a
+  %c.val = load volatile half, half addrspace(1)* %c
   %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
   store half %r.val, half addrspace(1)* %r
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmuladd_v2f16
-; VI:  buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+
+; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+
+
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI:  v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
-; SI:  v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
-; SI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI-DAG:  v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG:  v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
+; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
 
-; VI-FLUSH:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
-; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
+
+; VI-FLUSH:     v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
+; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
 ; VI-FLUSH-NOT: v_and_b32
-; VI-FLUSH:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
+; VI-FLUSH:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
 
 ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]]
-; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]]
+; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
 ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
 ; VI-DENORM-NOT: v_and_b32
 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
-
 define amdgpu_kernel void @fmuladd_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index c57b545..13fdd28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -22,8 +22,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -66,17 +66,16 @@
 }
 
 ; GCN-LABEL: {{^}}maxnum_v2f16:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
 ; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
@@ -89,7 +88,7 @@
 ; VI-NOT: and
 ; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
+; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
@@ -107,13 +106,13 @@
 
 ; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
 ; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
@@ -127,7 +126,6 @@
 ; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
@@ -140,13 +138,13 @@
 
 ; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 
 ; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
 ; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -162,7 +160,6 @@
 ; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
@@ -192,8 +189,8 @@
 ; GCN-LABEL: {{^}}maxnum_v4f16:
 ; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
 ; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
+; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
 define amdgpu_kernel void @maxnum_v4f16(
     <4 x half> addrspace(1)* %r,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 8c81fdb..b34ad6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -22,8 +22,8 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
   store half %r.val, half addrspace(1)* %r
   ret void
@@ -66,32 +66,31 @@
 }
 
 ; GCN-LABEL: {{^}}minnum_v2f16:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
 ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI-NOT: and
-; SI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
 ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NOT: and
-; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
+; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @minnum_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
@@ -106,22 +105,21 @@
 
 ; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
 ; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
 ; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
 
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+
 
 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
 ; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
@@ -139,26 +137,28 @@
 
 ; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
 ; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
 ; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
 
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+
+
 ; SIVI-NOT: and
 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
+; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @minnum_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
@@ -188,8 +188,8 @@
 ; GCN-LABEL: {{^}}minnum_v4f16:
 ; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
 ; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
+; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
 define amdgpu_kernel void @minnum_v4f16(
     <4 x half> addrspace(1)* %r,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 0e60f27..7c7bf3c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -25,13 +25,13 @@
 
 ; GCN-LABEL: {{^}}rint_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI-NOT: v_and_b32
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 839ff06..aa22888 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -1,10 +1,10 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 declare half @llvm.sin.f16(half %a)
 declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
 
-; GCN-LABEL: {{^}}sin_f16
+; GCN-LABEL: {{^}}sin_f16:
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{0.15915494|0x3e22f983}}, v[[A_F32]]
@@ -23,16 +23,20 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}sin_v2f16
+; GCN-LABEL: {{^}}sin_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
-; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
-; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+; SI:  v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
+
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
+; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
+; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+; SI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+; SI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 
 ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -40,12 +44,11 @@
 ; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
 ; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
 ; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+; VI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+; VI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
 
-; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
-; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index a8ff7cb..1e9212e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -24,13 +24,13 @@
 
 ; GCN-LABEL: {{^}}trunc_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI-NOT: v_and_b32
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 5d584b6..8a21f75 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -17,7 +17,7 @@
 ; GCN-NOT: load_dword
 
 ; GCN: flat_store_dwordx2
-define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64* %ptr1, i64 addrspace(1)* %ptr2) {
+define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %ptr0, [8 x i32], i64* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0
   %tmp3 = load i64, i64* %ptr0, align 8
   %tmp4 = load i64, i64* %ptr1, align 8
@@ -38,7 +38,7 @@
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
 ; GCN: flat_store_dwordx2
-define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, i64 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1, i64 addrspace(1)* %ptr2) {
+define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0
   %tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8
   %tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
new file mode 100644
index 0000000..83a2e58
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
@@ -0,0 +1,1286 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; FIXME: Manually added checks for metadata nodes at bottom
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s
+; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s
+
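+; A note on the two RUN lines: with the amdhsa triple the first explicit
+; argument sits at the start of the kernarg segment, while the mesa path
+; reserves 36 bytes ahead of it, which is why every MESA block below
+; starts with a getelementptr of 36 and a larger dereferenceable size.
+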
+define amdgpu_kernel void @kern_noargs() {
+; HSA-LABEL: @kern_noargs(
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_noargs(
+; MESA-NEXT:    ret void
+;
+  ret void
+}
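+
+; With no arguments there is nothing to lower, so not even the kernarg
+; segment pointer call is emitted.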
+
+define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
+; HSA-LABEL: @kern_i8(
+; HSA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]] to [[KERN_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    store i8 [[TMP4]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_i8(
+; MESA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    store i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store i8 %arg, i8 addrspace(1)* undef, align 1
+  ret void
+}
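+
+; Arguments narrower than a dword are not loaded at their own width:
+; the pass loads the containing 32-bit word and truncates, keeping the
+; kernarg access dword-sized and naturally aligned.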
+
+define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
+; HSA-LABEL: @kern_i16(
+; HSA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]] to [[KERN_I16:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+; HSA-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_i16(
+; MESA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I16:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; MESA-NEXT:    store i16 [[TMP5]], i16 addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store i16 %arg, i16 addrspace(1)* undef, align 1
+  ret void
+}
+
+define amdgpu_kernel void @kern_f16(half %arg) #0 {
+; HSA-LABEL: @kern_f16(
+; HSA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]] to [[KERN_F16:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP4]] to half
+; HSA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_f16(
+; MESA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F16:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
+; MESA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store half %arg, half addrspace(1)* undef, align 1
+  ret void
+}
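+
+; Half uses the same dword load and trunc as i8/i16, with a final
+; bitcast from the raw i16 bits back to the IR type.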
+
+define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
+; HSA-LABEL: @kern_zeroext_i8(
+; HSA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    store i8 [[TMP4]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_zeroext_i8(
+; MESA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !1, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    store i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store i8 %arg, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
+; HSA-LABEL: @kern_zeroext_i16(
+; HSA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+; HSA-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_zeroext_i16(
+; MESA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !2, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; MESA-NEXT:    store i16 [[TMP5]], i16 addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store i16 %arg, i16 addrspace(1)* undef, align 1
+  ret void
+}
+
+define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
+; HSA-LABEL: @kern_signext_i8(
+; HSA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    store i8 [[TMP4]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_signext_i8(
+; MESA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !3, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    store i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store i8 %arg, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
+; HSA-LABEL: @kern_signext_i16(
+; HSA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+; HSA-NEXT:    store i16 [[TMP4]], i16 addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_signext_i16(
+; MESA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; MESA-NEXT:    store i16 [[TMP5]], i16 addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store i16 %arg, i16 addrspace(1)* undef, align 1
+  ret void
+}
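+
+; For the zeroext/signext variants only the MESA output carries !range
+; metadata on the dword load (!1 through !4, checked at the bottom of
+; the file); the HSA output loads without a range annotation.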
+
+define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
+; HSA-LABEL: @kern_i8_i8(
+; HSA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]] to [[KERN_I8_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; HSA-NEXT:    store volatile i8 [[TMP4]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_i8_i8(
+; MESA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    ret void
+;
+  store volatile i8 %arg0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %arg1, i8 addrspace(1)* undef, align 1
+  ret void
+}
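+
+; When two sub-dword arguments share a dword, both lowerings reload the
+; same 32-bit word and extract the second argument with an lshr by its
+; bit offset before truncating, instead of a second narrow load.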
+
+define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
+; HSA-LABEL: @kern_v3i8(
+; HSA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]] to [[KERN_V3I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
+; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
+; HSA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_v3i8(
+; MESA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24
+; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP5]] to <3 x i8>
+; MESA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
+; MESA-NEXT:    ret void
+;
+  store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4
+  ret void
+}
+
+define amdgpu_kernel void @kern_i24(i24 %arg0) {
+; HSA-LABEL: @kern_i24(
+; HSA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]] to [[KERN_I24:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
+; HSA-NEXT:    store i24 [[TMP4]], i24 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_i24(
+; MESA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I24:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24
+; MESA-NEXT:    store i24 [[TMP5]], i24 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store i24 %arg0, i24 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_i32(i32 %arg0) {
+; HSA-LABEL: @kern_i32(
+; HSA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]] to [[KERN_I32:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_i32(
+; MESA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store i32 %arg0, i32 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_f32(float %arg0) {
+; HSA-LABEL: @kern_f32(
+; HSA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]] to [[KERN_F32:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_f32(
+; MESA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F32:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store float %arg0, float addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
+; HSA-LABEL: @kern_v3i32(
+; HSA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]] to [[KERN_V3I32:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; HSA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_v3i32(
+; MESA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I32:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; MESA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
+; MESA-NEXT:    ret void
+;
+  store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4
+  ret void
+}
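+
+; A <3 x i32> argument is loaded as a full <4 x i32> and the extra lane
+; dropped with a shufflevector; the dereferenceable size on the segment
+; call shows the over-read stays inside the kernarg area.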
+
+define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
+; HSA-LABEL: @kern_i32_v3i32(
+; HSA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]] to [[KERN_I32_V3I32:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 1
+; HSA-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0
+; HSA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_i32_v3i32(
+; MESA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32_V3I32:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 1
+; MESA-NEXT:    [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0
+; MESA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
+; MESA-NEXT:    ret void
+;
+  store i32 %arg0, i32 addrspace(1)* undef
+  store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4
+  ret void
+}
+
+%struct.a = type { i32, i8, [4 x i8] }
+%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }
+
+define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
+; HSA-LABEL: @kern_struct_a(
+; HSA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]] to [[KERN_STRUCT_A:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_struct_a(
+; MESA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_A:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store %struct.a %arg0, %struct.a addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
+; HSA-LABEL: @kern_struct_b_packed(
+; HSA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_struct_b_packed(
+; MESA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
+; HSA-LABEL: @kern_implicit_arg_num_bytes(
+; HSA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_implicit_arg_num_bytes(
+; MESA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store i32 %arg0, i32 addrspace(1)* undef
+  ret void
+}
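+
+; The #1 attribute grows the dereferenceable size of the segment beyond
+; the explicit arguments (48 for HSA, 80 for MESA here) without changing
+; how the single i32 argument itself is lowered.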
+
+define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
+; HSA-LABEL: @kern_lds_ptr(
+; HSA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]] to [[KERN_LDS_PTR:%.*]] addrspace(4)*
+; HSA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_lds_ptr(
+; MESA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR:%.*]] addrspace(4)*
+; MESA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
+; MESA-NEXT:    ret void
+;
+  store i32 0, i32 addrspace(3)* %lds, align 4
+  ret void
+}
+
+define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
+; HSA-LABEL: @kern_lds_ptr_si(
+; HSA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)*
+; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_lds_ptr_si(
+; MESA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)*
+; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
+; MESA-NEXT:    ret void
+;
+  store i32 0, i32 addrspace(3)* %lds, align 4
+  ret void
+}
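+
+; With the #2 subtarget attributes the local pointer argument is left in
+; place: the segment pointer is still materialized, but no load replaces
+; %lds and the store still uses the original argument.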
+
+define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
+; HSA-LABEL: @kern_realign_i8_i8(
+; HSA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; HSA-NEXT:    store volatile i8 [[TMP4]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i8_i8(
+; MESA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile i8 %arg1, i8 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
+; HSA-LABEL: @kern_realign_i8_i8_i8(
+; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; HSA-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
+; HSA-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
+; HSA-NEXT:    store volatile i8 [[TMP4]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i8_i8_i8(
+; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; MESA-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
+; MESA-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP13]], i8 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile i8 %arg1, i8 addrspace(1)* undef
+  store volatile i8 %arg2, i8 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
+; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
+; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; HSA-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
+; HSA-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
+; HSA-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
+; HSA-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8
+; HSA-NEXT:    store volatile i8 [[TMP4]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP16]], i8 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
+; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; MESA-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
+; MESA-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
+; MESA-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
+; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP13]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile i8 %arg1, i8 addrspace(1)* undef
+  store volatile i8 %arg2, i8 addrspace(1)* undef
+  store volatile i8 %arg3, i8 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
+; HSA-LABEL: @kern_realign_i8_v3i8(
+; HSA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i24
+; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP7]] to <3 x i8>
+; HSA-NEXT:    store volatile i8 [[TMP4]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i8_v3i8(
+; MESA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i24
+; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP8]] to <3 x i8>
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef
+  ret void
+}
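+
+; Unlike the packed i8 pairs above, the <3 x i8> does not share arg0's
+; dword; it starts at the next 4-byte slot (offset 4) and is itself
+; loaded as a dword and truncated to i24.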
+
+define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
+; HSA-LABEL: @kern_realign_i8_i16(
+; HSA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
+; HSA-NEXT:    store volatile i8 [[TMP4]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i16 [[TMP8]], i16 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i8_i16(
+; MESA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 16
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i16 [[TMP9]], i16 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile i16 %arg1, i16 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
+; HSA-LABEL: @kern_realign_i1_i1(
+; HSA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
+; HSA-NEXT:    store volatile i1 [[TMP4]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i1_i1(
+; MESA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP9]], i1 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i1 %arg0, i1 addrspace(1)* undef
+  store volatile i1 %arg1, i1 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
+; HSA-LABEL: @kern_realign_i1_i1_i1(
+; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
+; HSA-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
+; HSA-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1
+; HSA-NEXT:    store volatile i1 [[TMP4]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP12]], i1 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i1_i1_i1(
+; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1
+; MESA-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
+; MESA-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP9]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP13]], i1 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i1 %arg0, i1 addrspace(1)* undef
+  store volatile i1 %arg1, i1 addrspace(1)* undef
+  store volatile i1 %arg2, i1 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
+; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
+; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
+; HSA-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
+; HSA-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1
+; HSA-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
+; HSA-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i1
+; HSA-NEXT:    store volatile i1 [[TMP4]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP12]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP16]], i1 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
+; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1
+; MESA-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
+; MESA-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1
+; MESA-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
+; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i1
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP9]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP13]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP17]], i1 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i1 %arg0, i1 addrspace(1)* undef
+  store volatile i1 %arg1, i1 addrspace(1)* undef
+  store volatile i1 %arg2, i1 addrspace(1)* undef
+  store volatile i1 %arg3, i1 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
+; HSA-LABEL: @kern_realign_i1_v3i1(
+; HSA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
+; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP7]] to <3 x i1>
+; HSA-NEXT:    store volatile i1 [[TMP4]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i1_v3i1(
+; MESA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i3
+; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP8]] to <3 x i1>
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i1 %arg0, i1 addrspace(1)* undef
+  store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
+; HSA-LABEL: @kern_realign_i1_i16(
+; HSA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
+; HSA-NEXT:    store volatile i1 [[TMP4]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i16 [[TMP8]], i16 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i1_i16(
+; MESA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 16
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i16 [[TMP9]], i16 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i1 %arg0, i1 addrspace(1)* undef
+  store volatile i16 %arg1, i16 addrspace(1)* undef
+  ret void
+}
+
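+; %arg4 is unused, so no load is emitted for it; the checks go
+; straight from arg3 to arg5 within the same dwords.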
+define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
+; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
+; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; HSA-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
+; HSA-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
+; HSA-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
+; HSA-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8
+; HSA-NEXT:    [[TMP17:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP17]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; HSA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 8
+; HSA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
+; HSA-NEXT:    [[TMP21:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP21]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP22:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; HSA-NEXT:    [[TMP23:%.*]] = lshr i32 [[TMP22]], 16
+; HSA-NEXT:    [[TMP24:%.*]] = trunc i32 [[TMP23]] to i8
+; HSA-NEXT:    [[TMP25:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
+; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP25]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP26:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; HSA-NEXT:    [[TMP27:%.*]] = lshr i32 [[TMP26]], 24
+; HSA-NEXT:    [[TMP28:%.*]] = trunc i32 [[TMP27]] to i8
+; HSA-NEXT:    store volatile i8 [[TMP4]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP16]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP24]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP28]], i8 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
+; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
+; MESA-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
+; MESA-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
+; MESA-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
+; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+; MESA-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
+; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP18]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP19:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP19]], 8
+; MESA-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
+; MESA-NEXT:    [[TMP22:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
+; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP22]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP23:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP24:%.*]] = lshr i32 [[TMP23]], 16
+; MESA-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i8
+; MESA-NEXT:    [[TMP26:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
+; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP26]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP27:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP28:%.*]] = lshr i32 [[TMP27]], 24
+; MESA-NEXT:    [[TMP29:%.*]] = trunc i32 [[TMP28]] to i8
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP13]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP21]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP25]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP29]], i8 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile i8 %arg1, i8 addrspace(1)* undef
+  store volatile i8 %arg2, i8 addrspace(1)* undef
+  store volatile i8 %arg3, i8 addrspace(1)* undef
+  store volatile i8 %arg5, i8 addrspace(1)* undef
+  store volatile i8 %arg6, i8 addrspace(1)* undef
+  store volatile i8 %arg7, i8 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
+; HSA-LABEL: @kern_realign_f16_f16(
+; HSA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)*
+; HSA-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+; HSA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP4]] to half
+; HSA-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
+; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
+; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
+; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
+; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP8]] to half
+; HSA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
+; HSA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_realign_f16_f16(
+; MESA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)*
+; MESA-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; MESA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
+; MESA-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
+; MESA-NEXT:    [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
+; MESA-NEXT:    [[TMP8:%.*]] = lshr i32 [[TMP7]], 16
+; MESA-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
+; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP9]] to half
+; MESA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
+; MESA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile half %arg0, half addrspace(1)* undef
+  store volatile half %arg1, half addrspace(1)* undef
+  ret void
+}
+
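+; The pointer argument attributes below (nonnull, align,
+; dereferenceable, dereferenceable_or_null) are carried over onto the
+; kernarg loads as the equivalent load metadata.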
+define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
+; HSA-LABEL: @kern_global_ptr(
+; HSA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)*
+; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_global_ptr(
+; MESA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)*
+; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 {
+; HSA-LABEL: @kern_global_ptr_dereferencable(
+; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)*
+; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable !1
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_global_ptr_dereferencable(
+; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)*
+; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable !5
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 {
+; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
+; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)*
+; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable_or_null !2
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
+; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)*
+; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable_or_null !6
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 {
+; HSA-LABEL: @kern_nonnull_global_ptr(
+; HSA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)*
+; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !nonnull !0
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_nonnull_global_ptr(
+; MESA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)*
+; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !nonnull !0
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 {
+; HSA-LABEL: @kern_align32_global_ptr(
+; HSA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)*
+; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !align !3
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_align32_global_ptr(
+; MESA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)*
+; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
+; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !align !7
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
+  ret void
+}
+
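+; noalias pointer arguments are left in place: the checks show the
+; original argument used directly, with no kernarg load emitted.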
+define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
+; HSA-LABEL: @kern_noalias_global_ptr(
+; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)*
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr(
+; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)*
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
+; HSA-LABEL: @kern_noalias_global_ptr_x2(
+; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)*
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr_x2(
+; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)*
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
+  store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="kaveri" }
+attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
+attributes #2 = { nounwind "target-cpu"="tahiti" }
+
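+; !1/!2/!3 (HSA) and !5/!6/!7 (MESA) hold the dereferenceable(42),
+; dereferenceable_or_null(128), and align 1024 values from the pointer
+; attribute tests above.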
+; HSA: !0 = !{}
+; HSA: !1 = !{i64 42}
+; HSA: !2 = !{i64 128}
+; HSA: !3 = !{i64 1024}
+
+; MESA: !0 = !{}
+; MESA: !1 = !{i32 0, i32 256}
+; MESA: !2 = !{i32 0, i32 65536}
+; MESA: !3 = !{i32 -128, i32 128}
+; MESA: !4 = !{i32 -32768, i32 32768}
+; MESA: !5 = !{i64 42}
+; MESA: !6 = !{i64 128}
+; MESA: !7 = !{i64 1024}
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 413c93d..e88550f 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,28 +8,14 @@
 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
 
-
-; VI: s_load_dword [[LHS:s[0-9]+]]
-; VI: s_load_dword [[RHS:s[0-9]+]]
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
-; VI-DAG: s_lshl_b32
-; VI: v_or_b32_e32
-
-; CI: s_load_dword s
-; CI-NEXT: s_load_dword s
-; CI-NOT: {{buffer|flat}}
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
-; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; CI: s_and_b32
-; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; CI: s_and_b32
-; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
-; CI: s_lshl_b32
-; CI: v_or_b32_e32
+; CIVI: s_load_dword [[LHS:s[0-9]+]]
+; CIVI: s_load_dword [[RHS:s[0-9]+]]
+; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
+; CIVI-DAG: s_lshl_b32
+; CIVI: v_or_b32_e32
 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
   %result = lshr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 9fe1af8..3528eaf 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -206,14 +206,14 @@
 ; SIFoldOperands should not fold the SGPR copy into the instruction
 ; because the implicit immediate already uses the constant bus.
 ; GCN-LABEL: {{^}}madak_constant_bus_violation:
-; GCN:    s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
+; GCN:    s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
 ; GCN:    v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
 ; GCN:    {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
 ; GCN:    v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
 ; GCN:    v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
 ; GFX6:   buffer_store_dword [[MUL]]
 ; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
-define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
+define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
 bb:
   %tmp = icmp eq i32 %arg1, 0
   br i1 %tmp, label %bb3, label %bb4
diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll
index 3eefa60..7f8bb87 100644
--- a/llvm/test/CodeGen/AMDGPU/madmk.ll
+++ b/llvm/test/CodeGen/AMDGPU/madmk.ll
@@ -83,7 +83,7 @@
 ; GCN-NOT: v_madmk_f32
 ; GCN: v_mac_f32_e32
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index 6387c9f..b79f477 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -216,14 +216,14 @@
 
 ; Make sure the redundant and is removed
 ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
 ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_UINT
-define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ugt i32 %a.ext, %b.ext
@@ -236,14 +236,14 @@
 ; Make sure the redundant sign_extend_inreg is removed.
 
 ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
 ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_INT
-define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp sgt i32 %a.ext, %b.ext
@@ -262,7 +262,7 @@
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind {
   %cmp = icmp sge i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
   store i16 %val, i16 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index b4b1ab8..b2d62f5 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
 ; GCN: v_min_i32_e32
@@ -65,16 +65,14 @@
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: s_min_i32
-define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
   %cmp = icmp sle i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
   store i8 %val, i8 addrspace(1)* %out
   ret void
 }
 
-; XXX - should be able to use s_min if we stop unnecessarily doing
-; extloads with mubuf instructions.
-
+; FIXME: Why vector and sdwa for last element?
 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
 ; GCN: s_load_dword s
 ; GCN: s_load_dword s
@@ -88,7 +86,7 @@
 ; VI: s_min_i32
 ; VI: s_min_i32
 ; VI: s_min_i32
-; VI: s_min_i32
+; VI: v_min_i32_sdwa
 
 ; GFX9: v_min_i16
 ; GFX9: v_min_i16
@@ -99,7 +97,7 @@
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
   %cmp = icmp sle <4 x i8> %a, %b
   %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out
@@ -111,8 +109,8 @@
 ; GCN: s_load_dword s
 
 ; SI: s_ashr_i32
-; SI: s_ashr_i32
 ; SI: s_sext_i32_i16
+; SI: s_ashr_i32
 ; SI: s_sext_i32_i16
 ; SI: s_min_i32
 ; SI: s_min_i32
@@ -346,8 +344,8 @@
 }
 
 ; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; SI: {{buffer|flat|global}}_load_ubyte
+; SI: {{buffer|flat|global}}_load_ubyte
 ; SI: v_min_u32_e32
 
 ; GFX89: {{flat|global}}_load_ubyte
@@ -490,14 +488,14 @@
 
 ; Make sure the redundant and is removed
 ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
-; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
+; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
 ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
 ; GCN: buffer_store_dword [[VMIN]]
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ult i32 %a.ext, %b.ext
@@ -510,14 +508,17 @@
 ; Make sure the redundant sign_extend_inreg is removed.
 
 ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
-; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
+; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
+; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]]
+; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]]
+
+; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]]
 ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
 ; GCN: buffer_store_dword [[VMIN]]
 
 ; EG: MIN_INT
-define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
+define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp slt i32 %a.ext, %b.ext
diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll
index 9b00507..9746ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/missing-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll
@@ -6,7 +6,7 @@
 ; resulting in losing the store to gptr
 
 ; FUNC-LABEL: {{^}}missing_store_reduced:
-; SI: s_load_dwordx2
+; SI: s_load_dwordx4
 ; SI: ds_read_b64
 ; SI-DAG: buffer_store_dword
 ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index 5bc59c9..2144b4b 100644
--- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -19,7 +19,7 @@
 ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
 
-define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
 bb:
   %tmp = icmp sgt i32 %arg3, 0
   br i1 %tmp, label %bb4, label %bb17
diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
index 0250002..678fc3d 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
@@ -16,7 +16,8 @@
 
 ; FIXME: Should emit scalar mul or maybe i16 v_mul here
 ; GCN-LABEL: {{^}}s_mul_i16:
-; GCN: v_mul_u32_u24
+; SI: v_mul_u32_u24
+; VI: s_mul_i16
 define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
   %r.val = mul i16 %a, %b
   store volatile i16 %r.val, i16 addrspace(1)* null
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index a3d05d1..b446158 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -114,7 +114,7 @@
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
   %mul = mul i32 %a, %b
   store i32 %mul, i32 addrspace(1)* %out, align 4
   ret void
@@ -201,10 +201,8 @@
 
 ; FIXME: Load dwordx4
 ; FUNC-LABEL: {{^}}s_mul_i128:
-; GCN: s_load_dwordx2
-; GCN: s_load_dwordx2
-; GCN: s_load_dwordx2
-; GCN: s_load_dwordx2
+; GCN: s_load_dwordx4
+; GCN: s_load_dwordx4
 
 ; SI: v_mul_hi_u32
 ; SI: v_mul_hi_u32
@@ -220,18 +218,23 @@
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_u32
 
-; VI: s_mul_i32
 ; VI: v_mul_hi_u32
 ; VI: s_mul_i32
+; VI: s_mul_i32
 ; VI: v_mul_hi_u32
+; VI: v_mul_hi_u32
+; VI: s_mul_i32
 ; VI: v_mad_u64_u32
+; VI: s_mul_i32
 ; VI: v_mad_u64_u32
+; VI: s_mul_i32
+; VI: s_mul_i32
 ; VI: v_mad_u64_u32
-
+; VI: s_mul_i32
 
 
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
+define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
   %mul = mul i128 %a, %b
   store i128 %mul, i128 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 3137569..84f6d4f 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -70,7 +70,7 @@
 ; GCN-DAG: v_mul_i32_i24_e32
 
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
   %shl.i = shl i32 %a, 8
   %shr.i = ashr i32 %shl.i, 8
   %conv.i = sext i32 %shr.i to i64
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index e538dd7..81f72dd 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -18,8 +18,11 @@
 }
 
 ; FUNC-LABEL: {{^}}test_umul24_i16_sext:
-; GCN: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
+; SI: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
+
+; VI: s_mul_i32 [[MUL:s[0-9]+]]
+; VI: s_sext_i32_i16 s{{[0-9]+}}, [[MUL]]
 define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
@@ -46,9 +49,12 @@
 }
 
 ; FUNC-LABEL: {{^}}test_umul24_i16:
-; GCN: s_and_b32
-; GCN: v_mul_u32_u24_e32
-; GCN: v_and_b32_e32
+; SI: s_and_b32
+; SI: v_mul_u32_u24_e32
+; SI: v_and_b32_e32
+
+; VI: s_mul_i32
+; VI: s_and_b32
 define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
@@ -147,7 +153,7 @@
 ; GCN-NOT: s_and_b32
 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
 ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
-define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
 entry:
   %tmp0 = shl i64 %a, 40
   %a.24 = lshr i64 %tmp0, 40
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index b2cfeca..e436a85 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -70,14 +70,14 @@
 ; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
 
-; GCN: ; %Flow1
+; GCN: ; %Flow5
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: v_cmp_ne_u32_e32 vcc, 0
 
 ; GCN: ; %exit1
 ; GCN: ds_write_b32
 
-; GCN: %Flow2
+; GCN: %Flow6
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: v_cmp_ne_u32_e32 vcc, 0
 ; GCN-NEXT: s_and_saveexec_b64
diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index 48b9c13..eb16f85 100644
--- a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -78,7 +78,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i32
   store i32 %trunc, i32 addrspace(1)* %out
   ret void
@@ -100,7 +100,7 @@
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -147,7 +147,7 @@
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i8
   store i8 %trunc, i8 addrspace(1)* %out
@@ -171,7 +171,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i8
   store i8 %trunc, i8 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
index bced3c4..763d754 100644
--- a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
@@ -6,7 +6,7 @@
 ; GCN:  v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
 ; GCN:  flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
 
-define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, i32 addrspace(1)* nocapture %arg1) {
+define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, [8 x i32], i32 addrspace(1)* nocapture %arg1) {
 bb:
   %tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4
   %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5
diff --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
index fc6f070..6fc0955 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -4,14 +4,18 @@
 ; Make sure there isn't an extra space between the instruction name and first operands.
 
 ; GCN-LABEL: {{^}}add_f32:
-; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
-; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
+; SI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
+; SI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI: v_mov_b32_e32 [[VREGA:v[0-9]+]], [[SREGA]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGB]], [[VREGA]]
+
+; VI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
+; VI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
+; VI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
+; VI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
+
 ; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @add_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
   %result = fadd float %a, %b
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 8e6885c..88e01c9 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -63,26 +63,26 @@
 }
 
 ; FUNC-LABEL: {{^}}scalar_or_literal_i64:
-; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
 ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %or = or i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64:
-; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
 ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
   %or = or i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
 
@@ -92,7 +92,7 @@
 }
 
 ; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64:
-; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; SI-NOT: or_b32
 ; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63
 ; SI-NOT: or_b32
@@ -101,7 +101,7 @@
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: or_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %or = or i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -125,7 +125,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
 ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %or = or i64 %a, -8
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -239,7 +239,7 @@
 ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
   %add = or i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -249,7 +249,7 @@
 ; FUNC-LABEL: {{^}}or_i1:
 ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
 
-; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], vcc
 define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float, float addrspace(1)* %in0
   %b = load float, float addrspace(1)* %in1
diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index 49f00e9..d31d636 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -10,26 +10,26 @@
 
 ; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs:
 
-; GCN: def s[8:15]
-; GCN: def s[16:23]
-; GCN: def s[24:31]
-; GCN: def s[32:39]
-; GCN: def s[40:47]
-; GCN: def s[48:55]
-; GCN: def s[56:63]
-; GCN: def s[64:71]
-; GCN: def s[72:79]
-; GCN: def s[80:87]
-; GCN: def s[88:95]
+; GCN: def s[4:11]
+; GCN: def s[12:19]
+; GCN: def s[20:27]
+; GCN: def s[28:35]
+; GCN: def s[36:43]
+; GCN: def s[44:51]
+; GCN: def s[52:59]
+; GCN: def s[60:67]
+; GCN: def s[68:75]
+; GCN: def s[76:83]
+; GCN: def s[84:91]
 
-; GCN: v_writelane_b32 v0, s8, 0
-; GCN-NEXT: v_writelane_b32 v0, s9, 1
-; GCN-NEXT: v_writelane_b32 v0, s10, 2
-; GCN-NEXT: v_writelane_b32 v0, s11, 3
-; GCN-NEXT: v_writelane_b32 v0, s12, 4
-; GCN-NEXT: v_writelane_b32 v0, s13, 5
-; GCN-NEXT: v_writelane_b32 v0, s14, 6
-; GCN-NEXT: v_writelane_b32 v0, s15, 7
+; GCN: v_writelane_b32 v0, s4, 0
+; GCN-NEXT: v_writelane_b32 v0, s5, 1
+; GCN-NEXT: v_writelane_b32 v0, s6, 2
+; GCN-NEXT: v_writelane_b32 v0, s7, 3
+; GCN-NEXT: v_writelane_b32 v0, s8, 4
+; GCN-NEXT: v_writelane_b32 v0, s9, 5
+; GCN-NEXT: v_writelane_b32 v0, s10, 6
+; GCN-NEXT: v_writelane_b32 v0, s11, 7
 
 ; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}}
 ; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8
@@ -37,8 +37,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
-; GCN-NEXT: v_writelane_b32 v0, s13, 13
-; GCN-NEXT: v_writelane_b32 v0, s14, 14
+; GCN-NEXT: v_writelane_b32 v0, s9, 13
+; GCN-NEXT: v_writelane_b32 v0, s10, 14
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -47,8 +47,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
-; GCN-NEXT: v_writelane_b32 v0, s13, 21
-; GCN-NEXT: v_writelane_b32 v0, s14, 22
+; GCN-NEXT: v_writelane_b32 v0, s9, 21
+; GCN-NEXT: v_writelane_b32 v0, s10, 22
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -57,8 +57,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
-; GCN-NEXT: v_writelane_b32 v0, s13, 29
-; GCN-NEXT: v_writelane_b32 v0, s14, 30
+; GCN-NEXT: v_writelane_b32 v0, s9, 29
+; GCN-NEXT: v_writelane_b32 v0, s10, 30
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -67,8 +67,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
-; GCN-NEXT: v_writelane_b32 v0, s13, 37
-; GCN-NEXT: v_writelane_b32 v0, s14, 38
+; GCN-NEXT: v_writelane_b32 v0, s9, 37
+; GCN-NEXT: v_writelane_b32 v0, s10, 38
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -77,8 +77,8 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
-; GCN-NEXT: v_writelane_b32 v0, s13, 45
-; GCN-NEXT: v_writelane_b32 v0, s14, 46
+; GCN-NEXT: v_writelane_b32 v0, s9, 45
+; GCN-NEXT: v_writelane_b32 v0, s10, 46
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@@ -87,90 +87,90 @@
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
-; GCN-NEXT: v_writelane_b32 v0, s13, 53
-; GCN-NEXT: v_writelane_b32 v0, s14, 54
+; GCN-NEXT: v_writelane_b32 v0, s9, 53
+; GCN-NEXT: v_writelane_b32 v0, s10, 54
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
 
-; GCN-NEXT: v_writelane_b32 v0, s88, 56
-; GCN-NEXT: v_writelane_b32 v0, s89, 57
-; GCN-NEXT: v_writelane_b32 v0, s90, 58
-; GCN-NEXT: v_writelane_b32 v0, s91, 59
-; GCN-NEXT: v_writelane_b32 v0, s92, 60
-; GCN-NEXT: v_writelane_b32 v0, s93, 61
-; GCN-NEXT: v_writelane_b32 v0, s94, 62
-; GCN-NEXT: v_writelane_b32 v0, s95, 63
-; GCN-NEXT: v_writelane_b32 v1, s16, 0
-; GCN-NEXT: v_writelane_b32 v1, s17, 1
-; GCN-NEXT: v_writelane_b32 v1, s18, 2
-; GCN-NEXT: v_writelane_b32 v1, s19, 3
-; GCN-NEXT: v_writelane_b32 v1, s20, 4
-; GCN-NEXT: v_writelane_b32 v1, s21, 5
-; GCN-NEXT: v_writelane_b32 v1, s22, 6
-; GCN-NEXT: v_writelane_b32 v1, s23, 7
-; GCN-NEXT: v_writelane_b32 v1, s24, 8
-; GCN-NEXT: v_writelane_b32 v1, s25, 9
-; GCN-NEXT: v_writelane_b32 v1, s26, 10
-; GCN-NEXT: v_writelane_b32 v1, s27, 11
-; GCN-NEXT: v_writelane_b32 v1, s28, 12
-; GCN-NEXT: v_writelane_b32 v1, s29, 13
-; GCN-NEXT: v_writelane_b32 v1, s30, 14
-; GCN-NEXT: v_writelane_b32 v1, s31, 15
-; GCN-NEXT: v_writelane_b32 v1, s32, 16
-; GCN-NEXT: v_writelane_b32 v1, s33, 17
-; GCN-NEXT: v_writelane_b32 v1, s34, 18
-; GCN-NEXT: v_writelane_b32 v1, s35, 19
-; GCN-NEXT: v_writelane_b32 v1, s36, 20
-; GCN-NEXT: v_writelane_b32 v1, s37, 21
-; GCN-NEXT: v_writelane_b32 v1, s38, 22
-; GCN-NEXT: v_writelane_b32 v1, s39, 23
-; GCN-NEXT: v_writelane_b32 v1, s40, 24
-; GCN-NEXT: v_writelane_b32 v1, s41, 25
-; GCN-NEXT: v_writelane_b32 v1, s42, 26
-; GCN-NEXT: v_writelane_b32 v1, s43, 27
-; GCN-NEXT: v_writelane_b32 v1, s44, 28
-; GCN-NEXT: v_writelane_b32 v1, s45, 29
-; GCN-NEXT: v_writelane_b32 v1, s46, 30
-; GCN-NEXT: v_writelane_b32 v1, s47, 31
-; GCN-NEXT: v_writelane_b32 v1, s48, 32
-; GCN-NEXT: v_writelane_b32 v1, s49, 33
-; GCN-NEXT: v_writelane_b32 v1, s50, 34
-; GCN-NEXT: v_writelane_b32 v1, s51, 35
-; GCN-NEXT: v_writelane_b32 v1, s52, 36
-; GCN-NEXT: v_writelane_b32 v1, s53, 37
-; GCN-NEXT: v_writelane_b32 v1, s54, 38
-; GCN-NEXT: v_writelane_b32 v1, s55, 39
-; GCN-NEXT: v_writelane_b32 v1, s56, 40
-; GCN-NEXT: v_writelane_b32 v1, s57, 41
-; GCN-NEXT: v_writelane_b32 v1, s58, 42
-; GCN-NEXT: v_writelane_b32 v1, s59, 43
-; GCN-NEXT: v_writelane_b32 v1, s60, 44
-; GCN-NEXT: v_writelane_b32 v1, s61, 45
-; GCN-NEXT: v_writelane_b32 v1, s62, 46
-; GCN-NEXT: v_writelane_b32 v1, s63, 47
-; GCN-NEXT: v_writelane_b32 v1, s64, 48
-; GCN-NEXT: v_writelane_b32 v1, s65, 49
-; GCN-NEXT: v_writelane_b32 v1, s66, 50
-; GCN-NEXT: v_writelane_b32 v1, s67, 51
-; GCN-NEXT: v_writelane_b32 v1, s68, 52
-; GCN-NEXT: v_writelane_b32 v1, s69, 53
-; GCN-NEXT: v_writelane_b32 v1, s70, 54
-; GCN-NEXT: v_writelane_b32 v1, s71, 55
-; GCN-NEXT: v_writelane_b32 v1, s72, 56
-; GCN-NEXT: v_writelane_b32 v1, s73, 57
-; GCN-NEXT: v_writelane_b32 v1, s74, 58
-; GCN-NEXT: v_writelane_b32 v1, s75, 59
-; GCN-NEXT: v_writelane_b32 v1, s76, 60
-; GCN-NEXT: v_writelane_b32 v1, s77, 61
-; GCN-NEXT: v_writelane_b32 v1, s78, 62
-; GCN-NEXT: v_writelane_b32 v1, s79, 63
-; GCN-NEXT: v_writelane_b32 v2, s80, 0
-; GCN-NEXT: v_writelane_b32 v2, s81, 1
-; GCN-NEXT: v_writelane_b32 v2, s82, 2
-; GCN-NEXT: v_writelane_b32 v2, s83, 3
-; GCN-NEXT: v_writelane_b32 v2, s84, 4
-; GCN-NEXT: v_writelane_b32 v2, s85, 5
-; GCN-NEXT: v_writelane_b32 v2, s86, 6
-; GCN-NEXT: v_writelane_b32 v2, s87, 7
+; GCN-NEXT: v_writelane_b32 v0, s84, 56
+; GCN-NEXT: v_writelane_b32 v0, s85, 57
+; GCN-NEXT: v_writelane_b32 v0, s86, 58
+; GCN-NEXT: v_writelane_b32 v0, s87, 59
+; GCN-NEXT: v_writelane_b32 v0, s88, 60
+; GCN-NEXT: v_writelane_b32 v0, s89, 61
+; GCN-NEXT: v_writelane_b32 v0, s90, 62
+; GCN-NEXT: v_writelane_b32 v0, s91, 63
+; GCN-NEXT: v_writelane_b32 v1, s12, 0
+; GCN-NEXT: v_writelane_b32 v1, s13, 1
+; GCN-NEXT: v_writelane_b32 v1, s14, 2
+; GCN-NEXT: v_writelane_b32 v1, s15, 3
+; GCN-NEXT: v_writelane_b32 v1, s16, 4
+; GCN-NEXT: v_writelane_b32 v1, s17, 5
+; GCN-NEXT: v_writelane_b32 v1, s18, 6
+; GCN-NEXT: v_writelane_b32 v1, s19, 7
+; GCN-NEXT: v_writelane_b32 v1, s20, 8
+; GCN-NEXT: v_writelane_b32 v1, s21, 9
+; GCN-NEXT: v_writelane_b32 v1, s22, 10
+; GCN-NEXT: v_writelane_b32 v1, s23, 11
+; GCN-NEXT: v_writelane_b32 v1, s24, 12
+; GCN-NEXT: v_writelane_b32 v1, s25, 13
+; GCN-NEXT: v_writelane_b32 v1, s26, 14
+; GCN-NEXT: v_writelane_b32 v1, s27, 15
+; GCN-NEXT: v_writelane_b32 v1, s28, 16
+; GCN-NEXT: v_writelane_b32 v1, s29, 17
+; GCN-NEXT: v_writelane_b32 v1, s30, 18
+; GCN-NEXT: v_writelane_b32 v1, s31, 19
+; GCN-NEXT: v_writelane_b32 v1, s32, 20
+; GCN-NEXT: v_writelane_b32 v1, s33, 21
+; GCN-NEXT: v_writelane_b32 v1, s34, 22
+; GCN-NEXT: v_writelane_b32 v1, s35, 23
+; GCN-NEXT: v_writelane_b32 v1, s36, 24
+; GCN-NEXT: v_writelane_b32 v1, s37, 25
+; GCN-NEXT: v_writelane_b32 v1, s38, 26
+; GCN-NEXT: v_writelane_b32 v1, s39, 27
+; GCN-NEXT: v_writelane_b32 v1, s40, 28
+; GCN-NEXT: v_writelane_b32 v1, s41, 29
+; GCN-NEXT: v_writelane_b32 v1, s42, 30
+; GCN-NEXT: v_writelane_b32 v1, s43, 31
+; GCN-NEXT: v_writelane_b32 v1, s44, 32
+; GCN-NEXT: v_writelane_b32 v1, s45, 33
+; GCN-NEXT: v_writelane_b32 v1, s46, 34
+; GCN-NEXT: v_writelane_b32 v1, s47, 35
+; GCN-NEXT: v_writelane_b32 v1, s48, 36
+; GCN-NEXT: v_writelane_b32 v1, s49, 37
+; GCN-NEXT: v_writelane_b32 v1, s50, 38
+; GCN-NEXT: v_writelane_b32 v1, s51, 39
+; GCN-NEXT: v_writelane_b32 v1, s52, 40
+; GCN-NEXT: v_writelane_b32 v1, s53, 41
+; GCN-NEXT: v_writelane_b32 v1, s54, 42
+; GCN-NEXT: v_writelane_b32 v1, s55, 43
+; GCN-NEXT: v_writelane_b32 v1, s56, 44
+; GCN-NEXT: v_writelane_b32 v1, s57, 45
+; GCN-NEXT: v_writelane_b32 v1, s58, 46
+; GCN-NEXT: v_writelane_b32 v1, s59, 47
+; GCN-NEXT: v_writelane_b32 v1, s60, 48
+; GCN-NEXT: v_writelane_b32 v1, s61, 49
+; GCN-NEXT: v_writelane_b32 v1, s62, 50
+; GCN-NEXT: v_writelane_b32 v1, s63, 51
+; GCN-NEXT: v_writelane_b32 v1, s64, 52
+; GCN-NEXT: v_writelane_b32 v1, s65, 53
+; GCN-NEXT: v_writelane_b32 v1, s66, 54
+; GCN-NEXT: v_writelane_b32 v1, s67, 55
+; GCN-NEXT: v_writelane_b32 v1, s68, 56
+; GCN-NEXT: v_writelane_b32 v1, s69, 57
+; GCN-NEXT: v_writelane_b32 v1, s70, 58
+; GCN-NEXT: v_writelane_b32 v1, s71, 59
+; GCN-NEXT: v_writelane_b32 v1, s72, 60
+; GCN-NEXT: v_writelane_b32 v1, s73, 61
+; GCN-NEXT: v_writelane_b32 v1, s74, 62
+; GCN-NEXT: v_writelane_b32 v1, s75, 63
+; GCN-NEXT: v_writelane_b32 v2, s76, 0
+; GCN-NEXT: v_writelane_b32 v2, s77, 1
+; GCN-NEXT: v_writelane_b32 v2, s78, 2
+; GCN-NEXT: v_writelane_b32 v2, s79, 3
+; GCN-NEXT: v_writelane_b32 v2, s80, 4
+; GCN-NEXT: v_writelane_b32 v2, s81, 5
+; GCN-NEXT: v_writelane_b32 v2, s82, 6
+; GCN-NEXT: v_writelane_b32 v2, s83, 7
 ; GCN: s_cbranch_scc1
 
 
@@ -393,24 +393,25 @@
 ; into the next available VGPR.
 
 ; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
-; GCN: def s[24:39]
+; GCN: def s[4:19]
+; GCN: def s[20:35]
 
-; GCN: v_writelane_b32 v0, s24, 50
-; GCN-NEXT: v_writelane_b32 v0, s25, 51
-; GCN-NEXT: v_writelane_b32 v0, s26, 52
-; GCN-NEXT: v_writelane_b32 v0, s27, 53
-; GCN-NEXT: v_writelane_b32 v0, s28, 54
-; GCN-NEXT: v_writelane_b32 v0, s29, 55
-; GCN-NEXT: v_writelane_b32 v0, s30, 56
-; GCN-NEXT: v_writelane_b32 v0, s31, 57
-; GCN-NEXT: v_writelane_b32 v0, s32, 58
-; GCN-NEXT: v_writelane_b32 v0, s33, 59
-; GCN-NEXT: v_writelane_b32 v0, s34, 60
-; GCN-NEXT: v_writelane_b32 v0, s35, 61
-; GCN-NEXT: v_writelane_b32 v0, s36, 62
-; GCN-NEXT: v_writelane_b32 v0, s37, 63
-; GCN-NEXT: v_writelane_b32 v1, s38, 0
-; GCN-NEXT: v_writelane_b32 v1, s39, 1
+; GCN: v_writelane_b32 v0, s4, 50
+; GCN-NEXT: v_writelane_b32 v0, s5, 51
+; GCN-NEXT: v_writelane_b32 v0, s6, 52
+; GCN-NEXT: v_writelane_b32 v0, s7, 53
+; GCN-NEXT: v_writelane_b32 v0, s8, 54
+; GCN-NEXT: v_writelane_b32 v0, s9, 55
+; GCN-NEXT: v_writelane_b32 v0, s10, 56
+; GCN-NEXT: v_writelane_b32 v0, s11, 57
+; GCN-NEXT: v_writelane_b32 v0, s12, 58
+; GCN-NEXT: v_writelane_b32 v0, s13, 59
+; GCN-NEXT: v_writelane_b32 v0, s14, 60
+; GCN-NEXT: v_writelane_b32 v0, s15, 61
+; GCN-NEXT: v_writelane_b32 v0, s16, 62
+; GCN-NEXT: v_writelane_b32 v0, s17, 63
+; GCN-NEXT: v_writelane_b32 v1, s18, 0
+; GCN-NEXT: v_writelane_b32 v1, s19, 1
 
 ; GCN: v_readlane_b32 s4, v0, 50
 ; GCN-NEXT: v_readlane_b32 s5, v0, 51
diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
index 7650873..aae8e64 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -40,8 +40,7 @@
 
 ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
 ; GCN: s_load_dword s
-; GCN-NEXT: s_load_dword s
-; GCN-NEXT: s_load_dword s
+; GCN-NEXT: s_load_dwordx2 s
 ; GCN-NOT: {{buffer|flat|global}}
 
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index b6d6006..f57ca3a 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -203,8 +203,11 @@
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out) {
+  %a = load volatile i16, i16 addrspace(1)* undef
+  %b = load volatile i16, i16 addrspace(1)* undef
+  %c = load volatile i16, i16 addrspace(1)* undef
   %icmp0 = icmp ugt i16 %a, %b
   %sub0 = sub i16 %a, %b
   %sub1 = sub i16 %b, %a
@@ -233,8 +236,31 @@
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
+  %a = load volatile i8, i8 addrspace(1)* undef
+  %b = load volatile i8, i8 addrspace(1)* undef
+  %c = load volatile i8, i8 addrspace(1)* undef
+  %icmp0 = icmp ugt i8 %a, %b
+  %sub0 = sub i8 %a, %b
+  %sub1 = sub i8 %b, %a
+  %ret0 = select i1 %icmp0, i8 %sub0, i8 %sub1
+
+  %ret = add i8 %ret0, %c
+
+  store i8 %ret, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
+; GCN: s_load_dword
+; GCN: s_bfe_u32
+; GCN: s_sub_i32
+; GCN: s_and_b32
+; GCN: s_sub_i32
+; GCN: s_lshr_b32
+; GCN: v_add_i32_e32
+define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %sub0 = sub i8 %a, %b
   %sub1 = sub i8 %b, %a
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index 118bb9a..0645bb4 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -2,14 +2,11 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
 
 ; FUNC-LABEL: {{^}}cluster_arg_loads:
-; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
-; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
+; SI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+
+; VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
 define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
   store i32 %x, i32 addrspace(1)* %out0, align 4
   store i32 %y, i32 addrspace(1)* %out1, align 4
@@ -42,7 +39,7 @@
     i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119,
     i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) {
 entry:
-  %value = add i64 %arg125, %arg126
+  %value = add i64 %arg124, %arg126
   store i64 %value, i64 addrspace(1)* %out, align 8
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 2072cf5..ee32dce 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -1,10 +1,13 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s
 
-; SI: NumSgprs: {{[1-9]$}}
-; SI: NumVgprs: {{[1-9]$}}
+; SI-MINREG: NumSgprs: {{[1-9]$}}
+; SI-MINREG: NumVgprs: {{[1-9]$}}
+
+; SI-MAXOCC: NumSgprs: {{[0-4][0-9]$}}
+; SI-MAXOCC: NumVgprs: {{[0-4][0-9]$}}
 
 ; stores may alias loads
 ; VI: NumSgprs: {{[0-9]$}}
diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll
index f784e22..8e60aa1 100644
--- a/llvm/test/CodeGen/AMDGPU/select-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; FIXME: This should go in the existing select.ll test, except the current testcase there is broken on GCN
 
@@ -18,12 +18,12 @@
 ; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
 ; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
 ; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
-; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
-; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
+; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
+; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
 ; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
-define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
+define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
   %cmp = icmp slt i1 %cond, false
   %sel = select i1 %cmp, i1 %a, i1 %b
   store i1 %sel, i1 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 540eb9c..33028f1 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -134,8 +134,9 @@
 }
 
 ; GCN-LABEL: {{^}}regression:
-; GCN: v_cmp_neq_f32_e64
-; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0
+; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
+; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
+; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 9030baa..7d1f75a 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -24,10 +24,10 @@
     half addrspace(1)* %c,
     half addrspace(1)* %d) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
-  %c.val = load half, half addrspace(1)* %c
-  %d.val = load half, half addrspace(1)* %d
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
+  %c.val = load volatile half, half addrspace(1)* %c
+  %d.val = load volatile half, half addrspace(1)* %d
   %fcmp = fcmp olt half %a.val, %b.val
   %r.val = select i1 %fcmp, half %c.val, half %d.val
   store half %r.val, half addrspace(1)* %r
@@ -54,9 +54,9 @@
     half addrspace(1)* %c,
     half addrspace(1)* %d) {
 entry:
-  %b.val = load half, half addrspace(1)* %b
-  %c.val = load half, half addrspace(1)* %c
-  %d.val = load half, half addrspace(1)* %d
+  %b.val = load volatile half, half addrspace(1)* %b
+  %c.val = load volatile half, half addrspace(1)* %c
+  %d.val = load volatile half, half addrspace(1)* %d
   %fcmp = fcmp olt half 0xH3800, %b.val
   %r.val = select i1 %fcmp, half %c.val, half %d.val
   store half %r.val, half addrspace(1)* %r
@@ -84,9 +84,9 @@
     half addrspace(1)* %c,
     half addrspace(1)* %d) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %c.val = load half, half addrspace(1)* %c
-  %d.val = load half, half addrspace(1)* %d
+  %a.val = load volatile half, half addrspace(1)* %a
+  %c.val = load volatile half, half addrspace(1)* %c
+  %d.val = load volatile half, half addrspace(1)* %d
   %fcmp = fcmp olt half %a.val, 0xH3800
   %r.val = select i1 %fcmp, half %c.val, half %d.val
   store half %r.val, half addrspace(1)* %r
@@ -115,9 +115,9 @@
     half addrspace(1)* %b,
     half addrspace(1)* %d) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
-  %d.val = load half, half addrspace(1)* %d
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
+  %d.val = load volatile half, half addrspace(1)* %d
   %fcmp = fcmp olt half %a.val, %b.val
   %r.val = select i1 %fcmp, half 0xH3800, half %d.val
   store half %r.val, half addrspace(1)* %r
@@ -145,9 +145,9 @@
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
-  %c.val = load half, half addrspace(1)* %c
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
+  %c.val = load volatile half, half addrspace(1)* %c
   %fcmp = fcmp olt half %a.val, %b.val
   %r.val = select i1 %fcmp, half %c.val, half 0xH3800
   store half %r.val, half addrspace(1)* %r
@@ -197,10 +197,10 @@
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 
-; SI: v_cmp_lt_f32_e32 vcc, 0.5
-; SI: v_cndmask_b32_e32
 ; SI: v_cmp_gt_f32_e32
 ; SI: v_cndmask_b32_e32
+; SI: v_cmp_lt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
 
 ; VI: v_cmp_lt_f16_e32
 ; VI: v_cndmask_b32_e32
@@ -233,10 +233,10 @@
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 
-; SI: v_cmp_gt_f32_e32 vcc, 0.5
-; SI: v_cndmask_b32_e32
 ; SI: v_cmp_lt_f32_e32
 ; SI: v_cndmask_b32_e32
+; SI: v_cmp_gt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
 
 ; VI: v_cmp_gt_f16_e32
 ; VI: v_cndmask_b32_e32
@@ -272,7 +272,7 @@
 ; SI: v_cmp_nlt_f32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cmp_nlt_f32_e32
-; SI: v_cndmask_b32_e32
+; SI-DAG: v_cndmask_b32_e32
 
 ; VI: v_cmp_nlt_f16_e32
 ; VI: v_cndmask_b32_e32
@@ -280,8 +280,8 @@
 ; VI: v_cmp_nlt_f16_e32
 ; VI: v_cndmask_b32_e32
 
-; SI:  v_cvt_f16_f32_e32
-; SI:  v_cvt_f16_f32_e32
+; SI-DAG: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
 ; GCN: s_endpgm
 define amdgpu_kernel void @select_v2f16_imm_c(
     <2 x half> addrspace(1)* %r,
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
index caddb6f..a0c5f3b 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -180,16 +180,14 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg:
-; GCN: s_load_dword [[B:s[0-9]+]]
-; GCN: v_cmp_ne_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[B]], -1{{$}}
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
-; GCN: s_endpgm
-define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
+; FUNC-LABEL: {{^}}v_cmp_sext_k_neg1_i8_sext_arg:
+; GCN: v_cmp_ne_u32_e32 vcc, -1, v0
+; GCN-NEXT: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, 1, vcc
+; GCN: buffer_store_byte [[SELECT]]
+define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
-  store i1 %icmp0, i1 addrspace(1)* %out
+  store i1 %icmp0, i1 addrspace(1)* undef
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 575938b..a1bd5b0 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -38,37 +38,37 @@
 ; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]]
 
 ; SI: ; %bb.1: ; %else
-; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xe
-; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xf
+; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
+; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2e
 ; SI-NOT: add
 ; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; SI: [[IF]]: ; %if
-; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
 ; SI-NOT: add
 
 ; SI: [[ENDIF]]: ; %endif
 ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
 ; SI: buffer_store_dword
 ; SI-NEXT: s_endpgm
-define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
 entry:
-  %0 = icmp eq i32 %a, 0
-  br i1 %0, label %if, label %else
+  %cmp0 = icmp eq i32 %a, 0
+  br i1 %cmp0, label %if, label %else
 
 if:
-  %1 = add i32 %b, %c
+  %add0 = add i32 %b, %c
   br label %endif
 
 else:
-  %2 = add i32 %d, %e
+  %add1 = add i32 %d, %e
   br label %endif
 
 endif:
-  %3 = phi i32 [%1, %if], [%2, %else]
-  %4 = add i32 %3, %a
-  store i32 %4, i32 addrspace(1)* %out
+  %phi = phi i32 [%add0, %if], [%add1, %else]
+  %add2 = add i32 %phi, %a
+  store i32 %add2, i32 addrspace(1)* %out
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 14f4646..ade2803 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -275,11 +275,11 @@
 
 ; Make sure load width gets reduced to i32 load.
 ; GCN-LABEL: {{^}}s_shl_32_i64:
-; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
+; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %result = shl i64 %a, 32
   store i64 %result, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 083ec9e..ce6125d 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -14,10 +14,12 @@
 ; VI: s_lshr_b32
 ; VI: s_and_b32
 ; VI: s_and_b32
+; VI: s_lshl_b32
+; VI: s_lshl_b32
+; VI: s_lshl_b32
 ; VI: s_and_b32
 ; VI: s_or_b32
 
-
 ; CI: s_load_dword s
 ; CI: s_load_dword s
 ; CI: s_lshr_b32
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
index 26cdae3..5270de2 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -54,14 +54,13 @@
 }
 
 ; FUNC-LABEL: {{^}}test_add_shl_add_constant:
-; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
-; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], [[Y]]
+; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI-DAG: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], s[[Y]]
 ; SI: s_addk_i32 [[RESULT]], 0x3d8
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
 ; SI: buffer_store_dword [[VRESULT]]
-define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 {
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %shl, %y
@@ -70,15 +69,14 @@
 }
 
 ; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv:
-; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
-; SI: s_add_i32 [[TMP:s[0-9]+]], [[Y]], [[SHL3]]
+; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
+; SI: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3
+; SI: s_add_i32 [[TMP:s[0-9]+]], s[[Y]], [[SHL3]]
 ; SI: s_addk_i32 [[TMP]], 0x3d8
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]]
 ; SI: buffer_store_dword [[VRESULT]]
 
-define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, [8 x i32], i32 %x, i32 %y) #0 {
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %y, %shl
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 429493c..dfdd775 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -55,6 +55,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_sext_i16_to_i64:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
 define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
   %sext = sext i16 %a to i64
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index 9e10f04..22f11da 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -370,7 +370,7 @@
 ; GCN: s_sext_i32_i16
 ; GCN: s_sext_i32_i16
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 {
 bb:
   %tmp0 = call i16 @smin16(i16 %x, i16 %y)
   %tmp1 = call i16 @smax16(i16 %x, i16 %y)
@@ -385,7 +385,7 @@
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 {
 bb:
   %tmp0 = call i8 @smin8(i8 %x, i8 %y)
   %tmp1 = call i8 @smax8(i8 %x, i8 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll
index 96a318f..167a179 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll
@@ -193,7 +193,7 @@
 
 ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
 ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
-define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
+define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], i32 %val0, [8 x i32], i32 %val1) nounwind {
   %cond0 = icmp sgt i32 %val0, %val1
   %sel0 = select i1 %cond0, i32 %val0, i32 %val1
   %sel1 = select i1 %cond0, i32 %val1, i32 %val0
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 6529bf4..598048d 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -8,34 +8,15 @@
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
 
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; VI: s_sub_i32
-; VI: s_sub_i32
-; VI: s_max_i32
-; VI: s_max_i32
-; SI: s_add_i32
-; SI: s_add_i32
-; SI: s_and_b32
-; SI: s_or_b32
-
-; CI-NOT: {{buffer|flat}}_load
-; CI: s_load_dword s
-; CI-NOT: {{buffer|flat}}_load
-; CI: s_lshr_b32
-; CI: s_ashr_i32
-; CI: s_sext_i32_i16
-; CI: s_sub_i32
-; CI: s_sub_i32
-; CI: s_sext_i32_i16
-; CI: s_sext_i32_i16
-; CI: s_max_i32
-; CI: s_max_i32
-; CI: s_lshl_b32
-; CI: s_add_i32
-; CI: s_add_i32
-; CI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff
-; CI: s_or_b32
-
+; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; CIVI: s_sub_i32
+; CIVI: s_sub_i32
+; CIVI: s_max_i32
+; CIVI: s_max_i32
+; CIVI: s_add_i32
+; CIVI: s_add_i32
+; CIVI: s_and_b32
+; CIVI: s_or_b32
 define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
   %neg = sub <2 x i16> zeroinitializer, %val
   %cond = icmp sgt <2 x i16> %val, %neg
@@ -61,6 +42,17 @@
 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]]  dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NOT: v_and_b32
 ; VI: v_or_b32_e32
+
+; CI: buffer_load_dword v
+; CI: v_lshrrev_b32_e32
+; CI: v_sub_i32_e32
+; CI: v_bfe_i32
+; CI: v_bfe_i32
+; CI: v_max_i32
+; CI: v_max_i32
+; CI: v_add_i32
+; CI: v_add_i32
+; CI: v_or_b32
 define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
@@ -115,10 +107,8 @@
 ; GFX9: s_load_dwordx2 s{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, s[0:1], 0x2c
 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[VAL0]]
 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]]
-
 ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]]
-
 ; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
 ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
 define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 8020b97..d16dcd9 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -46,10 +46,10 @@
 ; GCN-LABEL: {{^}}smrd3:
 ; FIXME: There are too many copies here because we don't fold immediates
 ;        through REG_SEQUENCE
-; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
+; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0x13 ; encoding: [0x13
 ; TODO: Add VI checks
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(4)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296
   %tmp1 = load i32, i32 addrspace(4)* %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index ee82737..36e332c5 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -231,11 +231,11 @@
 }
 
 ; GCN-LABEL: {{^}}s_ashr_32_i64:
-; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x14|0x50}}
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}}
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}}
-define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
   %result = ashr i64 %a, 32
   %add = add i64 %result, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -258,11 +258,11 @@
 }
 
 ; GCN-LABEL: {{^}}s_ashr_63_i64:
-; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x14|0x50}}
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
 ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
-define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
   %result = ashr i64 %a, 63
   %add = add i64 %result, %b
   store i64 %add, i64 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 8878b45..83940ea 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -189,11 +189,11 @@
 
 ; Make sure load width gets reduced to i32 load.
 ; GCN-LABEL: {{^}}s_lshr_32_i64:
-; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
+; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %result = lshr i64 %a, 32
   store i64 %result, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 280ce62..5bcf78f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -18,9 +18,9 @@
 }
 
 ; GCN-LABEL: {{^}}local_store_i55:
-; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+$}}
 
 ; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
 ; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 908d13e..fb1cbe7 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -6,9 +6,9 @@
 declare i32 @llvm.r600.read.tidig.x() readnone
 
 ; FUNC-LABEL: {{^}}s_sub_i32:
-; GCN: s_load_dword [[A:s[0-9]+]]
-; GCN: s_load_dword [[B:s[0-9]+]]
-; GCN: s_sub_i32 s{{[0-9]+}}, [[A]], [[B]]
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
+; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]]
 define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = sub i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 6a9d8ea..02c847e 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -4,20 +4,23 @@
 target triple="amdgcn--"
 
 ; CHECK-LABEL: foobar:
-; CHECK: s_load_dword s2, s[0:1], 0x9
-; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK: v_mbcnt_lo_u32_b32_e64
+; CHECK: s_load_dwordx2 s[2:3], s[0:1], 0x9
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; BB0_1:
-; CHECK: s_load_dword s0, s[0:1], 0xa
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; BB0_2:
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
+
+; CHECK: BB0_1:
+; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr2_sgpr3 killed $exec
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+
+; CHECK: BB0_2:
 ; CHECK: s_or_b64 exec, exec, s[2:3]
-; CHECK-NEXT: s_mov_b32 s7, 0xf000
-; CHECK-NEXT: s_mov_b32 s6, -1
-; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; CHECK-NEXT: s_mov_b32 s3, 0xf000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0
 ; CHECK-NEXT: s_endpgm
 define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
index 4ea2352..112c499 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -1,37 +1,38 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 
-; SI-LABEL: {{^}}global_truncstore_i32_to_i1:
-; SI: s_load_dword [[LOAD:s[0-9]+]],
-; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: buffer_store_byte [[VREG]],
+; GCN-LABEL: {{^}}global_truncstore_i32_to_i1:
+; GCN: s_load_dword [[LOAD:s[0-9]+]],
+; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; GCN: buffer_store_byte [[VREG]],
 define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
   %trunc = trunc i32 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
-; SI: buffer_store_byte
+; GCN-LABEL: {{^}}global_truncstore_i64_to_i1:
+; GCN: buffer_store_byte
 define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
   %trunc = trunc i64 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1:
-; SI: s_load_dword [[LOAD:s[0-9]+]],
-; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: buffer_store_byte [[VREG]],
+; FIXME: VGPR on VI
+; GCN-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1:
+; GCN: s_load_dword [[LOAD:s[0-9]+]],
+; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; GCN: buffer_store_byte [[VREG]],
 define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
   %trunc = trunc i16 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
-; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
+; GCN-LABEL: {{^}}global_truncstore_i16_to_i1:
 define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
   %add = add i16 %val0, %val1
   %trunc = trunc i16 %add to i1
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index b390958..7c609d7 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -1,10 +1,10 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI  %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, [8 x i32], i64 %in) {
 ; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
 ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1],
 ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
@@ -28,7 +28,7 @@
 ; SI: buffer_store_dword [[VSHL]]
 ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
 
-define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, [8 x i32], i64 %a) {
   %b = shl i64 %a, 2
   %result = trunc i64 %b to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -94,12 +94,12 @@
 }
 
 ; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
-; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
 ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
-define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
   store i32 %sel, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index b85c6bf..acdf60d 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -51,7 +51,7 @@
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-NOT: v_and_b32
 ; SI: s_endpgm
-define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
+define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) {
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out0
   %result1 = urem i32 %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 350be19..071fc07 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -368,7 +368,7 @@
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 {
 bb:
   %tmp0 = call i16 @umin16(i16 %x, i16 %y)
   %tmp1 = call i16 @umax16(i16 %x, i16 %y)
@@ -383,7 +383,7 @@
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 {
 bb:
   %tmp0 = call i8 @umin8(i8 %x, i8 %y)
   %tmp1 = call i8 @umax8(i8 %x, i8 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
index e5fe0bd..d30aeb5 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -442,7 +442,7 @@
 ; ALIGNED: buffer_load_ushort
 ; ALIGNED: buffer_load_ushort
 
-; UNALIGNED: s_load_dwordx2
+; UNALIGNED: s_load_dwordx4
 ; UNALIGNED: buffer_store_dwordx2
 define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(4)* %p, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 33a4202..b5435ef 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -195,7 +195,7 @@
 
 ; GCN-LABEL: {{^}}uniform_if_else:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2
 ; GCN: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
@@ -248,10 +248,10 @@
 }
 
 ; GCN-LABEL: {{^}}icmp_users_different_blocks:
-; GCN: s_load_dword [[COND:s[0-9]+]]
-; GCN: s_cmp_lt_i32 [[COND]], 1
+; GCN: s_load_dwordx2 s{{\[}}[[COND0:[0-9]+]]:[[COND1:[0-9]+]]{{\]}}
+; GCN: s_cmp_lt_i32 s[[COND0]], 1
 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
-; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, [[COND]], 0{{$}}
+; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, s[[COND1]], 0{{$}}
 ; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]]
 ; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
@@ -432,8 +432,7 @@
 ; GCN-LABEL: {{^}}uniform_if_scc_i64_eq:
 ; VI-DAG: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
 ; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
-
-; SI: v_cmp_eq_u64_e64
+; SI-DAG: v_cmp_eq_u64_e64
 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
@@ -465,7 +464,7 @@
 ; VI-DAG: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
 ; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 
-; SI: v_cmp_ne_u64_e64
+; SI-DAG: v_cmp_ne_u64_e64
 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
@@ -494,8 +493,8 @@
 }
 
 ; GCN-LABEL: {{^}}uniform_if_scc_i64_sgt:
-; GCN: s_mov_b32 [[S_VAL:s[0-9]+]], 0
-; GCN: v_cmp_gt_i64_e64
+; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
+; GCN-DAG: v_cmp_gt_i64_e64
 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 3cce2c9..9e8c47b 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -28,12 +28,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
-; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], s[[SGPR0]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
@@ -42,8 +40,7 @@
 }
 
 ; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI: buffer_load_dword [[VA0:v[0-9]+]]
 ; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
 
@@ -53,11 +50,11 @@
 ; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
 
 ; GCN-NOT: v_mov_b32
-; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]]
 ; GCN-NOT: v_mov_b32
 
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SA]], [[VA0]], [[VB]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SA]], [[VA1]], [[VB]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
@@ -71,12 +68,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
-; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], [[VGPR1]], s[[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
@@ -85,12 +80,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
-; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], s[[SGPR0]], s[[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
@@ -152,11 +145,11 @@
 }
 
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR0]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], [[SGPR1]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], s[[SGPR0]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VK]], [[VK]], s[[SGPR1]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
@@ -180,11 +173,11 @@
 }
 
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
@@ -208,11 +201,11 @@
 }
 
 ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VK]], [[VK]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR1]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VK]], [[VK]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR1]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
@@ -225,14 +218,14 @@
 }
 
 ; GCN-LABEL: {{^}}test_s0_s1_k_f32:
-; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
-; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
+; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]]
 
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]]
 ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]]
 
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
@@ -246,8 +239,8 @@
 
 ; FIXME: Immediate in SGPRs just copied to VGPRs
 ; GCN-LABEL: {{^}}test_s0_s1_k_f64:
-; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
+; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x1d|0x74}}
 ; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000
 ; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}}
 
@@ -261,7 +254,7 @@
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
-define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) #0 {
   %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
   %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
   store volatile double %fma0, double addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index bd7738e..53ef9d5 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -42,12 +42,12 @@
 ; (select (cmp (sgprX, constant)), constant, sgprZ)
 
 ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
-; GCN: s_load_dword [[X:s[0-9]+]]
-; GCN: s_load_dword [[Z:s[0-9]+]]
-; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}
+; GCN-DAG: v_cmp_nlg_f32_e64 vcc, s[[X]], 0
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -73,12 +73,11 @@
 }
 
 ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
-; GCN-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[Z:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
+; GCN-DAG: v_cmp_nlg_f32_e64 vcc, s[[X]], 0
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -315,10 +314,10 @@
 
 ; Different types compared vs. selected
 ; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
-; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dwordx2
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
+; GCN-DAG: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2
 
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index fe5e5a1..bf9ae67 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -350,8 +350,8 @@
 
 ; GCN-LABEL: {{^}}mac_v2f16_same_add:
 ; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI-DAG:  v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -390,8 +390,8 @@
 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
 
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -418,8 +418,8 @@
 ; GCN-LABEL: {{^}}mac_v2f16_neg_b
 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 
 ; VI-NOT: v_mac_f16
@@ -452,8 +452,8 @@
 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
 
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]]
-; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]]
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index ab47cc9..237c186 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -35,9 +35,9 @@
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
-  %c.val = load half, half addrspace(1)* %c
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
+  %c.val = load volatile half, half addrspace(1)* %c
 
   %t0.val = fmul half %a.val, %b.val
   %t1.val = fmul half %a.val, %c.val
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 8dab6fa..90b5519 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -14,7 +14,7 @@
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
 
-; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock4
 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
 ; SI: s_and_saveexec_b64
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 847a1d7..df32706 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -1,6 +1,6 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}xor_v2i32:
@@ -40,9 +40,9 @@
 ; FUNC-LABEL: {{^}}xor_i1:
 ; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}}
 
-; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
-; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}
-; SI: s_xor_b64 [[XOR:vcc]], [[CMP0]], [[CMP1]]
+; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}}
+; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI: s_xor_b64 [[XOR:vcc]], [[CMP1]], [[CMP0]]
 ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
@@ -173,26 +173,26 @@
 }
 
 ; FUNC-LABEL: {{^}}scalar_xor_literal_i64:
-; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
-; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
+; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s{{[0-9]+}}, 0xf237b
+; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s{{[0-9]+}}, 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64:
-; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
 ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
 ; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
 
@@ -202,25 +202,25 @@
 }
 
 ; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64:
-; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; SI-NOT: xor_b32
-; SI: s_xor_b32 s[[VAL_LO]], s[[VAL_LO]], 63
+; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63
 ; SI-NOT: xor_b32
-; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[VAL_LO]]
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s{{[0-9]+}}
 ; SI-NOT: xor_b32
-; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
+; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s{{[0-9]+}}
 ; SI-NOT: xor_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %or = xor i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; SI: s_xor_b64 [[VAL]], [[VAL]], -8
-define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
+; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -8
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
   %or = xor i64 %a, -8
   store i64 %or, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index f256d89..ee9bbb6 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
 
 ; R600: {{^}}s_mad_zext_i32_to_i64:
 ; R600: MEM_RAT_CACHELESS STORE_RAW
 ; R600: MEM_RAT_CACHELESS STORE_RAW
 
-; SI: {{^}}s_mad_zext_i32_to_i64:
-; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
-; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
+; GCN: {{^}}s_mad_zext_i32_to_i64:
+; GCN: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
+; GCN: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
 define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %tmp0 = mul i32 %a, %b
@@ -18,8 +18,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32
-; SI: v_cndmask_b32
+; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i32:
+; GCN: v_cndmask_b32
 define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %tmp0 = icmp eq i32 %a, %b
@@ -28,17 +28,17 @@
   ret void
 }
 
-; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
+; GCN-LABEL: {{^}}s_arg_zext_i1_to_i64:
 define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
   %ext = zext i1 %arg to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64:
-; SI: s_mov_b32 s{{[0-9]+}}, 0
-; SI: v_cmp_eq_u32
-; SI: v_cndmask_b32
+; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:
+; GCN: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: v_cmp_eq_u32
+; GCN: v_cndmask_b32
 define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
@@ -46,10 +46,20 @@
   ret void
 }
 
-; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI: buffer_store_short [[RESULT]]
-define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+; FIXME: Why different commute?
+; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i16:
+; GCN: s_load_dword [[A:s[0-9]+]]
+; GCN: s_load_dword [[B:s[0-9]+]]
+
+; SI: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
+; SI: v_cmp_eq_u32_e32 vcc, [[B]], [[V_A]]
+
+; VI: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
+; VI: v_cmp_eq_u32_e32 vcc, [[A]], [[V_B]]
+
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: buffer_store_short [[RESULT]]
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = zext i1 %tmp0 to i16
   store i16 %tmp1, i16 addrspace(1)* %out