AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel

Currently, functions with the default C calling convention are treated
the same as compute kernels. Make this explicit so the default
calling convention can later be changed to mean a non-kernel function.
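
As a minimal sketch of the distinction being made explicit (the
function names below are illustrative, not from this patch):

  ; explicit kernel entry point, launched by the runtime
  define amdgpu_kernel void @example_kernel(i32 addrspace(1)* %out) {
    store i32 0, i32 addrspace(1)* %out
    ret void
  }

  ; default C calling convention; after this change it is free to
  ; become an ordinary callable (non-kernel) function
  define void @example_func(i32 addrspace(1)* %out) {
    store i32 1, i32 addrspace(1)* %out
    ret void
  }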

Converted with perl -pi -e 's/define void/define amdgpu_kernel void/'
on the relevant test directories (and undoing it in the one place that
actually wanted a non-kernel).
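
A sketch of the full invocation (the directory list is inferred from
the hunks below and is illustrative, not exhaustive):

  # rewrite default-CC definitions in .ll and .mir tests in place
  find llvm/test/Analysis/CostModel/AMDGPU \
       llvm/test/Analysis/DivergenceAnalysis/AMDGPU \
       llvm/test/CodeGen/AMDGPU \
       -type f \( -name '*.ll' -o -name '*.mir' \) \
       -exec perl -pi -e 's/define void/define amdgpu_kernel void/' {} +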

llvm-svn: 298444
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 76b21d2..6419eb1 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -3,7 +3,7 @@
 
 ; CHECK: 'add_i32'
 ; CHECK: estimated cost of 1 for {{.*}} add i32
-define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %add = add i32 %vec, %b
   store i32 %add, i32 addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; CHECK: 'add_v2i32'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
-define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %add = add <2 x i32> %vec, %b
   store <2 x i32> %add, <2 x i32> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; CHECK: 'add_v3i32'
 ; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
-define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %add = add <3 x i32> %vec, %b
   store <3 x i32> %add, <3 x i32> addrspace(1)* %out
@@ -30,7 +30,7 @@
 
 ; CHECK: 'add_v4i32'
 ; CHECK: estimated cost of 4 for {{.*}} add <4 x i32>
-define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %add = add <4 x i32> %vec, %b
   store <4 x i32> %add, <4 x i32> addrspace(1)* %out
@@ -39,7 +39,7 @@
 
 ; CHECK: 'add_i64'
 ; CHECK: estimated cost of 2 for {{.*}} add i64
-define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %add = add i64 %vec, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -48,7 +48,7 @@
 
 ; CHECK: 'add_v2i64'
 ; CHECK: estimated cost of 4 for {{.*}} add <2 x i64>
-define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %add = add <2 x i64> %vec, %b
   store <2 x i64> %add, <2 x i64> addrspace(1)* %out
@@ -57,7 +57,7 @@
 
 ; CHECK: 'add_v3i64'
 ; CHECK: estimated cost of 6 for {{.*}} add <3 x i64>
-define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %add = add <3 x i64> %vec, %b
   store <3 x i64> %add, <3 x i64> addrspace(1)* %out
@@ -66,7 +66,7 @@
 
 ; CHECK: 'add_v4i64'
 ; CHECK: estimated cost of 8 for {{.*}} add <4 x i64>
-define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %add = add <4 x i64> %vec, %b
   store <4 x i64> %add, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@
 
 ; CHECK: 'add_v16i64'
 ; CHECK: estimated cost of 32 for {{.*}} add <16 x i64>
-define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
+define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
   %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
   %add = add <16 x i64> %vec, %b
   store <16 x i64> %add, <16 x i64> addrspace(1)* %out
@@ -84,7 +84,7 @@
 
 ; CHECK: 'add_i16'
 ; CHECK: estimated cost of 1 for {{.*}} add i16
-define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %add = add i16 %vec, %b
   store i16 %add, i16 addrspace(1)* %out
@@ -93,7 +93,7 @@
 
 ; CHECK: 'add_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i16>
-define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %add = add <2 x i16> %vec, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -102,7 +102,7 @@
 
 ; CHECK: 'sub_i32'
 ; CHECK: estimated cost of 1 for {{.*}} sub i32
-define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %sub = sub i32 %vec, %b
   store i32 %sub, i32 addrspace(1)* %out
@@ -111,7 +111,7 @@
 
 ; CHECK: 'sub_i64'
 ; CHECK: estimated cost of 2 for {{.*}} sub i64
-define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %sub = sub i64 %vec, %b
   store i64 %sub, i64 addrspace(1)* %out
@@ -119,7 +119,7 @@
 }
 ; CHECK: 'sub_i16'
 ; CHECK: estimated cost of 1 for {{.*}} sub i16
-define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %sub = sub i16 %vec, %b
   store i16 %sub, i16 addrspace(1)* %out
@@ -128,7 +128,7 @@
 
 ; CHECK: 'sub_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} sub <2 x i16>
-define void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %sub = sub <2 x i16> %vec, %b
   store <2 x i16> %sub, <2 x i16> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
index a809dbd..aa70f50 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'or_i32'
 ; CHECK: estimated cost of 1 for {{.*}} or i32
-define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = or i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; CHECK: 'or_i64'
 ; CHECK: estimated cost of 2 for {{.*}} or i64
-define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = or i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; CHECK: 'xor_i32'
 ; CHECK: estimated cost of 1 for {{.*}} xor i32
-define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = xor i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; CHECK: 'xor_i64'
 ; CHECK: estimated cost of 2 for {{.*}} xor i64
-define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = xor i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -39,7 +39,7 @@
 
 ; CHECK: 'and_i32'
 ; CHECK: estimated cost of 1 for {{.*}} and i32
-define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = and i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@
 
 ; CHECK: 'and_i64'
 ; CHECK: estimated cost of 2 for {{.*}} and i64
-define void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = and i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/br.ll b/llvm/test/Analysis/CostModel/AMDGPU/br.ll
index 0b96493..494f8d2 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/br.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/br.ll
@@ -4,7 +4,7 @@
 ; CHECK: estimated cost of 10 for instruction: br i1
 ; CHECK: estimated cost of 10 for instruction: br label
 ; CHECK: estimated cost of 10 for instruction: ret void
-define void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 bb0:
   br i1 undef, label %bb1, label %bb2
 
@@ -21,7 +21,7 @@
 
 ; CHECK: 'test_switch_cost'
 ; CHECK: Unknown cost for instruction:   switch
-define void @test_switch_cost(i32 %a) #0 {
+define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
 entry:
   switch i32 %a, label %default [
     i32 0, label %case0
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll
index c328d76..1efbb58 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'extractelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
-define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %elt = extractelement <2 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; CHECK: 'extractelement_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
-define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %elt = extractelement <2 x float> %vec, i32 1
   store float %elt, float addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; CHECK: 'extractelement_v3i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
-define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %elt = extractelement <3 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; CHECK: 'extractelement_v4i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
-define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %elt = extractelement <4 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@
 
 ; CHECK: 'extractelement_v8i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
   %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
   %elt = extractelement <8 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@
 ; FIXME: Should be non-0
 ; CHECK: 'extractelement_v8i32_dynindex'
 ; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
+define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
   %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
   %elt = extractelement <8 x i32> %vec, i32 %idx
   store i32 %elt, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@
 
 ; CHECK: 'extractelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
-define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %elt = extractelement <2 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -66,7 +66,7 @@
 
 ; CHECK: 'extractelement_v3i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
-define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %elt = extractelement <3 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -75,7 +75,7 @@
 
 ; CHECK: 'extractelement_v4i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
-define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %elt = extractelement <4 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@
 
 ; CHECK: 'extractelement_v8i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
-define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %elt = extractelement <8 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -93,7 +93,7 @@
 
 ; CHECK: 'extractelement_v4i8'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
-define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
   %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
   %elt = extractelement <4 x i8> %vec, i8 1
   store i8 %elt, i8 addrspace(1)* %out
@@ -102,7 +102,7 @@
 
 ; CHECK: 'extractelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
-define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %elt = extractelement <2 x i16> %vec, i16 1
   store i16 %elt, i16 addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
index 9c551ec..0d49e29 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'fabs_f32'
 ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
-define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %fabs = call float @llvm.fabs.f32(float %vec) #1
   store float %fabs, float addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; CHECK: 'fabs_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; CHECK: 'fabs_v3f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
-define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
   store <3 x float> %fabs, <3 x float> addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; CHECK: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
-define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %fabs = call double @llvm.fabs.f64(double %vec) #1
   store double %fabs, double addrspace(1)* %out
@@ -38,7 +38,7 @@
 
 ; CHECK: 'fabs_v2f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
   store <2 x double> %fabs, <2 x double> addrspace(1)* %out
@@ -47,7 +47,7 @@
 
 ; CHECK: 'fabs_v3f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
-define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
   store <3 x double> %fabs, <3 x double> addrspace(1)* %out
@@ -56,7 +56,7 @@
 
 ; CHECK: 'fabs_f16'
 ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
-define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %fabs = call half @llvm.fabs.f16(half %vec) #1
   store half %fabs, half addrspace(1)* %out
@@ -65,7 +65,7 @@
 
 ; CHECK: 'fabs_v2f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
-define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -74,7 +74,7 @@
 
 ; CHECK: 'fabs_v3f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
-define void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
   %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
   store <3 x half> %fabs, <3 x half> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index 00e91bd..d7ac735 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
-define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fadd float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; ALL: 'fadd_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fadd <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; ALL: 'fadd_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
-define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fadd <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@
 ; ALL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
-define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fadd double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@
 ; ALL: 'fadd_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
-define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fadd <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; ALL: 'fadd_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
-define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fadd <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@
 
 ; ALL 'fadd_f16'
 ; ALL estimated cost of 1 for {{.*}} fadd half
-define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fadd half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@
 
 ; ALL 'fadd_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fadd <2 x half>
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fadd <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@
 
 ; ALL 'fadd_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fadd <4 x half>
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fadd <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
index 3f37442..caa9bff 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -5,7 +5,7 @@
 
 ; CHECK: 'fdiv_f32'
 ; ALL: estimated cost of 10 for {{.*}} fdiv float
-define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fdiv float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -14,7 +14,7 @@
 
 ; ALL: 'fdiv_v2f32'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -23,7 +23,7 @@
 
 ; ALL: 'fdiv_v3f32'
 ; ALL: estimated cost of 30 for {{.*}} fdiv <3 x float>
-define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fdiv <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fdiv double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -47,7 +47,7 @@
 ; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
 ; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
 ; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
-define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fdiv <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@
 ; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
 ; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
 ; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
-define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fdiv <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -68,7 +68,7 @@
 
 ; ALL: 'fdiv_f16'
 ; ALL: estimated cost of 10 for {{.*}} fdiv half
-define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -77,7 +77,7 @@
 
 ; ALL: 'fdiv_v2f16'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x half>
-define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -86,7 +86,7 @@
 
 ; ALL: 'fdiv_v4f16'
 ; ALL: estimated cost of 40 for {{.*}} fdiv <4 x half>
-define void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fdiv <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 6303bb7..915c35a 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
-define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fmul float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; ALL: 'fmul_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fmul <2 x float>
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fmul <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; ALL: 'fmul_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
-define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fmul <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@
 ; ALL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fmul double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@
 ; ALL: 'fmul_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fmul <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; ALL: 'fmul_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
-define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fmul <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@
 
 ; ALL 'fmul_f16'
 ; ALL estimated cost of 1 for {{.*}} fmul half
-define void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fmul half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@
 
 ; ALL 'fmul_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fmul <2 x half>
-define void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fmul <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@
 
 ; ALL 'fmul_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fmul <4 x half>
-define void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fmul <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index e0850be..cb89d29 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'fsub_f32'
 ; ALL: estimated cost of 1 for {{.*}} fsub float
-define void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fsub float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; ALL: 'fsub_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
-define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fsub <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; ALL: 'fsub_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
-define void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fsub <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@
 ; ALL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
 ; SLOWF64: estimated cost of 3 for {{.*}} fsub double
-define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fsub double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@
 ; ALL: 'fsub_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
-define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fsub <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; ALL: 'fsub_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
-define void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fsub <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@
 
 ; ALL: 'fsub_f16'
 ; ALL: estimated cost of 1 for {{.*}} fsub half
-define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fsub half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@
 
 ; ALL: 'fsub_v2f16'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x half>
-define void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fsub <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@
 
 ; ALL: 'fsub_v4f16'
 ; ALL: estimated cost of 4 for {{.*}} fsub <4 x half>
-define void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fsub <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll
index 1765afe..6f296a3 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'insertelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
-define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %insert = insertelement <2 x i32> %vec, i32 1, i32 123
   store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; CHECK: 'insertelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
-define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %insert = insertelement <2 x i64> %vec, i64 1, i64 123
   store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; CHECK: 'insertelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %insert = insertelement <2 x i16> %vec, i16 1, i16 123
   store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; CHECK: 'insertelement_v2i8'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
-define void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
   %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
   %insert = insertelement <2 x i8> %vec, i8 1, i8 123
   store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index cbc755a..aac7b68 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'mul_i32'
 ; CHECK: estimated cost of 3 for {{.*}} mul i32
-define void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %mul = mul i32 %vec, %b
   store i32 %mul, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; CHECK: 'mul_v2i32'
 ; CHECK: estimated cost of 6 for {{.*}} mul <2 x i32>
-define void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %mul = mul <2 x i32> %vec, %b
   store <2 x i32> %mul, <2 x i32> addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; CHECK: 'mul_v3i32'
 ; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
-define void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
   store <3 x i32> %mul, <3 x i32> addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; CHECK: 'mul_v4i32'
 ; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
-define void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %mul = mul <4 x i32> %vec, %b
   store <4 x i32> %mul, <4 x i32> addrspace(1)* %out
@@ -38,7 +38,7 @@
 
 ; CHECK: 'mul_i64'
 ; CHECK: estimated cost of 16 for {{.*}} mul i64
-define void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %mul = mul i64 %vec, %b
   store i64 %mul, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@
 
 ; CHECK: 'mul_v2i64'
 ; CHECK: estimated cost of 32 for {{.*}} mul <2 x i64>
-define void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %mul = mul <2 x i64> %vec, %b
   store <2 x i64> %mul, <2 x i64> addrspace(1)* %out
@@ -56,7 +56,7 @@
 
 ; CHECK: 'mul_v3i64'
 ; CHECK: estimated cost of 48 for {{.*}} mul <3 x i64>
-define void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %mul = mul <3 x i64> %vec, %b
   store <3 x i64> %mul, <3 x i64> addrspace(1)* %out
@@ -65,7 +65,7 @@
 
 ; CHECK: 'mul_v4i64'
 ; CHECK: estimated cost of 64 for {{.*}} mul <4 x i64>
-define void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %mul = mul <4 x i64> %vec, %b
   store <4 x i64> %mul, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@
 
 ; CHECK: 'mul_v8i64'
 ; CHECK: estimated cost of 128 for {{.*}} mul <8 x i64>
-define void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b
   store <8 x i64> %mul, <8 x i64> addrspace(1)* %out
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
index 003aed7..85fb0eb 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
-define void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = shl i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -13,7 +13,7 @@
 ; ALL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
 ; SLOW64: estimated cost of 3 for {{.*}} shl i64
-define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = shl i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -22,7 +22,7 @@
 
 ; ALL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
-define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = lshr i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -32,7 +32,7 @@
 ; ALL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
 ; SLOW64: estimated cost of 3 for {{.*}} lshr i64
-define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = lshr i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -41,7 +41,7 @@
 
 ; ALL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
-define void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = ashr i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; ALL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
 ; SLOW64: estimated cost of 3 for {{.*}} ashr i64
-define void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = ashr i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
index 319a697..d226695 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
 
 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
   %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
   store i32 %swizzle, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
index b4fa79a..6144ffe 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
@@ -5,7 +5,7 @@
 ; CHECK: DIVERGENT:  %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
 
 ; The post dominator tree does not have a root node in this case
-define void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
+define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
 bb0:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp2 = sext i32 %tmp to i64
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
index ca93dda..7ade8ea 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
@@ -1,7 +1,7 @@
 ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
 
 ; CHECK: DIVERGENT:  %tmp = cmpxchg volatile
-define void @unreachable_loop(i32 %tidx) #0 {
+define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {
 entry:
   unreachable
 
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
index 669ee80..98fbc88 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -7,35 +7,35 @@
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define void @workitem_id_x() #1 {
+define amdgpu_kernel void @workitem_id_x() #1 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
-define void @workitem_id_y() #1 {
+define amdgpu_kernel void @workitem_id_y() #1 {
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %id.y, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
-define void @workitem_id_z() #1 {
+define amdgpu_kernel void @workitem_id_z() #1 {
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
   store volatile i32 %id.z, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
-define void @mbcnt_lo() #1 {
+define amdgpu_kernel void @mbcnt_lo() #1 {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
   store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
-define void @mbcnt_hi() #1 {
+define amdgpu_kernel void @mbcnt_hi() #1 {
   %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
   store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index edad18e..ca661cf 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}local_address_load:
 ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32, i32 addrspace(3)* %in
   store i32 %0, i32 addrspace(1)* %out
@@ -24,7 +24,7 @@
 ; SI: s_add_i32 [[SPTR:s[0-9]]]
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
   %1 = load i32, i32 addrspace(3)* %0
@@ -35,7 +35,7 @@
 ; FUNC-LABEL: {{^}}local_address_gep_const_offset:
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
-define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
   %1 = load i32, i32 addrspace(3)* %0
@@ -48,7 +48,7 @@
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
   %1 = load i32, i32 addrspace(3)* %0
@@ -60,7 +60,7 @@
 ; SI: v_cmp_ne_u32
 ; SI-NOT: v_cmp_ne_u32
 ; SI: v_cndmask_b32
-define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
   %cmp = icmp ne i32 addrspace(3)* %lds, null
   %x = select i1 %cmp, i32 123, i32 456
   store i32 %x, i32 addrspace(1)* %out
@@ -71,7 +71,7 @@
 ; SI: s_mul_i32
 ; SI-NEXT: s_add_i32
 ; SI: ds_read_b32
-define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
   %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
   %val = load float, float addrspace(3)* %ptr
   store float %val, float addrspace(1)* %out
@@ -83,7 +83,7 @@
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
-define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float, float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
   ret void
@@ -95,14 +95,14 @@
 
 ; FUNC-LABEL: {{^}}global_ptr:
 ; SI: ds_write_b32
-define void @global_ptr() nounwind {
+define amdgpu_kernel void @global_ptr() nounwind {
   store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_address_store:
 ; SI: ds_write_b32
-define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
   store i32 %val, i32 addrspace(3)* %out
   ret void
 }
@@ -111,7 +111,7 @@
 ; SI: s_add_i32 [[SADDR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
@@ -121,7 +121,7 @@
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
@@ -132,7 +132,7 @@
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index e10ffa0..56a9e70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index 78bf9d8..ea2ad2b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -5,7 +5,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+  define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
 ...
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index 070d3bc..ea43572 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
index 9447336..3496b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@@ -3,12 +3,12 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
-  define void @load_global_uniform(i32 addrspace(1)* %ptr1) {
+  define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
+  define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
     %tmp0 = load i32, i32 addrspace(1)* %ptr1
     ret void
   }
-  define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
+  define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
     %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
     %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
     %tmp2 = load i32, i32 addrspace(1)* %tmp1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index a3630b1a..8a6b3df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -9,7 +9,7 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
   %1 = load i32, i32 addrspace(2)* %0
@@ -21,7 +21,7 @@
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
   %1 = load i32, i32 addrspace(2)* %0
@@ -36,7 +36,7 @@
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
   %1 = load i32, i32 addrspace(2)* %0
@@ -51,7 +51,7 @@
 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; XGCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
   %1 = load i32, i32 addrspace(2)* %0
@@ -65,7 +65,7 @@
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
   %1 = load i32, i32 addrspace(2)* %0
@@ -79,7 +79,7 @@
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
   %1 = load i32, i32 addrspace(2)* %0
diff --git a/llvm/test/CodeGen/AMDGPU/add-debug.ll b/llvm/test/CodeGen/AMDGPU/add-debug.ll
index 529905d..b90c20b 100644
--- a/llvm/test/CodeGen/AMDGPU/add-debug.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-debug.ll
@@ -3,7 +3,7 @@
 ; REQUIRES: asserts
 
 ; Check that SelectionDAGDumper does not crash on int_SI_if.
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll
index 6c5cdd3..b65e79f 100644
--- a/llvm/test/CodeGen/AMDGPU/add.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll
@@ -6,7 +6,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -23,7 +23,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -38,7 +38,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -53,7 +53,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -69,7 +69,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -89,7 +89,7 @@
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -109,7 +109,7 @@
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]],  [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -130,7 +130,7 @@
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index a6247c7..7e4546d 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -8,7 +8,7 @@
 ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
 ;SI-NOT: [[REG]]
 ;SI: buffer_store_dword [[REG]],
-define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -24,7 +24,7 @@
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +44,7 @@
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -71,7 +71,7 @@
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
 entry:
   %0 = add <8 x i32> %a, %b
   store <8 x i32> %0, <8 x i32> addrspace(1)* %out
@@ -112,7 +112,7 @@
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
 entry:
   %0 = add <16 x i32> %a, %b
   store <16 x i32> %0, <16 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = add i64 %a, %b
   store i64 %0, i64 addrspace(1)* %out
@@ -150,7 +150,7 @@
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64, i64 addrspace(1)* %in
   %1 = add i64 %a, %0
@@ -169,7 +169,7 @@
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index db3c88a..dcf6d24 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -7,7 +7,7 @@
 
 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -27,7 +27,7 @@
 
 ; VI: s_add_i32
 ; VI: s_add_i32
-define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
   %add = add <2 x i16> %a, %b
@@ -41,7 +41,7 @@
 
 ; VI: s_add_i32
 ; VI: s_add_i32
-define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %add = add <2 x i16> %a, %a
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -54,7 +54,7 @@
 
 ; VI: v_add_i32
 ; VI: v_add_i32
-define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
   %add = add <2 x i16> %a, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
-define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -83,7 +83,7 @@
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
-define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -102,7 +102,7 @@
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -121,7 +121,7 @@
 ; VI-NOT: v_add_u16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -141,7 +141,7 @@
 ; VI-NOT: v_add_u16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -173,7 +173,7 @@
 ; VI-NOT: and
 ; VI-NOT: shl
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
-define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -208,7 +208,7 @@
 ; VI: v_add_u16_e32
 
 ; VI: buffer_store_dwordx4
-define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -236,7 +236,7 @@
 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; VI: buffer_store_dwordx2
-define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -264,7 +264,7 @@
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/add_i128.ll b/llvm/test/CodeGen/AMDGPU/add_i128.ll
index c80157c..00a125c 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i128.ll
@@ -6,7 +6,7 @@
 ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]],
-define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid
@@ -23,7 +23,7 @@
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
-define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
   %foo = load i128, i128 addrspace(1)* %in, align 8
   %result = add i128 %foo, %a
   store i128 %result, i128 addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
-define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
   %foo = load i128, i128 addrspace(1)* %in, align 8
   %result = add i128 %a, %foo
   store i128 %result, i128 addrspace(1)* %out
@@ -47,7 +47,7 @@
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
-define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
+define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
   %result = add i128 %a, %b
   store i128 %result, i128 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index 3d360b7..62733d5 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -6,7 +6,7 @@
 ; SI-LABEL: {{^}}test_i64_vreg:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@@ -21,7 +21,7 @@
 ; SI-LABEL: {{^}}sgpr_operand:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
+define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
   %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %foo, %a
   store i64 %result, i64 addrspace(1)* %out
@@ -34,7 +34,7 @@
 ; SI-LABEL: {{^}}sgpr_operand_reversed:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
   %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %a, %foo
   store i64 %result, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@
 ; SI: s_addc_u32
 ; SI: s_add_u32
 ; SI: s_addc_u32
-define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
   %result = add <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -58,7 +58,7 @@
 ; SI: v_addc_u32
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@@ -76,7 +76,7 @@
 ; SI-NOT: addc
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll
index 481a3e2..138bc36 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll
@@ -9,7 +9,7 @@
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out
-define void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
@@ -22,7 +22,7 @@
 ; CHECK: %data = alloca i32, align 4
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
-define void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
+define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
@@ -35,7 +35,7 @@
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
 ; CHECK: call void @consume_ptr2int(i32 %ptr2int)
-define void @addrspacecast_captured_call() #0 {
+define amdgpu_kernel void @addrspacecast_captured_call() #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 67a1939..8cabc7d 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -9,57 +9,57 @@
 @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
 
 ; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
-define void @store_cast_0_flat_to_group_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
   store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
   ret void
 }
 
 ; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
-define void @store_cast_0_group_to_flat_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
   ret void
 }
 
-; HSA: define void @store_constant_cast_group_gv_to_flat() #2
-define void @store_constant_cast_group_gv_to_flat() #1 {
+; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2
+define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
   ret void
 }
 
 ; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
-define void @store_constant_cast_group_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
   store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   ret void
 }
 
 ; HSA: @store_constant_cast_global_gv_to_flat() #1
-define void @store_constant_cast_global_gv_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
   ret void
 }
 
 ; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
-define void @store_constant_cast_global_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
   store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   ret void
 }
 
 ; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
 ; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
 ; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
   %val0 = extractvalue { i32, i1 } %val, 0
   store i32 %val0, i32 addrspace(1)* %out
@@ -67,28 +67,28 @@
 }
 
 ; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
   ret void
 }
 
 ; Can't just search the pointer value
 ; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
-define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
   store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
   ret void
 }
 
 ; Can't just search pointer types
 ; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
-define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
   store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
   ret void
 }
 
 ; Cast group to flat, do GEP, cast back to group
 ; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
-define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
   store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index b6ada5e..52b7029 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -28,7 +28,7 @@
 
 ; CI: NumSgprs: {{[0-9][0-9]+}}
 ; GFX9: NumSgprs: {{[0-9]+}}
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -58,7 +58,7 @@
 
 ; CI: NumSgprs: {{[0-9][0-9]+}}
 ; GFX9: NumSgprs: {{[0-9]+}}
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
   %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -73,7 +73,7 @@
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -85,7 +85,7 @@
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %stof
   ret void
@@ -102,7 +102,7 @@
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
@@ -119,7 +119,7 @@
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
   store volatile i32 0, i32* %ftos
   ret void
@@ -133,7 +133,7 @@
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
@@ -144,7 +144,7 @@
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
   load volatile i32, i32 addrspace(2)* %ftos
   ret void
@@ -158,7 +158,7 @@
 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %cast
   ret void
@@ -168,7 +168,7 @@
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_0_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
   store volatile i32 7, i32 addrspace(3)* %cast
   ret void
@@ -179,7 +179,7 @@
 ; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %cast
   ret void
@@ -189,7 +189,7 @@
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
   store volatile i32 7, i32 addrspace(3)* %cast
   ret void
@@ -204,7 +204,7 @@
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_private_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32* null to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %cast
   ret void
@@ -214,7 +214,7 @@
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-define void @cast_0_flat_to_private_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
   store volatile i32 7, i32* %cast
   ret void
@@ -226,7 +226,7 @@
 ; HSA-LABEL: {{^}}branch_use_flat_i32:
 ; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
 ; HSA: s_endpgm
-define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
 entry:
   %cmp = icmp ne i32 %c, 0
   br i1 %cmp, label %local, label %global
@@ -259,7 +259,7 @@
 ; HSA: flat_store_dword
 ; HSA: s_barrier
 ; HSA: flat_load_dword
-define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
   %alloca = alloca i32, i32 9, align 4
   %x = call i32 @llvm.amdgcn.workitem.id.x() #2
   %pptr = getelementptr i32, i32* %alloca, i32 %x
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index c0c5305..ef742f5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -16,7 +16,7 @@
 
 ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
 ; SI: s_endpgm
-define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
   %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@@ -24,7 +24,7 @@
   ret void
 }
 
-define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %load = load float, float addrspace(1)* %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x i16>
@@ -33,7 +33,7 @@
   ret void
 }
 
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
   %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
   %bc = bitcast <2 x i16> %add.v2i16 to float
@@ -42,7 +42,7 @@
   ret void
 }
 
-define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %load = load float, float addrspace(1)* %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x half>
@@ -51,7 +51,7 @@
   ret void
 }
 
-define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
   %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
   %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
   %bc = bitcast <2 x half> %add.v2f16 to float
@@ -60,14 +60,14 @@
   ret void
 }
 
-define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %bc = bitcast <4 x i8> %load to i32
   store i32 %bc, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -76,7 +76,7 @@
 
 ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
 ; SI: s_endpgm
-define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %add = add <2 x i32> %val, <i32 4, i32 9>
   %bc = bitcast <2 x i32> %add to double
@@ -87,7 +87,7 @@
 
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
 ; SI: s_endpgm
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double, double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
@@ -96,7 +96,7 @@
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -112,7 +112,7 @@
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
index a6d0558..79450b9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -15,7 +15,7 @@
 ; GCN-NOT: v0
 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index d78c751..0e56059 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -4,7 +4,7 @@
 
 ; NOOP-LABEL: @noop_fdiv_fpmath(
 ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
-define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
   %md.25ulp = fdiv float %a, %b, !fpmath !0
   store volatile float %md.25ulp, float addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@
 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
 
@@ -51,7 +51,7 @@
 ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
 ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
-define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
   %no.md = fdiv float 1.0, %x
   store volatile float %no.md, float addrspace(1)* %out
 
@@ -89,7 +89,7 @@
 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
-define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
   %no.md = fdiv <2 x float> %a, %b
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -120,7 +120,7 @@
 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
-define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -158,7 +158,7 @@
 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -186,7 +186,7 @@
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
   %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
 
   %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
@@ -206,7 +206,7 @@
 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index f812f4c..97cb906 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -80,7 +80,7 @@
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -102,7 +102,7 @@
 
 ; OPT-LABEL: @high_alignment(
 ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
-define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [8 x i32], align 16
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -127,7 +127,7 @@
 ; OPT: alloca [5 x i32]
 
 ; SI-NOT: ds_write
-define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -162,7 +162,7 @@
 ; SI-NOT: v_movrel
 %struct.point = type { i32, i32 }
 
-define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -191,7 +191,7 @@
 ; R600-NOT: MOVA_INT
 ; SI-NOT: v_movrel
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -235,7 +235,7 @@
 ; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]]
 ; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
 ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
-define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -258,7 +258,7 @@
 
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -281,7 +281,7 @@
 ;
 ; A total of 5 bytes should be allocated and used.
 ; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -305,7 +305,7 @@
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -319,7 +319,7 @@
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -332,7 +332,7 @@
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -347,7 +347,7 @@
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -360,7 +360,7 @@
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -373,7 +373,7 @@
   ret void
 }
 
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -394,7 +394,7 @@
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
@@ -410,7 +410,7 @@
 ; OPT-LABEL: @pointer_typed_alloca(
 ; OPT:  getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
-define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
@@ -462,7 +462,7 @@
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [2 x <16 x i32>]
   %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
   %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
@@ -506,7 +506,7 @@
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [2 x <16 x float>]
   %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
   %tmp5 = load <16 x float>, <16 x float>* %tmp0
@@ -522,7 +522,7 @@
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [16 x <2 x float>]
   %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
   %tmp5 = load <2 x float>, <2 x float>* %tmp0
@@ -533,7 +533,7 @@
 ; OPT-LABEL: @direct_alloca_read_0xi32(
 ; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
 ; OPT: load [0 x i32], [0 x i32] addrspace(3)*
-define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [0 x i32]
   store [0 x i32] [], [0 x i32]* %tmp
@@ -545,7 +545,7 @@
 ; OPT-LABEL: @direct_alloca_read_1xi32(
 ; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
 ; OPT: load [1 x i32], [1 x i32] addrspace(3)*
-define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [1 x i32]
   store [1 x i32] [i32 0], [1 x i32]* %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index e515ca0..1873208 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -12,7 +12,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
-define void @ngroups_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -27,7 +27,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
-define void @ngroups_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -42,7 +42,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
-define void @ngroups_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-define void @global_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -72,7 +72,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-define void @global_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -87,7 +87,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-define void @global_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -102,7 +102,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
-define void @local_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -117,7 +117,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
-define void @local_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -132,7 +132,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
-define void @local_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -153,7 +153,7 @@
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -165,7 +165,7 @@
 ; GCN-NOHSA: buffer_store_dword [[VVAL]]
 
 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
-define void @tgid_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -181,7 +181,7 @@
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -194,7 +194,7 @@
 
 ; FUNC-LABEL: {{^}}tidig_x_legacy:
 ; GCN-NOHSA: buffer_store_dword v0
-define void @tidig_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -208,7 +208,7 @@
 ; FUNC-LABEL: {{^}}tidig_y_legacy:
 
 ; GCN-NOHSA: buffer_store_dword v1
-define void @tidig_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -221,7 +221,7 @@
 
 ; FUNC-LABEL: {{^}}tidig_z_legacy:
 ; GCN-NOHSA: buffer_store_dword v2
-define void @tidig_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
   store i32 %0, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/and-gcn.ll b/llvm/test/CodeGen/AMDGPU/and-gcn.ll
index dde5f8c..2aec03a 100644
--- a/llvm/test/CodeGen/AMDGPU/and-gcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/and-gcn.ll
@@ -4,7 +4,7 @@
 ; FUNC-LABEL: {{^}}v_and_i64_br:
 ; SI: v_and_b32
 ; SI: v_and_b32
-define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
 entry:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index 5d9dcf6..c356f8b 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -11,7 +11,7 @@
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -31,7 +31,7 @@
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -42,7 +42,7 @@
 
 ; FUNC-LABEL: {{^}}s_and_i32:
 ; SI: s_and_b32
-define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, %b
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
@@ -50,7 +50,7 @@
 
 ; FUNC-LABEL: {{^}}s_and_constant_i32:
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
-define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
@@ -66,7 +66,7 @@
 ; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
 ; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
 ; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, 1234567
 
   ; Just to stop future replacement of copy to vgpr + store with VALU op.
@@ -83,7 +83,7 @@
 ; SI: s_add_i32
 ; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
 ; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, 1234567
   %foo = add i32 %and, 1234567
   %bar = add i32 %foo, %b
@@ -93,7 +93,7 @@
 
 ; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr:
 ; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -109,7 +109,7 @@
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -123,7 +123,7 @@
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
+define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -135,7 +135,7 @@
 
 ; FUNC-LABEL: {{^}}v_and_constant_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
-define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -144,7 +144,7 @@
 
 ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
-define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 64
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@
 
 ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
-define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, -16
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -162,7 +162,7 @@
 
 ; FUNC-LABEL: {{^}}s_and_i64
 ; SI: s_and_b64
-define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and = and i64 %a, %b
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -171,7 +171,7 @@
 ; FIXME: Should use SGPRs
 ; FUNC-LABEL: {{^}}s_and_i1:
 ; SI: v_and_b32
-define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
+define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
   %and = and i1 %a, %b
   store i1 %and, i1 addrspace(1)* %out
   ret void
@@ -181,7 +181,7 @@
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
 ; SI: buffer_store_dwordx2
-define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 549756338176
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -191,7 +191,7 @@
 ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
 ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
-define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and0 = and i64 %a, 549756338176
   %and1 = and i64 %b, 549756338176
   store volatile i64 %and0, i64 addrspace(1)* %out
@@ -205,7 +205,7 @@
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -223,7 +223,7 @@
 ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
   %shl.a = shl i64 %a, 1
   %shl.b = shl i64 %b, 1
   %and0 = and i64 %shl.a, 62
@@ -238,7 +238,7 @@
 ; FUNC-LABEL: {{^}}v_and_i64:
 ; SI: v_and_b32
 ; SI: v_and_b32
-define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
   %and = and i64 %a, %b
@@ -250,7 +250,7 @@
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
 ; SI: buffer_store_dwordx2
-define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1231231234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -268,7 +268,7 @@
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
 ; SI: buffer_store_dwordx2
 ; SI: buffer_store_dwordx2
-define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load volatile i64, i64 addrspace(1)* %aptr
   %b = load volatile i64, i64 addrspace(1)* %aptr
   %and0 = and i64 %a, 1231231234567
@@ -288,7 +288,7 @@
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
-define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load volatile i64, i64 addrspace(1)* %aptr
   %b = load volatile i64, i64 addrspace(1)* %aptr
   %and0 = and i64 %a, 63
@@ -304,7 +304,7 @@
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -317,7 +317,7 @@
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -331,7 +331,7 @@
 ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
-define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, -8
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -344,7 +344,7 @@
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
 ; SI-NOT: and
 ; SI: buffer_store_dword
-define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -358,7 +358,7 @@
 ; SI-NOT: and
 ; SI: s_add_u32
 ; SI-NEXT: s_addc_u32
-define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
   %shl = shl i64 %a, 1
   %and = and i64 %shl, 64
   %add = add i64 %and, %b
@@ -372,7 +372,7 @@
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 1
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -387,7 +387,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4607182418800017408
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -402,7 +402,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13830554455654793216
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -417,7 +417,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4602678819172646912
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -432,7 +432,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13826050856027422720
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -445,7 +445,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4611686018427387904
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -458,7 +458,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13835058055282163712
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -473,7 +473,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4616189618054758400
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -488,7 +488,7 @@
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13839561654909534208
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -505,7 +505,7 @@
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 1082130432
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -518,7 +518,7 @@
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, -1065353216
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -531,7 +531,7 @@
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4647714815446351872
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -544,7 +544,7 @@
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13871086852301127680
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 084a693..e2620ce 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -11,22 +11,22 @@
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
 
-; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -34,8 +34,8 @@
   ret void
 }
 
-; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -43,15 +43,15 @@
   ret void
 }
 
-; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -59,8 +59,8 @@
   ret void
 }
 
-; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -68,8 +68,8 @@
   ret void
 }
 
-; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -79,29 +79,29 @@
   ret void
 }
 
-; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.x()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -109,8 +109,8 @@
   ret void
 }
 
-; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.y()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -118,8 +118,8 @@
   ret void
 }
 
-; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workitem.id.y()
   %val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -129,8 +129,8 @@
   ret void
 }
 
-; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workitem.id.y()
   %val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -146,8 +146,8 @@
   ret void
 }
 
-; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
-define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
+define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
   %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %bc
@@ -155,8 +155,8 @@
   ret void
 }
 
-; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
-define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
+define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
   %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %bc
@@ -164,58 +164,58 @@
   ret void
 }
 
-; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
   %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
 }
 
-; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
   store volatile i32 0, i32* %ftos
   ret void
 }
 
 ; No-op addrspacecast should not use queue ptr
-; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
 }
 
-; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
   %ld = load volatile i32, i32 addrspace(2)* %ftos
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index a4e7bb6..09750da 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -12,22 +12,22 @@
 declare i32 @llvm.r600.read.local.size.y() #0
 declare i32 @llvm.r600.read.local.size.z() #0
 
-; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
   %val1 = call i32 @llvm.r600.read.tgid.y()
@@ -35,8 +35,8 @@
   ret void
 }
 
-; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -44,15 +44,15 @@
   ret void
 }
 
-; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -60,8 +60,8 @@
   ret void
 }
 
-; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.y()
   %val1 = call i32 @llvm.r600.read.tgid.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -69,8 +69,8 @@
   ret void
 }
 
-; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   %val2 = call i32 @llvm.r600.read.tgid.z()
@@ -80,29 +80,29 @@
   ret void
 }
 
-; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tgid.x()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -110,8 +110,8 @@
   ret void
 }
 
-; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.y()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -119,8 +119,8 @@
   ret void
 }
 
-; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tidig.y()
   %val2 = call i32 @llvm.r600.read.tidig.z()
@@ -130,8 +130,8 @@
   ret void
 }
 
-; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tidig.y()
   %val2 = call i32 @llvm.r600.read.tidig.z()
@@ -147,25 +147,25 @@
   ret void
 }
 
-; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll b/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll
index f37b0f3..04fbe2a 100644
--- a/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll
+++ b/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll
@@ -6,13 +6,13 @@
 ; CHECK-LABEL: {{^}}test:
 ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1
 ; CHECK: s_endpgm
-define void @test() {
+define amdgpu_kernel void @test() {
   store i32 1, i32 addrspace(1)* @0
   ret void
 }
 
 ; CHECK-LABEL: {{^}}__unnamed_2:
 ; CHECK: s_endpgm
-define void @1() {
+define amdgpu_kernel void @1() {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 87b4c8642..3f220c4 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: {{^}}anyext_i1_i32:
 ; GCN: v_cndmask_b32_e64
-define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tmp = icmp eq i32 %cond, 0
   %tmp1 = zext i1 %tmp to i8
@@ -22,7 +22,7 @@
 ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
 ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
 ; VI: buffer_store_dword [[AND]]
-define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
+define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
 entry:
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index 7dfe342..daa3442 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -24,7 +24,7 @@
 
 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
+define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
   %alloca = alloca [16 x i32], align 16
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index b914edf..ddeffc1 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -7,7 +7,7 @@
 ; SI-DAG: v_mul_lo_i32
 ; SI-DAG: v_mul_hi_i32
 ; SI: s_endpgm
-define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
   %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index 0832156..3a7facf 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -13,7 +13,7 @@
 ; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CIVI: v_or_b32_e32
-define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
   %result = ashr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -57,7 +57,7 @@
 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -72,7 +72,7 @@
 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -86,7 +86,7 @@
 ; GCN-LABEL: {{^}}ashr_imm_v_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4
-define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -100,7 +100,7 @@
 ; GCN-LABEL: {{^}}ashr_v_imm_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]]
-define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -117,7 +117,7 @@
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -135,7 +135,7 @@
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 25eae0b..4f9526d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -12,7 +12,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -33,7 +33,7 @@
 ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
@@ -45,7 +45,7 @@
 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -65,7 +65,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -84,7 +84,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
 ; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
index 4b014e09..e0fe664 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_local:
 ; R600: LDS_ADD *
 ; SI: ds_add_u32
-define void @atomic_add_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
 ; R600: LDS_ADD *
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
@@ -22,7 +22,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_ret_local:
 ; R600: LDS_ADD_RET *
 ; SI: ds_add_rtn_u32
-define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
 ; R600: LDS_ADD_RET *
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
index c6e5b11..a027589 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_local:
 ; R600: LDS_SUB *
 ; SI: ds_sub_u32
-define void @atomic_sub_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
 ; R600: LDS_SUB *
 ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
@@ -22,7 +22,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_ret_local:
 ; R600: LDS_SUB_RET *
 ; SI: ds_sub_rtn_u32
-define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
 ; R600: LDS_SUB_RET *
 ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index cab377f..63a6f6a 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -5,7 +5,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_64() #0 {
+define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
   ret void
 }
@@ -16,7 +16,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_128() #1 {
+define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
   ret void
 }
@@ -27,7 +27,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_128_max_128() #2 {
+define amdgpu_kernel void @min_128_max_128() #2 {
 entry:
   ret void
 }
@@ -39,7 +39,7 @@
 ; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 32
 @var = addrspace(1) global float 0.0
-define void @min_1024_max_2048() #3 {
+define amdgpu_kernel void @min_1024_max_2048() #3 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 58fe6e2..ac2f7b4 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -8,7 +8,7 @@
 
 ; ALL: SGPRBlocks: 1
 ; ALL: NumSGPRsForWavesPerEU: 9
-define void @max_9_sgprs(i32 addrspace(1)* %out1,
+define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
 
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
@@ -49,7 +49,7 @@
 
 ; TOSMEM: SGPRBlocks: 1
 ; TOSMEM: NumSGPRsForWavesPerEU: 16
-define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                         i32 addrspace(1)* %out2,
                                         i32 addrspace(1)* %out3,
                                         i32 addrspace(1)* %out4,
@@ -90,7 +90,7 @@
 
 ; XALL: SGPRBlocks: 2
 ; XALL: NumSGPRsForWavesPerEU: 18
-;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
 ;                                        i32 addrspace(1)* %out2,
 ;                                        i32 addrspace(1)* %out3,
 ;                                        i32 addrspace(1)* %out4,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
index 97feb72..979665f 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: {{^}}max_20_vgprs:
 ; CHECK: VGPRBlocks: 4
 ; CHECK: NumVGPRsForWavesPerEU: 20
-define void @max_20_vgprs() #1 {
+define amdgpu_kernel void @max_20_vgprs() #1 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index c1c3342..3dda73b 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -6,7 +6,7 @@
 ; CHECK: VGPRBlocks: 32
 ; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 129
-define void @empty_exactly_1() #0 {
+define amdgpu_kernel void @empty_exactly_1() #0 {
 entry:
   ret void
 }
@@ -18,7 +18,7 @@
 ; CHECK: VGPRBlocks: 10
 ; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_exactly_5() #1 {
+define amdgpu_kernel void @empty_exactly_5() #1 {
 entry:
   ret void
 }
@@ -30,7 +30,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_exactly_10() #2 {
+define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
   ret void
 }
@@ -42,7 +42,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_1() #3 {
+define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
   ret void
 }
@@ -54,7 +54,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_5() #4 {
+define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
   ret void
 }
@@ -66,7 +66,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_10() #5 {
+define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
   ret void
 }
@@ -80,7 +80,7 @@
 ; CHECK: VGPRBlocks: 10
 ; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_at_most_5() #6 {
+define amdgpu_kernel void @empty_at_most_5() #6 {
 entry:
   ret void
 }
@@ -92,7 +92,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_most_10() #7 {
+define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
   ret void
 }
@@ -106,7 +106,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_between_5_and_10() #8 {
+define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
   ret void
 }
@@ -120,7 +120,7 @@
 ; CHECK: VGPRBlocks: 5
 ; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 24
-define void @exactly_10() #9 {
+define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var
diff --git a/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll b/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll
index 0282bc3..17adb89 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll
@@ -1,56 +1,56 @@
 ; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_0() #0 {
+define amdgpu_kernel void @unparseable_single_0() #0 {
 entry:
   ret void
 }
 attributes #0 = {"amdgpu-num-sgpr"}
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_1() #1 {
+define amdgpu_kernel void @unparseable_single_1() #1 {
 entry:
   ret void
 }
 attributes #1 = {"amdgpu-num-sgpr"="k"}
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_2() #2 {
+define amdgpu_kernel void @unparseable_single_2() #2 {
 entry:
   ret void
 }
 attributes #2 = {"amdgpu-num-sgpr"="1,2"}
 
 ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_0() #3 {
+define amdgpu_kernel void @unparseable_pair_0() #3 {
 entry:
   ret void
 }
 attributes #3 = {"amdgpu-flat-work-group-size"}
 
 ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_1() #4 {
+define amdgpu_kernel void @unparseable_pair_1() #4 {
 entry:
   ret void
 }
 attributes #4 = {"amdgpu-flat-work-group-size"="k"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_2() #5 {
+define amdgpu_kernel void @unparseable_pair_2() #5 {
 entry:
   ret void
 }
 attributes #5 = {"amdgpu-flat-work-group-size"="1"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_3() #6 {
+define amdgpu_kernel void @unparseable_pair_3() #6 {
 entry:
   ret void
 }
 attributes #6 = {"amdgpu-flat-work-group-size"="1,k"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_4() #7 {
+define amdgpu_kernel void @unparseable_pair_4() #7 {
 entry:
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index d4538ee..e245e42 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -15,7 +15,7 @@
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
+define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
   %cmp = icmp ne i32 %val, 0
   br i1 %cmp, label %store, label %end
 
@@ -39,7 +39,7 @@
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
   %cmp0 = icmp ne i1 %val, 0
   br i1 %cmp0, label %store, label %end
 
diff --git a/llvm/test/CodeGen/AMDGPU/basic-loop.ll b/llvm/test/CodeGen/AMDGPU/basic-loop.ll
index f0263ca..de45190 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-loop.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_loop:
-define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
 entry:
   br label %loop.body
 
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index e48744f..5e39a6c 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -5,7 +5,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
-define void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -32,7 +32,7 @@
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
-define void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -52,7 +52,7 @@
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
-define void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width
@@ -68,7 +68,7 @@
 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
 ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
 ; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
-define void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width
@@ -83,7 +83,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
-define void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -110,7 +110,7 @@
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
-define void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -130,7 +130,7 @@
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
-define void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width
@@ -146,7 +146,7 @@
 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
 ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
 ; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
-define void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width
diff --git a/llvm/test/CodeGen/AMDGPU/bfe_uint.ll b/llvm/test/CodeGen/AMDGPU/bfe_uint.ll
index 32e3fc2..2c8c9a5 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe_uint.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: {{^}}bfe_def:
 ; CHECK: BFE_UINT
-define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
 entry:
   %0 = lshr i32 %x, 5
   %1 = and i32 %0, 15 ; 0xf
@@ -17,7 +17,7 @@
 
 ; CHECK: {{^}}bfe_shift:
 ; CHECK-NOT: BFE_UINT
-define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
 entry:
   %0 = lshr i32 %x, 16
   %1 = and i32 %0, 65535 ; 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 5156137..7870e5f 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -9,7 +9,7 @@
 ; R600: BFI_INT
 ; SI:   @bfi_def
 ; SI:   v_bfi_b32
-define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %x, -1
   %1 = and i32 %z, %0
@@ -25,7 +25,7 @@
 ; R600: BFI_INT
 ; SI:   @bfi_sha256_ch
 ; SI:   v_bfi_b32
-define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %y, %z
   %1 = and i32 %x, %0
@@ -42,7 +42,7 @@
 ; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
 ; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
 
-define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = and i32 %x, %z
   %1 = or i32 %x, %z
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index 790458d..5673995 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}bfm_pattern:
 ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   %c = shl i32 %b, %y
@@ -14,7 +14,7 @@
 
 ; FUNC-LABEL: {{^}}bfm_pattern_simple:
 ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
-define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   store i32 %b, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 04384bf..cf95f74 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -11,7 +11,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
   %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -27,7 +27,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -43,7 +43,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
   store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
 
@@ -59,7 +59,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
   %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -70,7 +70,7 @@
 
 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
 ; GCN-NOT: store_dword
-define void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
   %bc = bitcast i64 %undef to <2 x i32>
   store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out
@@ -79,7 +79,7 @@
 
 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
 ; GCN-NOT: store_dword
-define void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
   %bc = bitcast i64 %undef to <2 x i32>
   %elt1 = extractelement <2 x i32> %bc, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
index f7dc1a9..3616ec1 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}materialize_0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) {
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
@@ -16,7 +16,7 @@
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) {
   store i64 0, i64 addrspace(1)* %out
   ret void
 }
@@ -24,7 +24,7 @@
 ; GCN-LABEL: {{^}}materialize_neg1_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) {
   store i32 -1, i32 addrspace(1)* %out
   ret void
 }
@@ -33,7 +33,7 @@
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) {
   store i64 -1, i64 addrspace(1)* %out
   ret void
 }
@@ -41,7 +41,7 @@
 ; GCN-LABEL: {{^}}materialize_signbit_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) {
   store i32 -2147483648, i32 addrspace(1)* %out
   ret void
 }
@@ -50,7 +50,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) {
   store i64  -9223372036854775808, i64 addrspace(1)* %out
   ret void
 }
@@ -58,7 +58,7 @@
 ; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
   store i32 268435455, i32 addrspace(1)* %out
   ret void
 }
@@ -67,7 +67,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
   store i64  1152921504606846975, i64 addrspace(1)* %out
   ret void
 }
@@ -75,7 +75,7 @@
 ; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
   store i32 -134217729, i32 addrspace(1)* %out
   ret void
 }
@@ -84,7 +84,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
   store i64 -576460752303423489, i64 addrspace(1)* %out
   ret void
 }
@@ -92,7 +92,7 @@
 ; GCN-LABEL: {{^}}materialize_rev_64_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
   store i32 33554432, i32 addrspace(1)* %out
   ret void
 }
@@ -101,7 +101,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
   store i64 144115188075855872, i64 addrspace(1)* %out
   ret void
 }
@@ -109,7 +109,7 @@
 ; GCN-LABEL: {{^}}materialize_rev_65_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
   store i32 -2113929216, i32 addrspace(1)* %out
   ret void
 }
@@ -118,7 +118,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
   store i64 -9079256848778919936, i64 addrspace(1)* %out
   ret void
 }
@@ -126,7 +126,7 @@
 ; GCN-LABEL: {{^}}materialize_rev_3_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
   store i32 -1073741824, i32 addrspace(1)* %out
   ret void
 }
@@ -135,7 +135,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
   store i64 -4611686018427387904, i64 addrspace(1)* %out
   ret void
 }
@@ -143,7 +143,7 @@
 ; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
   store i32 508, i32 addrspace(1)* %out
   ret void
 }
@@ -152,70 +152,70 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
   store i64 508, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_0_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-define void @s_materialize_0_i32() {
+define amdgpu_kernel void @s_materialize_0_i32() {
   call void asm sideeffect "; use $0", "s"(i32 0)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_1_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_1_i32() {
+define amdgpu_kernel void @s_materialize_1_i32() {
   call void asm sideeffect "; use $0", "s"(i32 1)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_neg1_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}}
-define void @s_materialize_neg1_i32() {
+define amdgpu_kernel void @s_materialize_neg1_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -1)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_signbit_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_signbit_i32() {
+define amdgpu_kernel void @s_materialize_signbit_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -2147483648)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_64_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}}
-define void @s_materialize_rev_64_i32() {
+define amdgpu_kernel void @s_materialize_rev_64_i32() {
   call void asm sideeffect "; use $0", "s"(i32 33554432)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_65_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}}
-define void @s_materialize_rev_65_i32() {
+define amdgpu_kernel void @s_materialize_rev_65_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -2113929216)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}}
-define void @s_materialize_rev_neg16_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg16_i32() {
   call void asm sideeffect "; use $0", "s"(i32 268435455)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}}
-define void @s_materialize_rev_neg17_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg17_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -134217729)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32:
 ; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}}
-define void @s_materialize_rev_1.0_i32() {
+define amdgpu_kernel void @s_materialize_rev_1.0_i32() {
   call void asm sideeffect "; use $0", "s"(i32 508)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 43a4200..539373f 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -14,7 +14,7 @@
 
 ; FUNC-LABEL: {{^}}s_brev_i16:
 ; SI: s_brev_b32 
-define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
+define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, i16 addrspace(1)* %out
   ret void
@@ -22,7 +22,7 @@
 
 ; FUNC-LABEL: {{^}}v_brev_i16:
 ; SI: v_bfrev_b32_e32
-define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
   %val = load i16, i16 addrspace(1)* %valptr
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, i16 addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
 ; SI: s_endpgm
-define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
   %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
   store i32 %brev, i32 addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@
 ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
   %val = load i32, i32 addrspace(1)* %valptr
   %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
   store i32 %brev, i32 addrspace(1)* %out
@@ -56,7 +56,7 @@
 ; FUNC-LABEL: {{^}}s_brev_v2i32:
 ; SI: s_brev_b32
 ; SI: s_brev_b32
-define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
   %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
   store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@
 ; FUNC-LABEL: {{^}}v_brev_v2i32:
 ; SI: v_bfrev_b32_e32
 ; SI: v_bfrev_b32_e32
-define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
   %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
   store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
@@ -73,7 +73,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_brev_i64:
-define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
+define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, i64 addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@
 
 ; FUNC-LABEL: {{^}}v_brev_i64:
 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
   %val = load i64, i64 addrspace(1)* %valptr
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, i64 addrspace(1)* %out
@@ -89,14 +89,14 @@
 }
 
 ; FUNC-LABEL: {{^}}s_brev_v2i64:
-define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
   store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_brev_v2i64:
-define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
   store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 3fe6f87..b7a0c87 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -20,7 +20,7 @@
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[B_F16]]
 ; GCN: s_endpgm
-define void @br_cc_f16(
+define amdgpu_kernel void @br_cc_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -59,7 +59,7 @@
 ; GCN: two{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
 
-define void @br_cc_f16_imm_a(
+define amdgpu_kernel void @br_cc_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -92,7 +92,7 @@
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
 ; GCN: buffer_store_short v[[B_F16]]
 ; GCN: s_endpgm
-define void @br_cc_f16_imm_b(
+define amdgpu_kernel void @br_cc_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 86b8dd8..ede1555 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -5,7 +5,7 @@
 
 ; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!
 
-define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 entry:
   %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0
   %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 18bbc3e..263059d 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -26,7 +26,7 @@
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 bb:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
@@ -68,7 +68,7 @@
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 bb0:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@@ -108,7 +108,7 @@
 ; GCN: [[ENDBB]]:
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
 bb0:
   %cmp = fcmp oeq float %cnd, 0.0
   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
@@ -141,7 +141,7 @@
 ; GCN: s_or_b64 exec, exec, [[SAVE]]
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = zext i32 %tid to i64
@@ -188,7 +188,7 @@
 
 ; GCN-NEXT: [[ENDBB]]:
 ; GCN-NEXT: s_endpgm
-define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
 bb:
   br label %bb2
 
@@ -243,7 +243,7 @@
 ; GCN: buffer_store_dword [[BB4_K]]
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 bb0:
   %tmp = icmp ne i32 %arg1, 0
   br i1 %tmp, label %bb2, label %bb3
@@ -285,7 +285,7 @@
 ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
 ; GCN-NEXT: s_setpc_b64 vcc
 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 entry:
   br label %loop
 
@@ -342,7 +342,7 @@
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_endpgm
-define void @expand_requires_expand(i32 %cond0) #0 {
+define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 bb0:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp slt i32 %cond0, 0
@@ -399,7 +399,7 @@
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_sleep 5
 ; GCN-NEXT: s_endpgm
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
@@ -462,7 +462,7 @@
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN: buffer_store_dword
 ; GCN-NEXT: s_endpgm
-define void @analyze_mask_branch() #0 {
+define amdgpu_kernel void @analyze_mask_branch() #0 {
 entry:
   %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
   %cmp0 = fcmp ogt float %reg, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index c689517..d2dacd7 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -17,7 +17,7 @@
 ; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
   store i32 %bswap, i32 addrspace(1)* %out, align 4
@@ -32,7 +32,7 @@
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
@@ -53,7 +53,7 @@
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
@@ -95,21 +95,21 @@
 
 ; FUNC-LABEL: {{^}}test_bswap_i64:
 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
   store i64 %bswap, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
   store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
   ret void
 }
 
-define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
   %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
   store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 0a5774c..d77b0ab 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -10,7 +10,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
 ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
 ; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
-define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) {
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
 ; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
 ; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
-define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/call.ll b/llvm/test/CodeGen/AMDGPU/call.ll
index 6d101e1..769c7bb 100644
--- a/llvm/test/CodeGen/AMDGPU/call.ll
+++ b/llvm/test/CodeGen/AMDGPU/call.ll
@@ -10,7 +10,7 @@
 
 declare i32 @external_function(i32) nounwind
 
-define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -25,7 +25,7 @@
   ret i32 %y
 }
 
-define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -35,7 +35,7 @@
   ret void
 }
 
-define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index e2e4f16..1760141 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}store_fi_lifetime:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %b = alloca i8
   call void @llvm.lifetime.start(i64 1, i8* %b)
@@ -18,7 +18,7 @@
 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
-define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
   %tmp = alloca float
   store float 4.0, float *%tmp
   store float* %tmp, float* addrspace(3)* %ptr
@@ -38,7 +38,7 @@
 
 ; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
 ; GCN: ds_write_b32  [[VLDSPTR]], [[FI1]]
-define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   store float 4.0, float* %tmp0
@@ -54,7 +54,7 @@
 ; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-define void @stored_fi_to_self() #0 {
+define amdgpu_kernel void @stored_fi_to_self() #0 {
   %tmp = alloca i32*
 
   ; Avoid optimizing everything out
@@ -73,7 +73,7 @@
 
 ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
 ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
-define void @stored_fi_to_self_offset() #0 {
+define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
   %tmp0 = alloca [512 x i32]
   %tmp1 = alloca i32*
 
@@ -98,7 +98,7 @@
 
 ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
-define void @stored_fi_to_fi() #0 {
+define amdgpu_kernel void @stored_fi_to_fi() #0 {
   %tmp0 = alloca i32*
   %tmp1 = alloca i32*
   %tmp2 = alloca i32*
@@ -118,7 +118,7 @@
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
   %tmp = alloca float
   store float 0.0, float *%tmp
   store float* %tmp, float* addrspace(1)* %ptr
@@ -136,7 +136,7 @@
 
 ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   %tmp2 = alloca float
@@ -163,7 +163,7 @@
 ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
 ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca [4096 x i32]
   %tmp1 = alloca [4096 x i32]
   %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
@@ -186,7 +186,7 @@
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
 entry:
   %b = alloca i32, align 4
   %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 03d8cd9..697f26b 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -7,7 +7,7 @@
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
 ; GCN: s_endpgm
-define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   %cmp = icmp eq i32 %n, -1
   br i1 %cmp, label %for.exit, label %for.body
@@ -31,7 +31,7 @@
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
-define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -52,7 +52,7 @@
 ; GCN-LABEL: {{^}}loop_const_false:
 ; GCN-NOT: s_branch
 ; GCN: s_endpgm
-define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -74,7 +74,7 @@
 ; GCN-LABEL: {{^}}loop_const_undef:
 ; GCN-NOT: s_branch
 ; GCN: s_endpgm
-define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -104,7 +104,7 @@
 ; GCN: s_cbranch_vccnz [[LOOPBB]]
 ; GCN-NEXT: ; BB#2
 ; GCN-NEXT: s_endpgm
-define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
 entry:
   br label %for.body
 
diff --git a/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
index 75b87e4..53fe897 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
@@ -35,7 +35,7 @@
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested3:
-define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -68,7 +68,7 @@
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested4:
-define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -109,7 +109,7 @@
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested7:
-define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -174,7 +174,7 @@
 ; BUG32: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested8:
-define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
diff --git a/llvm/test/CodeGen/AMDGPU/cf_end.ll b/llvm/test/CodeGen/AMDGPU/cf_end.ll
index c74ee22..3c990e0 100644
--- a/llvm/test/CodeGen/AMDGPU/cf_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf_end.ll
@@ -4,6 +4,6 @@
 
 ; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
 ; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
-define void @eop() {
+define amdgpu_kernel void @eop() {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 6db9a07..ac35dd0 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -11,7 +11,7 @@
 ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
 ; GCN: flat_load_dword
 ; GCN: {{^}}BB0_2:
-define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -43,7 +43,7 @@
 
 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
 ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -76,7 +76,7 @@
 
 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32:
 ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index b14797c..0ebff89 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -15,7 +15,7 @@
 
 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
 ; GCN: {{^}}BB0_2:
-define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
@@ -45,7 +45,7 @@
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 ; GCN: {{^}}BB1_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
@@ -72,7 +72,7 @@
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
 ; GCN: {{^}}BB2_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
@@ -99,7 +99,7 @@
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 ; GCN: {{^}}BB3_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
@@ -131,7 +131,7 @@
 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
 ; GCN: {{^}}BB4_2:
-define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -172,7 +172,7 @@
 ; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
 ; GCN: {{^BB[0-9]+}}_2:
 
-define void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -209,7 +209,7 @@
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 ; GCN: {{^BB[0-9]+}}_2:
-define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -241,7 +241,7 @@
 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN: {{^BB[0-9]+}}_2:
-define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
 entry:
   %offset.ext = zext i32 %offset to i64
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@@ -271,7 +271,7 @@
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
@@ -300,7 +300,7 @@
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
@@ -333,7 +333,7 @@
 
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
@@ -365,7 +365,7 @@
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
@@ -396,7 +396,7 @@
 ; GCN: s_addc_u32
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
@@ -426,7 +426,7 @@
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
@@ -464,7 +464,7 @@
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
@@ -494,7 +494,7 @@
 ; GCN: s_load_dword [[SREG1:s[0-9]+]],
 ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
-define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
   %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
@@ -521,7 +521,7 @@
 ; OPT: if:
 ; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64
 ; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095
-define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
   %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095
@@ -548,7 +548,7 @@
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %sunkaddr2, i32 2 seq_cst
-define void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -574,7 +574,7 @@
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %sunkaddr2, i32 undef, i32 2 seq_cst monotonic
-define void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -600,7 +600,7 @@
 ; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
 ; OPT: br i1
 ; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
-define void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -627,7 +627,7 @@
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2)
-define void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -653,7 +653,7 @@
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2)
-define void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index a92f691..53adf09 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -36,7 +36,7 @@
 ; GCN: BB0_3:
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
 entry:
   %shr = lshr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
@@ -76,7 +76,7 @@
 ; OPT: ret
 
 ; GCN-LABEL: {{^}}sink_sbfe_i32:
-define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
 entry:
   %shr = ashr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
@@ -134,7 +134,7 @@
 ; GCN: BB2_3:
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
 entry:
   %shr = lshr i16 %arg1, 4
   br i1 undef, label %bb0, label %bb1
@@ -187,7 +187,7 @@
 
 ; GCN: BB3_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 30
   br i1 undef, label %bb0, label %bb1
@@ -236,7 +236,7 @@
 
 ; GCN: BB4_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 15
   br i1 undef, label %bb0, label %bb1
@@ -283,7 +283,7 @@
 
 ; GCN: BB5_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 35
   br i1 undef, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 9b4b61c..208d97f 100644
--- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -8,7 +8,7 @@
 ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
+define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
   %c = icmp eq i32 %arg3, 0
@@ -35,7 +35,7 @@
 ; GCN-NOT: vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
+define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
   %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 4c7875c..3e1b76a 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -13,7 +13,7 @@
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
 ; CHECK: ; NumVgprs: 8
-define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
+define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
   br i1 %cmpflag, label %loop, label %exit
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
index 5851720..155de53 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
@@ -8,7 +8,7 @@
 ; SI-LLC-LABEL: {{^}}test:
 ; SI-LLC: s_mul_i32
 ; SI-LLC-NOT: mul
-define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
 entry:
   %0 = mul nsw i32 %a, 3
   %1 = sext i32 %0 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
index 01572af..f8d4e01 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
@@ -12,7 +12,7 @@
 ; EG-LABEL: {{^}}combine_vloads:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
 entry:
   br label %for.body
 
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index 43a0e58..973c454 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -8,7 +8,7 @@
 
 ; GCN-LABEL: {{^}}commute_eq_64_i32:
 ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -21,7 +21,7 @@
 
 ; GCN-LABEL: {{^}}commute_ne_64_i32:
 ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -36,7 +36,7 @@
 ; GCN-LABEL: {{^}}commute_ne_litk_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -49,7 +49,7 @@
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -62,7 +62,7 @@
 
 ; GCN-LABEL: {{^}}commute_uge_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
-define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -75,7 +75,7 @@
 
 ; GCN-LABEL: {{^}}commute_ult_64_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -88,7 +88,7 @@
 
 ; GCN-LABEL: {{^}}commute_ule_63_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -104,7 +104,7 @@
 ; GCN-LABEL: {{^}}commute_ule_64_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -117,7 +117,7 @@
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
-define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -130,7 +130,7 @@
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
-define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -143,7 +143,7 @@
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
-define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -156,7 +156,7 @@
 
 ; GCN-LABEL: {{^}}commute_sle_5_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
-define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -173,7 +173,7 @@
 
 ; GCN-LABEL: {{^}}commute_eq_64_i64:
 ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -186,7 +186,7 @@
 
 ; GCN-LABEL: {{^}}commute_ne_64_i64:
 ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -199,7 +199,7 @@
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -212,7 +212,7 @@
 
 ; GCN-LABEL: {{^}}commute_uge_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -225,7 +225,7 @@
 
 ; GCN-LABEL: {{^}}commute_ult_64_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -238,7 +238,7 @@
 
 ; GCN-LABEL: {{^}}commute_ule_63_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -254,7 +254,7 @@
 ; GCN-LABEL: {{^}}commute_ule_64_i64:
 ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -267,7 +267,7 @@
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -280,7 +280,7 @@
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -293,7 +293,7 @@
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -306,7 +306,7 @@
 
 ; GCN-LABEL: {{^}}commute_sle_5_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -324,7 +324,7 @@
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -338,7 +338,7 @@
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -351,7 +351,7 @@
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -364,7 +364,7 @@
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -377,7 +377,7 @@
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -390,7 +390,7 @@
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -403,7 +403,7 @@
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f32:
 ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -416,7 +416,7 @@
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -429,7 +429,7 @@
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -442,7 +442,7 @@
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -455,7 +455,7 @@
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -468,7 +468,7 @@
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -481,7 +481,7 @@
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -494,7 +494,7 @@
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f32:
 ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -512,7 +512,7 @@
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
 ; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -526,7 +526,7 @@
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
 ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -539,7 +539,7 @@
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f64:
 ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -552,7 +552,7 @@
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f64:
 ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -565,7 +565,7 @@
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f64:
 ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -578,7 +578,7 @@
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f64:
 ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -591,7 +591,7 @@
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f64:
 ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -604,7 +604,7 @@
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
 ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -617,7 +617,7 @@
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
 ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -630,7 +630,7 @@
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f64:
 ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -643,7 +643,7 @@
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f64:
 ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -656,7 +656,7 @@
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f64:
 ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -669,7 +669,7 @@
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f64:
 ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -682,7 +682,7 @@
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f64:
 ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -703,7 +703,7 @@
 
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
-define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %stack0 = alloca i32
   %ptr0 = load volatile i32*, i32* addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
index ed4ec82..8820e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -8,7 +8,7 @@
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -22,7 +22,7 @@
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -37,7 +37,7 @@
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -53,7 +53,7 @@
 ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -68,7 +68,7 @@
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -85,7 +85,7 @@
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -102,7 +102,7 @@
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -121,7 +121,7 @@
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -139,7 +139,7 @@
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -161,7 +161,7 @@
 ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
 ; SI: buffer_store_dword [[RESULT]]
-define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
index 2e6be5d..7394842 100644
--- a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -8,7 +8,7 @@
 ; value if we want to ensure scratch memory is not being used.
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
   %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -17,7 +17,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v2i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
   %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -26,7 +26,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v4i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
   %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
   ret void
@@ -35,7 +35,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v8i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
   %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
   ret void
@@ -44,7 +44,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v16i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
   %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
   ret void
@@ -53,7 +53,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v1f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
   %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
   store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -62,7 +62,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v2f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -71,7 +71,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v4f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -80,7 +80,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v8f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -89,7 +89,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v16f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
   ret void
@@ -98,7 +98,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v1i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -107,7 +107,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v2i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -116,7 +116,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v4i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -125,7 +125,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v8i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
@@ -134,7 +134,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v16i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
@@ -143,7 +143,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v1f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -152,7 +152,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v2f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -161,7 +161,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v4f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -170,7 +170,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v8f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
@@ -179,7 +179,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v16f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
@@ -188,7 +188,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v1i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
   %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
   ret void
@@ -197,7 +197,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v2i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
   %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
   ret void
@@ -206,7 +206,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v4i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
   %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
   ret void
@@ -215,7 +215,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v8i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
   %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
   ret void
@@ -224,7 +224,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v16i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
   %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
   ret void
@@ -233,7 +233,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v32i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
   %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
   store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
   ret void
@@ -242,7 +242,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v1i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
   %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
   ret void
@@ -251,7 +251,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v2i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
   %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
   ret void
@@ -260,7 +260,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v4i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
   %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
   ret void
@@ -269,7 +269,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v8i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
   %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
   ret void
@@ -278,7 +278,7 @@
 ; FUNC-LABEL: {{^}}test_concat_v16i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
   %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
   ret void
@@ -286,7 +286,7 @@
 
 ; FUNC-LABEL: {{^}}concat_vector_crash:
 ; SI: s_endpgm
-define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 bb:
   %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
   %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 34bb258..62b47be 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -1,12 +1,12 @@
 # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
 --- |
-  define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %and = and i32 %a, 1234567
     store volatile i32 %and, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -17,13 +17,13 @@
     ret void
   }
 
-  define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %shl = shl i32 %a, 12
     store volatile i32 %shl, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -34,13 +34,13 @@
     ret void
   }
 
-  define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %ashr = ashr i32 %a, 12
     store volatile i32 %ashr, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -51,13 +51,13 @@
     ret void
   }
 
-   define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %lshr = lshr i32 %a, 12
     store volatile i32 %lshr, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 0ff75ab..0831d25 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -5,7 +5,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
@@ -17,7 +17,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
   store i32 %and, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
@@ -42,7 +42,7 @@
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
   store i32 %or, i32 addrspace(1)* %out
@@ -53,7 +53,7 @@
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
@@ -67,7 +67,7 @@
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
   store i32 %xor, i32 addrspace(1)* %out
@@ -78,7 +78,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, -1
   store i32 %xor, i32 addrspace(1)* %out
@@ -91,7 +91,7 @@
 ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
 ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
 ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
   %vreg = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg)
   %xor = xor i64 %ctpop, -1
@@ -110,7 +110,7 @@
 ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
   %vreg0 = load volatile i64, i64 addrspace(1)* undef
   %vreg1 = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
@@ -126,7 +126,7 @@
 ; GCN: v_not_b32
 ; GCN: v_and_b32
 ; GCN-NOT: v_and_b32
-define void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
   %vreg0 = load volatile i64, i64 addrspace(1)* undef
   %vreg1 = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 013e89f..d3e6c11 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -72,7 +72,7 @@
 ; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-define void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
@@ -150,7 +150,7 @@
 ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-define void @divergent_loop(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
@@ -272,7 +272,7 @@
 
 ; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
-define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
index c3d155ce..0074a41 100644
--- a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -6,7 +6,7 @@
 ; GCN: v_cmp_ne_u32_e64
 ; GCN: ; mask branch
 ; GCN: BB{{[0-9]+_[0-9]+}}:
-define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1
@@ -30,7 +30,7 @@
 
 ; GCN: BB{{[0-9]+_[0-9]+}}:
 
-define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 7434d74..026dd7c 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -8,7 +8,7 @@
 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
@@ -19,7 +19,7 @@
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -32,7 +32,7 @@
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -47,7 +47,7 @@
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -65,7 +65,7 @@
 ; GCN-DAG: buffer_store_dword
 
 ; GCN: s_endpgm
-define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@@ -85,7 +85,7 @@
 ; GCN: {{buffer|flat}}_store_dword
 ; GCN: {{buffer|flat}}_store_dword
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -101,7 +101,7 @@
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
   ret void
@@ -113,7 +113,7 @@
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
   ret void
@@ -128,7 +128,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
   ret void
@@ -141,7 +141,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
@@ -157,7 +157,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
index 3422a88..f35b070 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -6,7 +6,7 @@
 
 ; Make sure this doesn't crash
 ; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
-define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %alloca = alloca [16 x i32]
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 1a0027d..709c647 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -27,7 +27,7 @@
 
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
@@ -43,7 +43,7 @@
 
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -61,7 +61,7 @@
 ; EG: CNDE_INT
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
   store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -89,7 +89,7 @@
 
 ; EG-DAG: FFBH_UINT
 ; EG-DAG: CNDE_INT
-define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -101,7 +101,7 @@
 ; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_byte [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
   store i8 %ctlz, i8 addrspace(1)* %out
@@ -119,14 +119,14 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:
-define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %ctlz to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -145,7 +145,7 @@
 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
 ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -156,7 +156,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:
-define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -172,7 +172,7 @@
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
- define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -186,7 +186,7 @@
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -202,7 +202,7 @@
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp eq i32 %ctlz, 32
@@ -217,7 +217,7 @@
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp ne i32 %ctlz, 32
@@ -230,7 +230,7 @@
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: {{buffer|flat}}_store_byte [[FFBH]],
- define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
   %val = load i8, i8 addrspace(1)* %valptr.gep
@@ -245,7 +245,7 @@
 ; SI: buffer_load_ushort [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_short [[FFBH]],
- define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
   %val = load i16, i16 addrspace(1)* %valptr
   %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
   %cmp = icmp eq i16 %val, 0
@@ -260,7 +260,7 @@
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
 ; GCN: {{buffer|flat}}_store_byte [[TRUNC]],
-define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
   %val = load i7, i7 addrspace(1)* %valptr.gep
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index d390f64..87ba563 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -22,7 +22,7 @@
 ; GCN: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@
 ; GCN: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -51,7 +51,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -71,7 +71,7 @@
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -82,7 +82,7 @@
 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_byte [[RESULT]],
-define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
   store i8 %ctlz, i8 addrspace(1)* %out
@@ -100,14 +100,14 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
-define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -123,7 +123,7 @@
 ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -134,7 +134,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
-define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -149,7 +149,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
- define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -162,7 +162,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
-define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -175,7 +175,7 @@
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: {{buffer|flat}}_store_byte [[FFBH]],
-define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
   %val = load i8, i8 addrspace(1)* %valptr.gep
@@ -194,7 +194,7 @@
 ; GCN-DAG: buffer_store_dword [[RESULT0]]
 ; GCN-DAG: buffer_store_byte [[RESULT1]]
 ; GCN: s_endpgm
- define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -211,7 +211,7 @@
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
- define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -227,7 +227,7 @@
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -243,7 +243,7 @@
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
- define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 1
@@ -259,7 +259,7 @@
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll
index 9692236..a29e72e 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll
@@ -16,7 +16,7 @@
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   store i32 %ctpop, i32 addrspace(1)* %out, align 4
   ret void
@@ -30,7 +30,7 @@
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   store i32 %ctpop, i32 addrspace(1)* %out, align 4
@@ -48,7 +48,7 @@
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
+define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
   %val0 = load i32, i32 addrspace(1)* %in0, align 4
   %val1 = load i32, i32 addrspace(1)* %in1, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
@@ -64,7 +64,7 @@
 ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
   %val0 = load i32, i32 addrspace(1)* %in0, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %add = add i32 %ctpop0, %sval
@@ -79,7 +79,7 @@
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -97,7 +97,7 @@
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -123,7 +123,7 @@
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
@@ -165,7 +165,7 @@
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
   %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
   store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
@@ -179,7 +179,7 @@
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 4
@@ -194,7 +194,7 @@
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 4, %ctpop
@@ -209,7 +209,7 @@
 ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 99999
@@ -225,7 +225,7 @@
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, %const
@@ -241,7 +241,7 @@
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %const, %ctpop
@@ -258,7 +258,7 @@
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
@@ -279,7 +279,7 @@
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 ; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index cd5d805..2610684 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -17,7 +17,7 @@
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
 ; GCN: s_endpgm
-define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -31,7 +31,7 @@
 ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
@@ -48,7 +48,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
+define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %or = or i64 %ctpop, %s.val
@@ -60,7 +60,7 @@
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
   store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -73,7 +73,7 @@
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
   store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -104,7 +104,7 @@
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -121,7 +121,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
 ; GCN: s_endpgm
-define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
@@ -146,7 +146,7 @@
 ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
 ; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
 ; GCN: s_endpgm
-define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -159,7 +159,7 @@
 ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
 ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
 ; GCN: s_endpgm
-define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
   %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
   %truncctpop = trunc i65 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -181,7 +181,7 @@
 
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
   %val = load i128, i128 addrspace(1)* %in, align 8
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index e33cc18..1fa6407 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -14,7 +14,7 @@
 ; SI: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@
 ; SI: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
@@ -43,7 +43,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
@@ -63,7 +63,7 @@
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/cube.ll b/llvm/test/CodeGen/AMDGPU/cube.ll
index 3c126a4..7b5f1af 100644
--- a/llvm/test/CodeGen/AMDGPU/cube.ll
+++ b/llvm/test/CodeGen/AMDGPU/cube.ll
@@ -12,7 +12,7 @@
 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: _store_dwordx4
-define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
   %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
   %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 7baaa81..e16daa6 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -10,7 +10,7 @@
 ; GCN-NOT: lshr
 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
 ; GCN: buffer_store_dword [[CONV]],
-define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8, i8 addrspace(1)* %in, align 1
   %cvt = uitofp i8 %load to float
   store float %cvt, float addrspace(1)* %out, align 4
@@ -22,7 +22,7 @@
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
   %cvt = uitofp <2 x i8> %load to <2 x float>
   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
@@ -36,7 +36,7 @@
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <3 x i8> %load to <3 x float>
   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
@@ -52,7 +52,7 @@
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -76,7 +76,7 @@
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
 
 ; GCN: buffer_store_dwordx4
-define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -110,7 +110,7 @@
 ; GCN: {{buffer|flat}}_store_dword
 
 ; GCN: s_endpgm
-define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -124,7 +124,7 @@
 ; Make sure this doesn't crash.
 ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
 ; GCN: s_endpgm
-define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <7 x i8> %load to <7 x float>
   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
@@ -147,7 +147,7 @@
 ; GCN-NOT: lshr
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
   %cvt = uitofp <8 x i8> %load to <8 x float>
   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
@@ -159,7 +159,7 @@
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
 ; GCN: buffer_store_dword [[CONV]],
-define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 2
   %inreg = and i32 %add, 255
@@ -169,7 +169,7 @@
 }
 
 ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
-define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %inreg = and i32 %load, 65280
   %shr = lshr i32 %inreg, 8
@@ -181,7 +181,7 @@
 ; We don't get these ones because of the zext, but instcombine removes
 ; them so it shouldn't really matter.
 ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
-define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8, i8 addrspace(1)* %in, align 1
   %ext = zext i8 %load to i32
   %cvt = uitofp i32 %ext to float
@@ -190,7 +190,7 @@
 }
 
 ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
-define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %ext = zext <4 x i8> %load to <4 x i32>
   %cvt = uitofp <4 x i32> %ext to <4 x float>
@@ -203,7 +203,7 @@
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %and = and i32 %val, 255
   %cvt = uitofp i32 %and to float
@@ -216,7 +216,7 @@
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 8
   %and = and i32 %srl, 255
@@ -230,7 +230,7 @@
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 16
   %and = and i32 %srl, 255
@@ -244,7 +244,7 @@
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 24
   %and = and i32 %srl, 255
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
index e7773c6..c10cf1a 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
@@ -10,7 +10,7 @@
 ; SI-NOT: add
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptosi float %floor to i32
   store i32 %cvt, i32 addrspace(1)* %out
@@ -22,7 +22,7 @@
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 1.0
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
@@ -35,7 +35,7 @@
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %floor = call float @llvm.floor.f32(float %x.fabs) #1
   %cvt = fptosi float %floor to i32
@@ -48,7 +48,7 @@
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %floor = call float @llvm.floor.f32(float %x.fneg) #1
   %cvt = fptosi float %floor to i32
@@ -61,7 +61,7 @@
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
@@ -75,7 +75,7 @@
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32_e32
 ; SI: s_endpgm
-define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptoui float %floor to i32
   store i32 %cvt, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
index d38411d..9b771eb 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
@@ -9,7 +9,7 @@
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
@@ -21,7 +21,7 @@
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %fadd = fadd float %x.fabs, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
@@ -37,7 +37,7 @@
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %fadd = fadd float %x.fneg, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
@@ -55,7 +55,7 @@
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %fadd = fadd float %x.fabs.fneg, 0.5
@@ -71,7 +71,7 @@
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32
 ; SI: s_endpgm
-define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptoui float %floor to i32
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
index a32c16d..11acbc2 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -9,7 +9,7 @@
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 
-define void @store_same_base_ptr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #0
   %offset = sext i32 %id to i64
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
index fb43ff4..ceff889 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -10,7 +10,7 @@
 ; CHECK: {{^}}sint:
 ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %sint = load i32, i32 addrspace(1) * %in
@@ -24,7 +24,7 @@
 ;CHECK: {{^}}uint:
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %uint = load i32, i32 addrspace(1) * %in
diff --git a/llvm/test/CodeGen/AMDGPU/debug.ll b/llvm/test/CodeGen/AMDGPU/debug.ll
index a2e0e87..f149aad 100644
--- a/llvm/test/CodeGen/AMDGPU/debug.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug.ll
@@ -4,7 +4,7 @@
 ; Test for a crash in the custom assembly dump code.
 
 ; SI: s_endpgm
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
index 49a7e72..734905b 100644
--- a/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -23,7 +23,7 @@
 ; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll b/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll
index 7be7d94..fcdbfb1 100644
--- a/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll
+++ b/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll
@@ -22,7 +22,7 @@
 ; CHECK-NEXT: s_endpgm
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
index d63ba14..764c60b 100644
--- a/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
@@ -6,7 +6,7 @@
 ; CHECK-NEXT: ReservedVGPRCount: 4
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
index bfb692f..ad9111a 100644
--- a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}test_default_si:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -12,7 +12,7 @@
 ; GCN-LABEL: {{^}}test_default_vi:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -21,7 +21,7 @@
 ; GCN-LABEL: {{^}}test_f64_denormals:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -30,7 +30,7 @@
 ; GCN-LABEL: {{^}}test_f32_denormals:
 ; GCN: FloatMode: 48
 ; GCN: IeeeMode: 1
-define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -39,7 +39,7 @@
 ; GCN-LABEL: {{^}}test_f32_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -48,7 +48,7 @@
 ; GCN-LABEL: {{^}}test_no_denormals
 ; GCN: FloatMode: 0
 ; GCN: IeeeMode: 1
-define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -57,7 +57,7 @@
 ; GCN-LABEL: {{^}}test_f16_f64_denormals:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
+define amdgpu_kernel void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
   store half 0.0, half addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -66,7 +66,7 @@
 ; GCN-LABEL: {{^}}test_no_f16_f64_denormals:
 ; GCN: FloatMode: 0
 ; GCN: IeeeMode: 1
-define void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
+define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
   store half 0.0, half addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -75,7 +75,7 @@
 ; GCN-LABEL: {{^}}test_f32_f16_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
+define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
   store half 0.0, half addrspace(1)* %out0
   store float 0.0, float addrspace(1)* %out1
   store double 0.0, double addrspace(1)* %out2
diff --git a/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index 9d70f67..32e6f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -1,14 +1,14 @@
 # RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
-  define void @test2() { ret void }
-  define void @test3() { ret void }
-  define void @test4() { ret void }
-  define void @test5() { ret void }
-  define void @loop0() { ret void }
-  define void @loop1() { ret void }
-  define void @loop2() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
+  define amdgpu_kernel void @test2() { ret void }
+  define amdgpu_kernel void @test3() { ret void }
+  define amdgpu_kernel void @test4() { ret void }
+  define amdgpu_kernel void @test5() { ret void }
+  define amdgpu_kernel void @loop0() { ret void }
+  define amdgpu_kernel void @loop1() { ret void }
+  define amdgpu_kernel void @loop2() { ret void }
 ...
 ---
 # Combined use/def transfer check, the basics.
diff --git a/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
index cdd2c0c..6dfe129 100644
--- a/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
@@ -9,7 +9,7 @@
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK-NEXT: JUMP
 ; CHECK-NEXT: LOOP_BREAK
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
+define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
   br i1 %cmp5, label %for.body, label %for.end
diff --git a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 5e1ebfd..878b5eb 100644
--- a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -9,7 +9,7 @@
 ; GCN: buffer_load_dword
 ; GCN: ds_write2_b32
 ; GCN: s_endpgm
-define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
+define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
 entry:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx = shl i32 %tid, 2
diff --git a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index f461d69..5997e27 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -23,7 +23,7 @@
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
 ; CHECK: s_endpgm
-define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
 entry:
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %mul = shl nsw i32 %x.i, 1
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 16fb019..d74bd5a 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -9,7 +9,7 @@
 ; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
 ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
 ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
-define void @write_ds_sub0_offset0_global() #0 {
+define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %sub1 = sub i32 0, %x.i
@@ -24,7 +24,7 @@
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
 ; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535
-define void @add_x_shl_neg_to_sub_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -39,7 +39,7 @@
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
 ; GCN: ds_write_b8 [[NEG]], [[K]]{{$}}
-define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -58,7 +58,7 @@
 ; GCN-NOT: v_sub
 ; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}}
 ; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -80,7 +80,7 @@
 ; GCN-NOT: v_sub
 ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}}
 ; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -95,7 +95,7 @@
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
 ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -109,7 +109,7 @@
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]]
 ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9a31323..2c474db 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -12,7 +12,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -31,7 +31,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -49,7 +49,7 @@
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
 ; SI: s_endpgm
-define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -66,7 +66,7 @@
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -98,7 +98,7 @@
 ; SI: s_barrier
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -133,7 +133,7 @@
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -170,7 +170,7 @@
 ; SI: ds_read_b32
 ; SI: ds_read_b32
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -196,7 +196,7 @@
 ; SI: ds_read_b32
 ; SI: ds_read_b32
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -219,7 +219,7 @@
 ; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
 ; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
   %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
@@ -243,7 +243,7 @@
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
@@ -261,7 +261,7 @@
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -280,7 +280,7 @@
 ; SI-LABEL: @unaligned_read2_f32
 ; SI-NOT: ds_read2_b32
 ; SI: s_endpgm
-define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 1
@@ -296,7 +296,7 @@
 ; SI-LABEL: @misaligned_2_simple_read2_f32
 ; SI-NOT: ds_read2_b32
 ; SI: s_endpgm
-define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 2
@@ -315,7 +315,7 @@
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f64(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -331,7 +331,7 @@
 ; SI-LABEL: @simple_read2_f64_max_offset
 ; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
 ; SI: s_endpgm
-define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -349,7 +349,7 @@
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
 ; SI: s_endpgm
-define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -367,7 +367,7 @@
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
 ; SI: s_endpgm
-define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -385,7 +385,7 @@
 ; SI-LABEL: @load_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   %sum = add i32 %val0, %val1
@@ -396,7 +396,7 @@
 ; SI-LABEL: @load_constant_disjoint_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
-define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   %sum = add i32 %val0, %val1
@@ -410,7 +410,7 @@
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
-define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
@@ -426,7 +426,7 @@
 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
 ; SI: s_endpgm
-define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   %sum = add i64 %val0, %val1
@@ -437,7 +437,7 @@
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
   %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
@@ -481,13 +481,13 @@
   ret void
 }
 
-define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
   %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
   store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
-define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
   %load = load i64, i64 addrspace(3)* %in, align 4
   store i64 %load, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 4a3f3fb..9668743 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -10,7 +10,7 @@
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
-define void @offset_order(float addrspace(1)* %out) {
+define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
 entry:
   %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
   %val0 = load float, float addrspace(3)* %ptr0
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 9d8375d..fc85ec0 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -12,7 +12,7 @@
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds  [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
   %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
@@ -26,7 +26,7 @@
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
   %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
@@ -43,7 +43,7 @@
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
@@ -68,7 +68,7 @@
 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
-define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
   %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
@@ -88,7 +88,7 @@
 ; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
@@ -101,7 +101,7 @@
 ; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
@@ -117,7 +117,7 @@
 ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
 ; CI: s_endpgm
-define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
   %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
@@ -138,7 +138,7 @@
 ; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
 ; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
 ; CI: s_endpgm
-define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
   %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
@@ -153,7 +153,7 @@
 ; CI-NOT: v_mov
 ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
@@ -176,7 +176,7 @@
 ; CI-NOT: v_mov
 ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
index 149b4ce..81b35a4 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -10,7 +10,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -29,7 +29,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -49,7 +49,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -69,7 +69,7 @@
 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
 ; SI: s_endpgm
-define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -86,7 +86,7 @@
 ; SI-LABEL: @odd_invalid_read2st64_f32_0
 ; SI-NOT: ds_read2st64_b32
 ; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -102,7 +102,7 @@
 ; SI-LABEL: @odd_invalid_read2st64_f32_1
 ; SI-NOT: ds_read2st64_b32
 ; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
@@ -122,7 +122,7 @@
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -141,7 +141,7 @@
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -161,7 +161,7 @@
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
 ; SI: s_endpgm
-define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -181,7 +181,7 @@
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 256
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -201,7 +201,7 @@
 ; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -218,7 +218,7 @@
 ; SI-LABEL: @invalid_read2st64_f64_odd_offset
 ; SI-NOT: ds_read2st64_b64
 ; SI: s_endpgm
-define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -239,7 +239,7 @@
 ; SI-NOT: ds_read2st_b64
 ; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
 ; SI: s_endpgm
-define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index ae230da..ab1cf0b 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -9,7 +9,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %val = load float, float addrspace(1)* %in.gep, align 4
@@ -27,7 +27,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -65,7 +65,7 @@
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -86,7 +86,7 @@
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
@@ -107,7 +107,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
   %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
@@ -126,7 +126,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
   %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
@@ -146,7 +146,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
-define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -164,7 +164,7 @@
 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
 ; SI: s_endpgm
-define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -182,7 +182,7 @@
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -212,7 +212,7 @@
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -243,7 +243,7 @@
 ; SI: ds_write_b32
 ; SI: ds_write_b32
 ; SI: s_endpgm
-define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -270,7 +270,7 @@
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
 ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
@@ -288,7 +288,7 @@
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
 ; SI: s_endpgm
-define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
@@ -306,7 +306,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
 ; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -325,7 +325,7 @@
 ; SI-LABEL: @store_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-define void @store_constant_adjacent_offsets() {
+define amdgpu_kernel void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   ret void
@@ -335,7 +335,7 @@
 ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
 ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
-define void @store_constant_disjoint_offsets() {
+define amdgpu_kernel void @store_constant_disjoint_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   ret void
@@ -348,7 +348,7 @@
 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; SI: s_endpgm
-define void @store_misaligned64_constant_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   ret void
@@ -362,7 +362,7 @@
 ; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: s_endpgm
-define void @store_misaligned64_constant_large_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   ret void
@@ -371,7 +371,7 @@
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
   %val = load float, float addrspace(1)* %in
@@ -410,7 +410,7 @@
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
 ; CI: s_endpgm
-define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
   %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
index 872e773..a395af3 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -7,7 +7,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
 ; SI: s_endpgm
-define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %val = load float, float addrspace(1)* %in.gep, align 4
@@ -25,7 +25,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -66,7 +66,7 @@
 ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
 ; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -85,7 +85,7 @@
 ; SI-NOT: ds_write2st64_b64
 ; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
 ; SI: s_endpgm
-define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 580dc00..b1107ea 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -4,7 +4,7 @@
 
 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
 
-define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
   %alloca = alloca i32, i32 %n
   store volatile i32 0, i32* %alloca
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index d1624f8..ace0159 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -10,7 +10,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 entry:
   %v = load double, double addrspace(1)* %in
   %cc = fcmp oeq double %v, 1.000000e+00
@@ -32,7 +32,7 @@
 ; GCN: v_add_f64
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
-define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
 entry:
   %v = load double, double addrspace(2)* %in
   %cc = fcmp oeq double %v, 1.000000e+00
@@ -62,7 +62,7 @@
 
 ; GCN-DAG: buffer_store_dword v
 ; GCN-DAG: buffer_store_dwordx2
-define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -93,7 +93,7 @@
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 
 ; GCN: buffer_store_dwordx4
-define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
index 5ae1db8..9439130 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -9,7 +9,7 @@
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -32,7 +32,7 @@
 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -58,7 +58,7 @@
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
 ; GCN: s_mov_b64 vcc, [[CMP]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
 entry:
   %v = load i32, i32 addrspace(1)* %in
   %cc = fcmp oeq float %k, 1.000000e+00
@@ -87,7 +87,7 @@
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 ; GCN: v_cndmask_b32_e32
-define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -128,7 +128,7 @@
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -162,7 +162,7 @@
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -187,7 +187,7 @@
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
 entry:
   %v = load i32, i32 addrspace(2)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -206,7 +206,7 @@
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 ; GCN: v_cndmask_b32
-define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
 entry:
   %v = load float, float addrspace(2)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -227,7 +227,7 @@
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
 ; GCN: v_cndmask_b32
-define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
 entry:
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
@@ -248,7 +248,7 @@
 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
-define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load i32, i32 addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -274,7 +274,7 @@
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = icmp eq i32 %cond, 1
@@ -295,7 +295,7 @@
 ; GCN: s_addc_u32
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load i64, i64 addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -320,7 +320,7 @@
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -345,7 +345,7 @@
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -364,7 +364,7 @@
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
-define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -385,7 +385,7 @@
 ; GCN: {{^}}; BB#0:
 ; GCN-NEXT: s_load_dwordx2
 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
-define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   br i1 undef, label %else, label %if
 
@@ -410,7 +410,7 @@
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -435,7 +435,7 @@
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/elf.r600.ll b/llvm/test/CodeGen/AMDGPU/elf.r600.ll
index 51cd085..93c5e55 100644
--- a/llvm/test/CodeGen/AMDGPU/elf.r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf.r600.ll
@@ -9,7 +9,7 @@
 ; CONFIG-NEXT: .long   2
 ; CONFIG-NEXT: .long   165900
 ; CONFIG-NEXT: .long   0
-define void @test(float addrspace(1)* %out, i32 %p) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i32 %p) {
    %i = add i32 %p, 2
    %r = bitcast i32 %i to float
    store float %r, float addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/empty-function.ll b/llvm/test/CodeGen/AMDGPU/empty-function.ll
index a060900..1231cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/empty-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/empty-function.ll
@@ -7,14 +7,14 @@
 ; SI-LABEL: {{^}}empty_function_ret:
 ; SI: s_endpgm
 ; SI: codeLenInByte = 4
-define void @empty_function_ret() #0 {
+define amdgpu_kernel void @empty_function_ret() #0 {
   ret void
 }
 
 ; SI: .text
 ; SI-LABEL: {{^}}empty_function_unreachable:
 ; SI: codeLenInByte = 0
-define void @empty_function_unreachable() #0 {
+define amdgpu_kernel void @empty_function_unreachable() #0 {
   unreachable
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
index 76b50b5..6eb1fc1 100644
--- a/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
@@ -9,7 +9,7 @@
 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
 
 ; GCN-UNSAFE-NOT: xor
-define void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
index c670954..bd861e0 100644
--- a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -12,7 +12,7 @@
 ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
 ; CHECK-NOT: s_or_b64 exec, exec
 ; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   %cond = call i32 @llvm.r600.read.tidig.x() #0
   %tmp0 = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
index 8ef54b9..40d115b 100644
--- a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_tahiti
-define void @use_too_many_sgprs_tahiti() #0 {
+define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -20,7 +20,7 @@
 }
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire
-define void @use_too_many_sgprs_bonaire() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -39,7 +39,7 @@
 }
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr
-define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -59,7 +59,7 @@
 }
 
 ; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland
-define void @use_too_many_sgprs_iceland() #2 {
+define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 {
   call void asm sideeffect "", "~{VCC}" ()
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
@@ -77,7 +77,7 @@
 }
 
 ; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji
-define void @use_too_many_sgprs_fiji() #3 {
+define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll b/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
index cf384da..0fa06b8 100644
--- a/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: and_zext:
 ; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
@@ -18,7 +18,7 @@
 ; GCN-LABEL: or_zext:
 ; GCN: v_or_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
@@ -33,7 +33,7 @@
 ; GCN-LABEL: xor_zext:
 ; GCN: v_xor_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/extload-align.ll b/llvm/test/CodeGen/AMDGPU/extload-align.ll
index 9d2eb74..4644800 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-align.ll
@@ -9,7 +9,7 @@
 ; DEBUG: mem:LD2[<unknown>]{{[^(]}}
 ; DEBUG: {{^}}# End machine code for function extload_align.
 
-define void @extload_align(i32* %out, i32 %index) #0 {
+define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 {
   %v0 = alloca [4 x i16]
   %a1 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 0
   %a2 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll
index 505aef3..fd298b3 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}load_i8_sext_private:
 ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i8_sext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
   %tmp1 = load i8, i8* %tmp0
@@ -14,7 +14,7 @@
 
 ; FUNC-LABEL: {{^}}load_i8_zext_private:
 ; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i8_zext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
   %tmp1 = load i8, i8* %tmp0
@@ -25,7 +25,7 @@
 
 ; FUNC-LABEL: {{^}}load_i16_sext_private:
 ; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i16_sext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
   %tmp1 = load i16, i16* %tmp0
@@ -36,7 +36,7 @@
 
 ; FUNC-LABEL: {{^}}load_i16_zext_private:
 ; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i16_zext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
   %tmp1 = load volatile i16, i16* %tmp0
diff --git a/llvm/test/CodeGen/AMDGPU/extload.ll b/llvm/test/CodeGen/AMDGPU/extload.ll
index 8b3e087..a7b8e86 100644
--- a/llvm/test/CodeGen/AMDGPU/extload.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload.ll
@@ -10,7 +10,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32, i32 addrspace(1)* %cast
   %x = bitcast i32 %load to <4 x i8>
@@ -25,7 +25,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32, i32 addrspace(1)* %cast
   %x = bitcast i32 %load to <2 x i16>
@@ -40,7 +40,7 @@
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32, i32 addrspace(3)* %cast
   %x = bitcast i32 %load to <4 x i8>
@@ -55,7 +55,7 @@
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32, i32 addrspace(3)* %cast
   %x = bitcast i32 %load to <2 x i16>
diff --git a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
index 4edff15..be85ca9 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -13,7 +13,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                     <4 x i32> addrspace(1)* noalias %out1,
                                                     i32 addrspace(1)* noalias %out2,
                                                     i32 addrspace(1)* %in) {
@@ -55,7 +55,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                             <4 x i32> addrspace(1)* noalias %out1,
                                                             i32 addrspace(1)* noalias %out2,
                                                             i32 addrspace(1)* %in) {
@@ -99,7 +99,7 @@
 
 ; GCN: buffer_store_dwordx2
 ; GCN: buffer_store_dwordx2
-define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
                                                                      <4 x i32> addrspace(1)* noalias %out1,
                                                                      i64 addrspace(1)* noalias %out2,
                                                                      i32 addrspace(1)* %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 84c7955..1f567ae 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -8,7 +8,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %p0 = extractelement <2 x half> %vec, i32 0
   %p1 = extractelement <2 x half> %vec, i32 1
@@ -26,7 +26,7 @@
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %elt = extractelement <2 x half> %vec, i32 %idx
   store half %elt, half addrspace(1)* %out, align 2
@@ -45,7 +45,7 @@
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
@@ -61,7 +61,7 @@
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -75,7 +75,7 @@
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
   %p0 = extractelement <4 x half> %foo, i32 0
   %p1 = extractelement <4 x half> %foo, i32 2
   %out1 = getelementptr half, half addrspace(1)* %out, i32 10
@@ -95,7 +95,7 @@
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x half> %foo, i32 %idx
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
   store half %p0, half addrspace(1)* %out
@@ -115,7 +115,7 @@
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x half> %foo, i32 %idx
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
   store half %p0, half addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index 4594379..db5bf0b 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -5,7 +5,7 @@
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_load_dwordx2
 ; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
   %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in
   %elt = extractelement <3 x double> %ld, i32 2
   store volatile double %elt, double addrspace(1)* %out
@@ -13,14 +13,14 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
-define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
-define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index 22e04db..9b117d4 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -9,7 +9,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %p0 = extractelement <2 x i16> %vec, i32 0
   %p1 = extractelement <2 x i16> %vec, i32 1
@@ -27,7 +27,7 @@
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt = extractelement <2 x i16> %vec, i32 %idx
   store i16 %elt, i16 addrspace(1)* %out, align 2
@@ -45,7 +45,7 @@
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
@@ -61,7 +61,7 @@
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 0
   %p1 = extractelement <3 x i16> %foo, i32 2
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
@@ -82,7 +82,7 @@
 ; GFX9-DAG: buffer_store_short [[VLOAD0]], off
 ; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]
 ; GFX9-DAG: buffer_store_short [[VLOAD1]], off
-define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
@@ -105,7 +105,7 @@
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p0, i16 addrspace(1)* %out
@@ -131,7 +131,7 @@
 ; GFX9: buffer_store_dword
 ; GFX9: buffer_load_ushort
 ; GFX9: buffer_store_short
-define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p0, i16 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 1df91c9..a8d1278 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -8,7 +8,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
+define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
   %vec = bitcast i64 %val to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 0
   %elt1 = extractelement <2 x i32> %vec, i32 1
@@ -20,7 +20,7 @@
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
-define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
   %p0 = extractelement <2 x i64> %foo, i32 0
   %p1 = extractelement <2 x i64> %foo, i32 1
   %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
@@ -30,14 +30,14 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
-define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
-define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
   %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
   %or = or <2 x i64> %load, %arst
   %dynelt = extractelement <2 x i64> %or, i32 %elt
@@ -46,14 +46,14 @@
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
-define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
-define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 6f4ae82..b7d768f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -4,7 +4,7 @@
 ; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
   %p0 = extractelement <1 x i8> %foo, i32 0
   store i8 %p0, i8 addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
   %p0 = extractelement <2 x i8> %foo, i32 0
   %p1 = extractelement <2 x i8> %foo, i32 1
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -29,7 +29,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 0
   %p1 = extractelement <3 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -43,7 +43,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 0
   %p1 = extractelement <4 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -57,7 +57,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
   %p0 = extractelement <8 x i8> %foo, i32 0
   %p1 = extractelement <8 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -71,7 +71,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
   %p0 = extractelement <16 x i8> %foo, i32 0
   %p1 = extractelement <16 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -85,7 +85,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
   %p0 = extractelement <32 x i8> %foo, i32 0
   %p1 = extractelement <32 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -99,7 +99,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
   %p0 = extractelement <64 x i8> %foo, i32 0
   %p1 = extractelement <64 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -120,7 +120,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i8 %p0, i8 addrspace(1)* %out
@@ -141,7 +141,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i8 %p0, i8 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index e160c20..34999fa 100644
--- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -7,7 +7,7 @@
 ; GCN-DAG: buffer_load_dword [[A:v[0-9]+]]
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
    %a = load i64, i64 addrspace(1)* %in
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x i32>
@@ -20,7 +20,7 @@
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_f64
 ; GCN: buffer_store_dword v
-define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
    %a = load double, double addrspace(1)* %in
    %add = fadd double %a, %b
    %val.bc = bitcast double %add to <2 x i32>
@@ -33,7 +33,7 @@
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_i32
 ; GCN: buffer_store_dword
-define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
    %a = load i64, i64 addrspace(1)* %in
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x float>
@@ -45,7 +45,7 @@
 ; GCN-LABEL: {{^}}no_extract_volatile_load_extract0:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %elt0 = extractelement <4 x i32> %vec, i32 0
@@ -57,7 +57,7 @@
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
 
-define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %elt2 = extractelement <4 x i32> %vec, i32 2
@@ -68,7 +68,7 @@
 ; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %eltN = extractelement <4 x i32> %vec, i32 %idx
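
(Aside, not part of the patch: every hunk in this change flips only the calling convention on the define. In isolation, the distinction being made explicit looks like the following pair; the function names here are invented for illustration.)

; GPU entry point, dispatched from the host with the kernel ABI.
define amdgpu_kernel void @example_kernel(float addrspace(1)* %out) {
  store float 1.0, float addrspace(1)* %out
  ret void
}

; Plain default-CC function, distinct from the kernel entry point above.
define void @example_function(float addrspace(1)* %out) {
  store float 2.0, float addrspace(1)* %out
  ret void
}
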
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 5ff9dcd4..b80fbda 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -11,7 +11,7 @@
 ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 
-define void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
   %bc= bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
   store half %fabs, half addrspace(1)* %out
@@ -22,7 +22,7 @@
 ; CI: flat_load_ushort [[VAL:v[0-9]+]],
 ; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @s_fabs_f16(half addrspace(1)* %out, half %in) {
+define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   store half %fabs, half addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@
 
 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
 ; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
-define void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
   ret void
@@ -68,7 +68,7 @@
 ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 
 ; GCN: flat_store_dwordx2
-define void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   store <4 x half> %fabs, <4 x half> addrspace(1)* %out
   ret void
@@ -87,7 +87,7 @@
 ; VI-NOT: and
 ; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
+define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
   %fabs = call half @llvm.fabs.f16(half %in0)
   %fmul = fmul half %fabs, %in1
   store half %fmul, half addrspace(1)* %out
@@ -97,7 +97,7 @@
 ; GCN-LABEL: {{^}}v_fabs_v2f16:
 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]]
-define void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
@@ -110,7 +110,7 @@
 ; GCN-LABEL: {{^}}fabs_free_v2f16:
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
-define void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
   %bc = bitcast i32 %in to <2 x half>
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -133,7 +133,7 @@
 
 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
-define void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %fmul = fmul <2 x half> %fabs, %val
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
index f7780b87..998e02f 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -10,7 +10,7 @@
 ; FUNC-LABEL: {{^}}v_fabs_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tidext = sext i32 %tid to i64
   %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
@@ -24,7 +24,7 @@
 ; SI: v_and_b32
 ; SI-NOT: v_and_b32
 ; SI: s_endpgm
-define void @fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) {
   %fabs = call double @llvm.fabs.f64(double %in)
   store double %fabs, double addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@
 ; SI: v_and_b32
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
   store <2 x double> %fabs, <2 x double> addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@
 ; SI: v_and_b32
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
   store <4 x double> %fabs, <4 x double> addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
 ; SI: s_endpgm
-define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @llvm.fabs.f64(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
@@ -69,7 +69,7 @@
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
 ; SI: s_endpgm
-define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @fabs(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
@@ -79,7 +79,7 @@
 ; FUNC-LABEL: {{^}}fabs_free_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc= bitcast i64 %in to double
   %fabs = call double @llvm.fabs.f64(double %bc)
   store double %fabs, double addrspace(1)* %out
@@ -89,7 +89,7 @@
 ; FUNC-LABEL: {{^}}fabs_fn_free_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc= bitcast i64 %in to double
   %fabs = call double @fabs(double %bc)
   store double %fabs, double addrspace(1)* %out
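
(Aside: several fabs.f64.ll hunks call a plain @fabs function in addition to the @llvm.fabs.* intrinsics. The declarations those calls rely on live outside the displayed hunks; a sketch of what such a file-level preamble typically looks like follows — the attribute placement is an assumption, not quoted from the file.)

declare double @fabs(double) readnone
declare double @llvm.fabs.f64(double) readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
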
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 98e7f9e..ac8fa3e 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -13,7 +13,7 @@
 
 ; GCN: v_and_b32
 
-define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
   store float %fabs, float addrspace(1)* %out
@@ -26,7 +26,7 @@
 
 ; GCN: v_and_b32
 
-define void @fabs_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
   store float %fabs, float addrspace(1)* %out
@@ -37,7 +37,7 @@
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
 ; GCN: v_and_b32
-define void @fabs_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@
 
 ; GCN: v_and_b32
 ; GCN: v_and_b32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@
 ; GCN: v_and_b32
 ; GCN: v_and_b32
 ; GCN: v_and_b32
-define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   store <4 x float> %fabs, <4 x float> addrspace(1)* %out
   ret void
@@ -76,7 +76,7 @@
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @fabs(float %in0)
   %fmul = fmul float %fabs, %in1
   store float %fmul, float addrspace(1)* %out
@@ -88,7 +88,7 @@
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @llvm.fabs.f32(float %in0)
   %fmul = fmul float %fabs, %in1
   store float %fmul, float addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index b74bce7..9edf55c 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -28,7 +28,7 @@
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -55,7 +55,7 @@
 ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
 ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
 ; GCN-FASTFMA: buffer_store_dword [[FMA1]]
-define void @fast_sub_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -87,7 +87,7 @@
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -120,7 +120,7 @@
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -145,7 +145,7 @@
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -170,7 +170,7 @@
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -205,7 +205,7 @@
 
 ; GCN: buffer_store_dword [[MUL]]
 ; GCN: buffer_store_dword [[MAD]]
-define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -241,7 +241,7 @@
 ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_subrev_f32_e32
-define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index f7aa844..0861d327 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -11,7 +11,7 @@
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16(
+define amdgpu_kernel void @fadd_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -31,7 +31,7 @@
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16_imm_a(
+define amdgpu_kernel void @fadd_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -49,7 +49,7 @@
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16_imm_b(
+define amdgpu_kernel void @fadd_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -83,7 +83,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16(
+define amdgpu_kernel void @fadd_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -111,7 +111,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16_imm_a(
+define amdgpu_kernel void @fadd_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -136,7 +136,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16_imm_b(
+define amdgpu_kernel void @fadd_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.ll b/llvm/test/CodeGen/AMDGPU/fadd.ll
index 989eb03..621a0de 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}fadd_f32:
 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
 ; SI: v_add_f32
-define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 {
    %add = fadd float %a, %b
    store float %add, float addrspace(1)* %out, align 4
    ret void
@@ -16,7 +16,7 @@
 ; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
   %add = fadd <2 x float> %a, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -31,7 +31,7 @@
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
   %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
@@ -57,7 +57,7 @@
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 {
   %add = fadd <8 x float> %a, %b
   store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -65,7 +65,7 @@
 
 ; FUNC-LABEL: {{^}}fadd_0_nsz_attr_f32:
 ; SI-NOT: v_add_f32
-define void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 {
+define amdgpu_kernel void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 {
    %add = fadd float %a, 0.0
    store float %add, float addrspace(1)* %out, align 4
    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fadd64.ll b/llvm/test/CodeGen/AMDGPU/fadd64.ll
index 6f0c9de..7eb7747 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd64.ll
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: {{^}}v_fadd_f64:
 ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
-define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                         double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -14,7 +14,7 @@
 
 ; CHECK-LABEL: {{^}}s_fadd_f64:
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
+define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
   %r2 = fadd double %r0, %r1
   store double %r2, double addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@
 ; CHECK: v_add_f64
 ; CHECK: v_add_f64
 ; CHECK: _store_dwordx4
-define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                           <2 x double> addrspace(1)* %in2) {
   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
   %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -37,7 +37,7 @@
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 ; CHECK: _store_dwordx4
-define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
+define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
   %r2 = fadd <2 x double> %r0, %r1
   store <2 x double> %r2, <2 x double> addrspace(1)* %out
   ret void
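
(Aside: for context on how these CHECK assertions are driven, here is a minimal test in the shape used throughout this directory. The RUN invocation and check prefix are typical of these files but assumed rather than copied from any one of them.)

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}minimal_store:
; GCN: s_endpgm
define amdgpu_kernel void @minimal_store(float addrspace(1)* %out, float %a) {
  store float %a, float addrspace(1)* %out
  ret void
}
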
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index e1edc26..a82b310 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -9,7 +9,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
 ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* %out
@@ -19,7 +19,7 @@
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
   %val = bitcast i16 %val.arg to half
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
   %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
@@ -40,7 +40,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
   %val.fabs.fneg = fsub half -0.0, %val.fabs
@@ -52,7 +52,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %val.fneg = fsub half -0.0, %val
   %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
@@ -63,7 +63,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -72,7 +72,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -90,7 +90,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -108,7 +108,7 @@
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -117,7 +117,7 @@
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -126,7 +126,7 @@
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -135,7 +135,7 @@
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -144,7 +144,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -153,7 +153,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -162,7 +162,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -171,7 +171,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -180,7 +180,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -189,7 +189,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -198,7 +198,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -211,7 +211,7 @@
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
 ; GFX9: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -229,7 +229,7 @@
 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
 ; GCN: buffer_store_dword
-define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
@@ -246,7 +246,7 @@
 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
 ; GCN: buffer_store_dword
-define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
@@ -265,7 +265,7 @@
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
 ; GFX9: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
@@ -280,7 +280,7 @@
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
 ; GFX9: buffer_store_dword [[REG]]
-define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
   %val = bitcast i32 %val.arg to <2 x half>
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -290,7 +290,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -299,7 +299,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -308,7 +308,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -317,7 +317,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -326,7 +326,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -335,7 +335,7 @@
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -344,7 +344,7 @@
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -353,7 +353,7 @@
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -362,7 +362,7 @@
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -371,7 +371,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -380,7 +380,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -389,7 +389,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -398,7 +398,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -407,7 +407,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -416,7 +416,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -425,7 +425,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
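
(Aside: the #1 and #3 attribute groups that distinguish the default-denormals and denormals-enabled variants in fcanonicalize.f16.ll are defined at the end of the file, outside these hunks. A hedged sketch of the pattern — the exact feature strings are assumptions, not quoted from the file:)

attributes #1 = { nounwind "target-features"="-fp64-fp16-denormals" }
attributes #3 = { nounwind "target-features"="+fp64-fp16-denormals" }
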
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 075a648..8c385f4 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -8,7 +8,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
 ; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
@@ -18,7 +18,7 @@
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -27,7 +27,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
   %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
@@ -38,7 +38,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
   %val.fabs.fneg = fsub float -0.0, %val.fabs
@@ -50,7 +50,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fneg = fsub float -0.0, %val
   %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
@@ -61,7 +61,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -70,7 +70,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -79,7 +79,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -88,7 +88,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -97,7 +97,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -106,7 +106,7 @@
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -115,7 +115,7 @@
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -124,7 +124,7 @@
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -133,7 +133,7 @@
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -142,7 +142,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -151,7 +151,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -160,7 +160,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -169,7 +169,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -178,7 +178,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -187,7 +187,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -196,7 +196,7 @@
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -205,7 +205,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
@@ -215,7 +215,7 @@
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -224,7 +224,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}|
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
   %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs)
@@ -235,7 +235,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}|
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
   %val.fabs.fneg = fsub double -0.0, %val.fabs
@@ -247,7 +247,7 @@
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fneg = fsub double -0.0, %val
   %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg)
@@ -259,7 +259,7 @@
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -269,7 +269,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -279,7 +279,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -289,7 +289,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -299,7 +299,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -309,7 +309,7 @@
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -319,7 +319,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -329,7 +329,7 @@
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -339,7 +339,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -349,7 +349,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -359,7 +359,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -369,7 +369,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -379,7 +379,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -389,7 +389,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -399,7 +399,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -409,7 +409,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
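; A note on the canonicalize checks above: a variable input lowers to a
; multiply by 1.0 (v_mul_f32_e64 / v_mul_f64), constant inputs fold to a plain
; move of the canonicalized bits, and every NaN input (quiet or signaling, any
; payload or sign) folds to the canonical quiet NaN: 0x7fc00000 for f32, high
; dword 0x7ff80000 for f64. Judging by the paired denormal tests, whether a
; denormal constant folds to zero or keeps its bits depends on the attribute
; group on the kernel (#1/#2 appear to request flushing, #3 enables denormals),
; which is why each denormal bit pattern is checked under both.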
diff --git a/llvm/test/CodeGen/AMDGPU/fceil.ll b/llvm/test/CodeGen/AMDGPU/fceil.ll
index efdda78..0b913fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil.ll
@@ -13,7 +13,7 @@
 ; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) {
   %y = call float @llvm.ceil.f32(float %x) nounwind readnone
   store float %y, float addrspace(1)* %out
   ret void
@@ -25,7 +25,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone
   store <2 x float> %y, <2 x float> addrspace(1)* %out
   ret void
@@ -41,7 +41,7 @@
 ; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
   %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone
   store <3 x float> %y, <3 x float> addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
   store <4 x float> %y, <4 x float> addrspace(1)* %out
   ret void
@@ -82,7 +82,7 @@
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone
   store <8 x float> %y, <8 x float> addrspace(1)* %out
   ret void
@@ -125,7 +125,7 @@
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
-define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone
   store <16 x float> %y, <16 x float> addrspace(1)* %out
   ret void
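; The f32 ceil tests scalarize per element: one v_ceil_f32_e32 per lane on SI
; and one CEIL per lane on R600/EG, so the wide-vector cases just count
; repetitions of the same instruction against the grouped stores.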
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index 98448db..61572a8 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -31,7 +31,7 @@
 ; SI: v_cndmask_b32
 ; SI: v_add_f64
 ; SI: s_endpgm
-define void @fceil_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@
 ; FUNC-LABEL: {{^}}fceil_v2f64:
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
-; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
@@ -61,7 +61,7 @@
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -76,7 +76,7 @@
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void
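; fceil64 notes: a native v_ceil_f64_e32 is only checked for CI, while the SI
; checks accept the legalized expansion (v_cndmask_b32 selects feeding a final
; v_add_f64). The v3f64 body is left commented out with FIXME-CI checks,
; apparently pending <3 x double> legalization worth testing.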
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
index 530274f..7f8be80 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
@@ -4,7 +4,7 @@
 ; registers and literal.x depending on what the optimizer does.
 ;CHECK: CNDE  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
index c402805..2a848e8 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
@@ -6,7 +6,7 @@
 
 ; CHECK: SET{{[A-Z]+}}_DX10
 
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00
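; Two related R600 patterns: fcmp-cnd.ll selects between integers on a float
; compare and expects a CNDE conditional move, while fcmp-cnde-int-args.ll
; expects the compare plus select to fold into a single DX10-style set
; instruction (SET*_DX10 writes an all-ones/zero i32 mask directly).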
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 8a01ea2..7916226 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -11,7 +11,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lt(
+define amdgpu_kernel void @fcmp_f16_lt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -37,7 +37,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lt_abs(
+define amdgpu_kernel void @fcmp_f16_lt_abs(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -62,7 +62,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_eq(
+define amdgpu_kernel void @fcmp_f16_eq(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -85,7 +85,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_le(
+define amdgpu_kernel void @fcmp_f16_le(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -108,7 +108,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_gt(
+define amdgpu_kernel void @fcmp_f16_gt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -131,7 +131,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lg(
+define amdgpu_kernel void @fcmp_f16_lg(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -154,7 +154,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_ge(
+define amdgpu_kernel void @fcmp_f16_ge(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -177,7 +177,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_o(
+define amdgpu_kernel void @fcmp_f16_o(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -200,7 +200,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_u(
+define amdgpu_kernel void @fcmp_f16_u(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -223,7 +223,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nge(
+define amdgpu_kernel void @fcmp_f16_nge(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -246,7 +246,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nlg(
+define amdgpu_kernel void @fcmp_f16_nlg(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -269,7 +269,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_ngt(
+define amdgpu_kernel void @fcmp_f16_ngt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -292,7 +292,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nle(
+define amdgpu_kernel void @fcmp_f16_nle(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -315,7 +315,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_neq(
+define amdgpu_kernel void @fcmp_f16_neq(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -338,7 +338,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nlt(
+define amdgpu_kernel void @fcmp_f16_nlt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -368,7 +368,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_lt(
+define amdgpu_kernel void @fcmp_v2f16_lt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -398,7 +398,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_eq(
+define amdgpu_kernel void @fcmp_v2f16_eq(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -428,7 +428,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_le(
+define amdgpu_kernel void @fcmp_v2f16_le(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -458,7 +458,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_gt(
+define amdgpu_kernel void @fcmp_v2f16_gt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -488,7 +488,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_lg(
+define amdgpu_kernel void @fcmp_v2f16_lg(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -518,7 +518,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_ge(
+define amdgpu_kernel void @fcmp_v2f16_ge(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -548,7 +548,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_o(
+define amdgpu_kernel void @fcmp_v2f16_o(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -578,7 +578,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_u(
+define amdgpu_kernel void @fcmp_v2f16_u(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -608,7 +608,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nge(
+define amdgpu_kernel void @fcmp_v2f16_nge(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -638,7 +638,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nlg(
+define amdgpu_kernel void @fcmp_v2f16_nlg(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -668,7 +668,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_ngt(
+define amdgpu_kernel void @fcmp_v2f16_ngt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -698,7 +698,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nle(
+define amdgpu_kernel void @fcmp_v2f16_nle(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -728,7 +728,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_neq(
+define amdgpu_kernel void @fcmp_v2f16_neq(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -758,7 +758,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nlt(
+define amdgpu_kernel void @fcmp_v2f16_nlt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
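; Every fcmp.f16 test above shares one body shape: load two halfs, compare with
; a single predicate, sign-extend the i1, and store the i32. A sketch of the
; elided body, inferred from the checks rather than quoted from the file:
;   %a.val = load half, half addrspace(1)* %a
;   %b.val = load half, half addrspace(1)* %b
;   %r.val = fcmp olt half %a.val, %b.val
;   %r.val.sext = sext i1 %r.val to i32
;   store i32 %r.val.sext, i32 addrspace(1)* %r
; On VI the compare selects to a native v_cmp_*_f16 writing a condition mask,
; and v_cndmask_b32_e64 materializes the 0/-1 result; targets without f16
; compares first promote the operands to f32.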
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.ll b/llvm/test/CodeGen/AMDGPU/fcmp.ll
index 97d954f..b548670 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.ll
@@ -3,7 +3,7 @@
 ; CHECK: {{^}}fcmp_sext:
 ; CHECK: SETE_DX10  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1
@@ -22,7 +22,7 @@
 ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
 ; CHECK-NEXT: {{[0-9]+\(5.0}}
 
-define void @fcmp_br(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   br i1 %0, label %IF, label %ENDIF
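; The fcmp_br CHECK-NEXT pattern {{[0-9]+\(5.0}} matches how the R600 printer
; emits an inline literal: the raw bit pattern followed by the float value in
; parentheses, e.g. 1084227584(5.0).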
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp64.ll b/llvm/test/CodeGen/AMDGPU/fcmp64.ll
index acce82f..b9e1921 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp64.ll
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: {{^}}flt_f64:
 ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@
 
 ; CHECK-LABEL: {{^}}fle_f64:
 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -27,7 +27,7 @@
 
 ; CHECK-LABEL: {{^}}fgt_f64:
 ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -39,7 +39,7 @@
 
 ; CHECK-LABEL: {{^}}fge_f64:
 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -51,7 +51,7 @@
 
 ; CHECK-LABEL: {{^}}fne_f64:
 ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -63,7 +63,7 @@
 
 ; CHECK-LABEL: {{^}}feq_f64:
 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
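; All of the f64 compare checks use the inverted unordered predicates: flt
; matches v_cmp_nge, fle matches v_cmp_ngt, fgt matches v_cmp_nle, fge matches
; v_cmp_nlt, fne matches v_cmp_neq, and feq matches v_cmp_nlg. Presumably the
; consumer of vcc is inverted to compensate, so codegen can pick whichever
; polarity of the comparison selects more cheaply.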
diff --git a/llvm/test/CodeGen/AMDGPU/fconst64.ll b/llvm/test/CodeGen/AMDGPU/fconst64.ll
index 89af375..1255977 100644
--- a/llvm/test/CodeGen/AMDGPU/fconst64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fconst64.ll
@@ -5,7 +5,7 @@
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
 
-define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r1 = load double, double addrspace(1)* %in
    %r2 = fadd double %r1, 5.000000e+00
    store double %r2, double addrspace(1)* %out
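; fconst64: 5.0 is not a hardware inline constant for f64, so it is
; materialized in an SGPR pair for the fadd, 0x40140000 in the high dword and
; 0 in the low.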
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 1e7490a..79f8392 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -23,7 +23,7 @@
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_f16(
+define amdgpu_kernel void @test_copysign_f16(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -43,7 +43,7 @@
 ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f32_mag_f16_sign_f32(
+define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
   float addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   float addrspace(1)* %arg_sign) {
@@ -65,7 +65,7 @@
 ; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_out_f64_mag_f16_sign_f64(
+define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
   double addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   double addrspace(1)* %arg_sign) {
@@ -88,7 +88,7 @@
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f32_mag_f32_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
   float addrspace(1)* %arg_out,
   float addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -111,7 +111,7 @@
 ; VI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_out_f64_mag_f64_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
   double addrspace(1)* %arg_out,
   double addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -136,7 +136,7 @@
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f16_sign_f32(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   float addrspace(1)* %arg_sign) {
@@ -161,7 +161,7 @@
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f16_sign_f64(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   double addrspace(1)* %arg_sign) {
@@ -188,7 +188,7 @@
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f32_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
   half addrspace(1)* %arg_out,
   float addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -204,7 +204,7 @@
 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f64_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16(
   half addrspace(1)* %arg_out,
   double addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -221,7 +221,7 @@
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v2f16(
+define amdgpu_kernel void @test_copysign_v2f16(
   <2 x half> addrspace(1)* %arg_out,
   <2 x half> %arg_mag,
   <2 x half> %arg_sign) {
@@ -236,7 +236,7 @@
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v3f16(
+define amdgpu_kernel void @test_copysign_v3f16(
   <3 x half> addrspace(1)* %arg_out,
   <3 x half> %arg_mag,
   <3 x half> %arg_sign) {
@@ -252,7 +252,7 @@
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v4f16(
+define amdgpu_kernel void @test_copysign_v4f16(
   <4 x half> addrspace(1)* %arg_out,
   <4 x half> %arg_mag,
   <4 x half> %arg_sign) {
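; The mixed-width copysign tests funnel into the same v_bfi_b32: the magnitude
; is converted to the result type first (the MAG_EXT/MAG_TRUNC captures), and
; when the sign operand is a different width its sign bit is shifted into the
; destination's sign position (the SIGN_SHIFT captures) before the bitfield
; insert merges the two.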
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 632de18..e5893e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -20,7 +20,7 @@
 ; GCN: s_endpgm
 
 ; EG: BFI_INT
-define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
   %result = call float @llvm.copysign.f32(float %mag, float %sign)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -31,7 +31,7 @@
 
 ; EG: BFI_INT
 ; EG: BFI_INT
-define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
   store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -44,7 +44,7 @@
 ; EG: BFI_INT
 ; EG: BFI_INT
 ; EG: BFI_INT
-define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
   %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
   store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 12c942b..67779a8 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -17,7 +17,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -32,7 +32,7 @@
 ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
   %c = fpext float %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %c)
   store double %result, double addrspace(1)* %out, align 8
@@ -41,7 +41,7 @@
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f64:
 ; GCN: s_endpgm
-define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
   ret void
@@ -49,7 +49,7 @@
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f64:
 ; GCN: s_endpgm
-define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
   ret void
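; For f32 and f64 the lowering is a plain bitfield insert: v_bfi_b32 with a
; constant mask picks the magnitude bits from one source and the sign bit from
; the other (BFI_INT on R600/EG). For f64 only the high dword carries the sign,
; so a single v_bfi_b32 rewrites the high half and the magnitude's low dword is
; stored through untouched, as the dwordx2 store of [[VMAG_LO]]:[[VRESULT_HI]]
; shows.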
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 4dc9904..7f84e97 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -31,7 +31,7 @@
 ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
 ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16(
+define amdgpu_kernel void @v_fdiv_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) #0 {
@@ -54,7 +54,7 @@
 ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -72,7 +72,7 @@
 ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -91,7 +91,7 @@
 ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -109,7 +109,7 @@
 ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -127,7 +127,7 @@
 ; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -147,7 +147,7 @@
 ; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -168,7 +168,7 @@
 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -190,7 +190,7 @@
 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
+define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -209,7 +209,7 @@
 
 ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
 ; VI: buffer_store_short [[MUL]]
-define void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
   %rcp = fdiv arcp half %x, 2.0
   store half %rcp, half addrspace(1)* %out, align 4
@@ -221,7 +221,7 @@
 
 ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
 ; VI: buffer_store_short [[MUL]]
-define void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
   %rcp = fdiv arcp half %x, 10.0
   store half %rcp, half addrspace(1)* %out, align 4
@@ -233,7 +233,7 @@
 
 ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
 ; VI: buffer_store_short [[MUL]]
-define void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
   %rcp = fdiv arcp half %x, -10.0
   store half %rcp, half addrspace(1)* %out, align 4
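; fdiv.f16 summary: a plain f16 divide widens to f32 (convert, rcp, multiply,
; convert back) and is repaired with v_div_fixup_f16; with arcp or unsafe math
; it collapses to v_rcp_f16 plus a multiply, and rcp of a sqrt fuses into
; v_rsq_f16. Dividing by a constant under arcp folds the reciprocal into the
; constant itself: x/2.0 multiplies by the inline 0.5, and x/10.0 by the half
; literal 0x2e66 (about 0.1), with 0xae66 for the negated constant.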
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
index 04e1d5a..d16bdf4 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -29,7 +29,7 @@
 ; GCN: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
 ; GCN: s_endpgm
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
   %num = load volatile double, double addrspace(1)* %in
   %den = load volatile double, double addrspace(1)* %gep.1
@@ -39,7 +39,7 @@
 }
 
 ; GCN-LABEL: {{^}}fdiv_f64_s_v:
-define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
+define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
   %den = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
@@ -47,7 +47,7 @@
 }
 
 ; GCN-LABEL: {{^}}fdiv_f64_v_s:
-define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 {
+define amdgpu_kernel void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 {
   %num = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
@@ -55,14 +55,14 @@
 }
 
 ; GCN-LABEL: {{^}}fdiv_f64_s_s:
-define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 {
+define amdgpu_kernel void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 {
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_fdiv_v2f64:
-define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
   %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1
   %num = load <2 x double>, <2 x double> addrspace(1)* %in
   %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1
@@ -72,14 +72,14 @@
 }
 
 ; GCN-LABEL: {{^}}s_fdiv_v2f64:
-define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
+define amdgpu_kernel void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
   %result = fdiv <2 x double> %num, %den
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_fdiv_v4f64:
-define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
   %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
   %num = load <4 x double>, <4 x double> addrspace(1)* %in
   %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1
@@ -89,7 +89,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_fdiv_v4f64:
-define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 {
+define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 {
   %result = fdiv <4 x double> %num, %den
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
@@ -98,7 +98,7 @@
 ; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
   %x = load double, double addrspace(1)* undef
   %rcp = fdiv fast double %x, 2.0
   store double %rcp, double addrspace(1)* %out, align 4
@@ -110,7 +110,7 @@
 ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999
 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
   %x = load double, double addrspace(1)* undef
   %rcp = fdiv fast double %x, 10.0
   store double %rcp, double addrspace(1)* %out, align 4
@@ -122,7 +122,7 @@
 ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999
 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 {
   %x = load double, double addrspace(1)* undef
   %rcp = fdiv fast double %x, -10.0
   store double %rcp, double addrspace(1)* %out, align 4
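; The same folding applies to f64 under fast math: x/2.0 becomes a v_mul_f64 by
; the inline 0.5, and x/10.0 a multiply by 1/10 built in an SGPR pair (high
; dword 0x3fb99999, or 0xbfb99999 negated). Non-constant f64 divides still go
; through the full div_scale / div_fmas / div_fixup expansion checked at the
; top of the file.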
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 0e95de9..b3a2b66 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -27,7 +27,7 @@
 ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
 ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -52,7 +52,7 @@
 ; GCN-NOT: s_setreg
 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
 ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -65,7 +65,7 @@
 ; GCN: v_rcp_f32
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
-define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
   store float %fdiv, float addrspace(1)* %out
@@ -77,7 +77,7 @@
 ; GCN: v_fma_f32
 ; GCN: v_div_fmas_f32
 ; GCN: v_div_fixup_f32
-define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
   store float %fdiv, float addrspace(1)* %out
@@ -89,7 +89,7 @@
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv fast float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -104,7 +104,7 @@
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv fast float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -119,7 +119,7 @@
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv arcp float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -136,7 +136,7 @@
 ; GCN: v_div_scale_f32
 ; GCN: v_div_scale_f32
 ; GCN: v_div_scale_f32
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -146,7 +146,7 @@
 ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
 ; GCN: v_cmp_gt_f32
 ; GCN: v_cmp_gt_f32
-define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -161,7 +161,7 @@
 
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv fast <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -176,7 +176,7 @@
 
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -197,7 +197,7 @@
 ; GCN: v_div_fixup_f32
 ; GCN: v_div_fixup_f32
 ; GCN: v_div_fixup_f32
-define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -220,7 +220,7 @@
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -243,7 +243,7 @@
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll
index 83ffbdf..407cccb 100644
--- a/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -19,7 +19,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64
 ; SI: s_endpgm
-define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.floor.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
 ; SI: s_endpgm
-define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
   %neg = fsub double 0.0, %x
   %y = call double @llvm.floor.f64(double %neg) nounwind readnone
   store double %y, double addrspace(1)* %out
@@ -50,7 +50,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
 ; SI: s_endpgm
-define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
   %abs = call double @llvm.fabs.f64(double %x)
   %neg = fsub double 0.0, %abs
   %y = call double @llvm.floor.f64(double %neg) nounwind readnone
@@ -61,7 +61,7 @@
 ; FUNC-LABEL: {{^}}ffloor_v2f64:
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -72,7 +72,7 @@
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI-NOT: v_floor_f64_e32
-define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+define amdgpu_kernel void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
   store <3 x double> %y, <3 x double> addrspace(1)* %out
   ret void
@@ -83,7 +83,7 @@
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -98,7 +98,7 @@
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -121,7 +121,7 @@
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.ll b/llvm/test/CodeGen/AMDGPU/ffloor.ll
index d7f35a4..720fe7a 100644
--- a/llvm/test/CodeGen/AMDGPU/ffloor.ll
+++ b/llvm/test/CodeGen/AMDGPU/ffloor.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}floor_f32:
 ; SI: v_floor_f32_e32
 ; R600: FLOOR
-define void @floor_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @floor_f32(float addrspace(1)* %out, float %in) {
   %tmp = call float @llvm.floor.f32(float %in) #0
   store float %tmp, float addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@
 ; SI: v_floor_f32_e32
 ; SI: v_floor_f32_e32
 
-define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0
   store <2 x float> %tmp, <2 x float> addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@
 ; R600: FLOOR
 ; R600: FLOOR
 ; R600: FLOOR
-define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0
   store <4 x float> %tmp, <4 x float> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 81a3ebc..c867e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -17,7 +17,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
 ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
 ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
-define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
   store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
   ret void
@@ -25,7 +25,7 @@
 
 ; CHECK-LABEL: {{^}}store_flat_i64:
 ; CHECK: flat_store_dwordx2
-define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
   store volatile i64 %x, i64 addrspace(4)* %fptr, align 8
   ret void
@@ -33,7 +33,7 @@
 
 ; CHECK-LABEL: {{^}}store_flat_v4i32:
 ; CHECK: flat_store_dwordx4
-define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
   store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
   ret void
@@ -41,7 +41,7 @@
 
 ; CHECK-LABEL: {{^}}store_flat_trunc_i16:
 ; CHECK: flat_store_short
-define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
   %y = trunc i32 %x to i16
   store volatile i16 %y, i16 addrspace(4)* %fptr, align 2
@@ -50,7 +50,7 @@
 
 ; CHECK-LABEL: {{^}}store_flat_trunc_i8:
 ; CHECK: flat_store_byte
-define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
   %y = trunc i32 %x to i8
   store volatile i8 %y, i8 addrspace(4)* %fptr, align 2
@@ -61,7 +61,7 @@
 
 ; CHECK-LABEL: load_flat_i32:
 ; CHECK: flat_load_dword
-define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
   %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4
   store i32 %fload, i32 addrspace(1)* %out, align 4
@@ -70,7 +70,7 @@
 
 ; CHECK-LABEL: load_flat_i64:
 ; CHECK: flat_load_dwordx2
-define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
   %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8
   store i64 %fload, i64 addrspace(1)* %out, align 8
@@ -79,7 +79,7 @@
 
 ; CHECK-LABEL: load_flat_v4i32:
 ; CHECK: flat_load_dwordx4
-define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
   %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
   store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
@@ -88,7 +88,7 @@
 
 ; CHECK-LABEL: sextload_flat_i8:
 ; CHECK: flat_load_sbyte
-define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
   %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
   %ext = sext i8 %fload to i32
@@ -98,7 +98,7 @@
 
 ; CHECK-LABEL: zextload_flat_i8:
 ; CHECK: flat_load_ubyte
-define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
   %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
   %ext = zext i8 %fload to i32
@@ -108,7 +108,7 @@
 
 ; CHECK-LABEL: sextload_flat_i16:
 ; CHECK: flat_load_sshort
-define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
   %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
   %ext = sext i16 %fload to i32
@@ -118,7 +118,7 @@
 
 ; CHECK-LABEL: zextload_flat_i16:
 ; CHECK: flat_load_ushort
-define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
   %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
   %ext = zext i16 %fload to i32
@@ -131,7 +131,7 @@
 ; CHECK: flat_load_ubyte
 ; CHECK: flat_load_ubyte
 ; CHECK: flat_load_ubyte
-define void @flat_scratch_unaligned_load() {
+define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32
   %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
@@ -143,7 +143,7 @@
 ; CHECK: flat_store_byte
 ; CHECK: flat_store_byte
 ; CHECK: flat_store_byte
-define void @flat_scratch_unaligned_store() {
+define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32
   %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %fptr, align 1
@@ -154,7 +154,7 @@
 ; HSA: flat_load_dword
 ; HSA: flat_load_dword
 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
-define void @flat_scratch_multidword_load() {
+define amdgpu_kernel void @flat_scratch_multidword_load() {
   %scratch = alloca <2 x i32>
   %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
   %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr
@@ -165,7 +165,7 @@
 ; HSA: flat_store_dword
 ; HSA: flat_store_dword
 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
-define void @flat_scratch_multidword_store() {
+define amdgpu_kernel void @flat_scratch_multidword_store() {
   %scratch = alloca <2 x i32>
   %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
   store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index df9ba00..dac1500 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -23,7 +23,7 @@
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   store i32 0, i32 addrspace(1)* %out
   ret void
@@ -36,7 +36,7 @@
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define void @test_addr64(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) {
 entry:
   %out.addr = alloca i32 addrspace(1)*, align 4
 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index b71c8bc..23f40da 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -19,7 +19,7 @@
 ; CI: ; NumSgprs: 8
 ; VI-NOXNACK: ; NumSgprs: 8
 ; VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_no_flat() {
+define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7}"()
   ret void
@@ -33,7 +33,7 @@
 ; CI: ; NumSgprs: 10
 ; VI-NOXNACK: ; NumSgprs: 10
 ; VI-XNACK: ; NumSgprs: 12
-define void @vcc_no_flat() {
+define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC}"()
   ret void
@@ -50,7 +50,7 @@
 ; HSA-CI: ; NumSgprs: 8
 ; HSA-VI-NOXNACK: ; NumSgprs: 8
 ; HSA-VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_flat() {
+define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
   ret void
@@ -66,7 +66,7 @@
 ; HSA-CI: ; NumSgprs: 10
 ; HSA-VI-NOXNACK: ; NumSgprs: 10
 ; HSA-VI-XNACK: ; NumSgprs: 12
-define void @vcc_flat() {
+define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 7c91471..cc95d80 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i32_offset:
 ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -13,7 +13,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_offset:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -23,7 +23,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -34,7 +34,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -45,7 +45,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i32:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -54,7 +54,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i32_ret:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -63,7 +63,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i32_addr64:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -73,7 +73,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -83,7 +83,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i32_offset:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -93,7 +93,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_offset:
 ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -103,7 +103,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -114,7 +114,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
 ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -125,7 +125,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i32:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -134,7 +134,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i32_ret:
 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -143,7 +143,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i32_addr64:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -153,7 +153,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64:
 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -163,7 +163,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_offset:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -173,7 +173,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -183,7 +183,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -194,7 +194,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -205,7 +205,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i32:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -214,7 +214,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -223,7 +223,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -233,7 +233,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -243,7 +243,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i32_offset:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -253,7 +253,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_offset:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -263,7 +263,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -274,7 +274,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -285,7 +285,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i32:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -294,7 +294,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i32_ret:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -303,7 +303,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i32_addr64:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -313,7 +313,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -323,7 +323,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_offset:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -333,7 +333,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -343,7 +343,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -354,7 +354,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -365,7 +365,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i32:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -374,7 +374,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -383,7 +383,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -393,7 +393,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -403,7 +403,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i32_offset:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -413,7 +413,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_offset:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -423,7 +423,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -434,7 +434,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -445,7 +445,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i32:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -454,7 +454,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i32_ret:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -463,7 +463,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i32_addr64:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -473,7 +473,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -483,7 +483,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_offset:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -493,7 +493,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -503,7 +503,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -514,7 +514,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -525,7 +525,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i32:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -534,7 +534,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -543,7 +543,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_addr64:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -553,7 +553,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}}
-  define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -563,7 +563,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i32_offset:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -573,7 +573,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_offset:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -583,7 +583,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -594,7 +594,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -605,7 +605,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i32:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -614,7 +614,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i32_ret:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -623,7 +623,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i32_addr64:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -633,7 +633,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -643,7 +643,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -653,7 +653,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -663,7 +663,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -674,7 +674,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -685,7 +685,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32:
 ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -694,7 +694,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -703,7 +703,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -713,7 +713,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]],  v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -725,7 +725,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -735,7 +735,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -746,7 +746,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -757,7 +757,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -769,7 +769,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
@@ -778,7 +778,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
@@ -788,7 +788,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -798,7 +798,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -809,7 +809,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -819,7 +819,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -829,7 +829,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -840,7 +840,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -851,7 +851,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i32:
 ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -860,7 +860,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -869,7 +869,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_addr64:
 ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -879,7 +879,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -890,7 +890,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i32_offset:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4
   %val = load atomic i32, i32 addrspace(4)* %gep  seq_cst, align 4
@@ -901,7 +901,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i32:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4
   store i32 %val, i32 addrspace(4)* %out
@@ -911,7 +911,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -923,7 +923,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i32_addr64:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
   %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4
@@ -933,7 +933,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i32_offset:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   store atomic i32 %in, i32 addrspace(4)* %gep  seq_cst, align 4
@@ -942,7 +942,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i32:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4
   ret void
@@ -950,7 +950,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -960,7 +960,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i32_addr64:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 0bd6c2d..723dde9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -23,7 +23,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -34,7 +34,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -45,7 +45,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -54,7 +54,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_ret:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -63,7 +63,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -73,7 +73,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -83,7 +83,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i64_offset:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -93,7 +93,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -103,7 +103,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -114,7 +114,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -125,7 +125,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i64:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -134,7 +134,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_ret:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -143,7 +143,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -153,7 +153,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -163,7 +163,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_offset:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -173,7 +173,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -183,7 +183,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -194,7 +194,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -205,7 +205,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i64:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -214,7 +214,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -223,7 +223,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -233,7 +233,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -243,7 +243,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i64_offset:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -253,7 +253,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -263,7 +263,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -274,7 +274,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -285,7 +285,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i64:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -294,7 +294,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_ret:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -303,7 +303,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -313,7 +313,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -323,7 +323,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_offset:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -333,7 +333,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -343,7 +343,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -354,7 +354,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -365,7 +365,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i64:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -374,7 +374,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -383,7 +383,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -393,7 +393,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -403,7 +403,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i64_offset:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -413,7 +413,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -423,7 +423,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -434,7 +434,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -445,7 +445,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i64:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -454,7 +454,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_ret:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -463,7 +463,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -473,7 +473,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -483,7 +483,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_offset:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -493,7 +493,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -503,7 +503,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -514,7 +514,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -525,7 +525,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i64:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -534,7 +534,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -543,7 +543,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -553,7 +553,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -563,7 +563,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i64_offset:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -573,7 +573,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -583,7 +583,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -594,7 +594,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -605,7 +605,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i64:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -614,7 +614,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_ret:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -623,7 +623,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -633,7 +633,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -643,7 +643,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -653,7 +653,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -663,7 +663,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -674,7 +674,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -685,7 +685,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -694,7 +694,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -703,7 +703,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -713,7 +713,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]],  v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -723,7 +723,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_offset:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -733,7 +733,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -743,7 +743,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -754,7 +754,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -765,7 +765,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i64:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -774,7 +774,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -783,7 +783,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -793,7 +793,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -804,7 +804,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i64_offset:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4
   %val = load atomic i64, i64 addrspace(4)* %gep  seq_cst, align 8
@@ -815,7 +815,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i64:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
 entry:
   %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8
   store i64 %val, i64 addrspace(4)* %out
@@ -825,7 +825,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -837,7 +837,7 @@
 ; GCN-LABEL: {{^}}atomic_load_i64_addr64:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
   %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -847,7 +847,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i64_offset:
 ; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   store atomic i64 %in, i64 addrspace(4)* %gep  seq_cst, align 8
@@ -856,7 +856,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i64:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
-define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
 entry:
   store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8
   ret void
@@ -864,7 +864,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -874,7 +874,7 @@
 
 ; GCN-LABEL: {{^}}atomic_store_i64_addr64:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -883,7 +883,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -892,7 +892,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -902,7 +902,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -913,7 +913,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -924,7 +924,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -936,7 +936,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
   ret void
@@ -945,7 +945,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
@@ -955,7 +955,7 @@
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -965,7 +965,7 @@
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 50c5a5a..4113ba8 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -18,7 +18,7 @@
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -46,7 +46,7 @@
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -75,7 +75,7 @@
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -99,7 +99,7 @@
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -127,7 +127,7 @@
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -184,7 +184,7 @@
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -213,7 +213,7 @@
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -242,7 +242,7 @@
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -276,7 +276,7 @@
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -317,7 +317,7 @@
 ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -358,7 +358,7 @@
 ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -390,7 +390,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load volatile float, float addrspace(1)* %in1
@@ -406,7 +406,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load volatile float, float addrspace(1)* %in1
@@ -422,7 +422,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -438,7 +438,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -454,7 +454,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -470,7 +470,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -486,7 +486,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -502,7 +502,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -518,7 +518,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -534,7 +534,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                       float addrspace(1)* %in1,
                                       float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -550,7 +550,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -566,7 +566,7 @@
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -588,7 +588,7 @@
 ;
 ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
-define void @test_f32_interp(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                              float addrspace(1)* %in1,
                              float addrspace(1)* %in2,
                              float addrspace(1)* %in3) {
@@ -610,7 +610,7 @@
 ;
 ; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
 ; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
-define void @test_f64_interp(double addrspace(1)* %out,
+define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                              double addrspace(1)* %in1,
                              double addrspace(1)* %in2,
                              double addrspace(1)* %in3) {
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll
index cf6d7d8..4d3f371 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll
@@ -8,7 +8,7 @@
 
 ; FUNC-LABEL: {{^}}fma_f64:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -21,7 +21,7 @@
 ; FUNC-LABEL: {{^}}fma_v2f64:
 ; SI: v_fma_f64
 ; SI: v_fma_f64
-define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                        <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
    %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -36,7 +36,7 @@
 ; SI: v_fma_f64
 ; SI: v_fma_f64
 ; SI: v_fma_f64
-define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                        <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
    %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
    %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index d04a594..659cecb 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -12,7 +12,7 @@
 
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG: FMA {{\*? *}}[[RES]]
-define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
   %r0 = load float, float addrspace(1)* %in1
   %r1 = load float, float addrspace(1)* %in2
@@ -29,7 +29,7 @@
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
-define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1
   %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2
@@ -50,7 +50,7 @@
 ; EG-DAG: FMA {{\*? *}}[[RES]].Y
 ; EG-DAG: FMA {{\*? *}}[[RES]].Z
 ; EG-DAG: FMA {{\*? *}}[[RES]].W
-define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1
   %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2
@@ -62,7 +62,7 @@
 
 ; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
 ; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}}
-define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -77,7 +77,7 @@
 }
 
 ; FUNC-LABEL: @fma_commute_mul_s_f32
-define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
index 4d42a46..8b9104b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -11,7 +11,7 @@
 ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
   %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
   %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
   %a = load volatile double, double addrspace(1)* %aptr, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 7c01ca8..a96eb5d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -10,7 +10,7 @@
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -28,7 +28,7 @@
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index da498ca..083346e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -4,7 +4,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; FUNC-LABEL: @test_fmax_legacy_uge_f64
-define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -19,7 +19,7 @@
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_oge_f64
-define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -34,7 +34,7 @@
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_ugt_f64
-define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -49,7 +49,7 @@
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_ogt_f64
-define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
index 4a4c92a..7643c3e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -13,7 +13,7 @@
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 
 ; EG: MAX
-define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -33,7 +33,7 @@
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -53,7 +53,7 @@
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -73,7 +73,7 @@
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -93,7 +93,7 @@
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -114,7 +114,7 @@
 ; SI-NONAN: v_max_f32_e32
 ; SI-NONAN: v_max_f32_e32
 ; SI-NONAN: v_max_f32_e32
-define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -137,7 +137,7 @@
 ; SI-NOT: v_max_
 
 ; EG: MAX
-define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll
index fec3a35..20af278 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll
@@ -9,7 +9,7 @@
 
 ; FUNC-LABEL: @test_fmax_f64
 ; SI: v_max_f64
-define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
   %val = call double @llvm.maxnum.f64(double %a, double %b) #0
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -18,7 +18,7 @@
 ; FUNC-LABEL: @test_fmax_v2f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
   store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -29,7 +29,7 @@
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
   store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -44,7 +44,7 @@
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
   store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -67,7 +67,7 @@
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
   store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
index 4058247..277b8ce 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -14,7 +14,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -44,7 +44,7 @@
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -70,7 +70,7 @@
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
-define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -114,7 +114,7 @@
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
-define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -128,7 +128,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -143,7 +143,7 @@
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 ; EG: 2143289344(nan)
-define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -157,7 +157,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -171,7 +171,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -185,7 +185,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -199,7 +199,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -213,7 +213,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -227,7 +227,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -239,7 +239,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -250,7 +250,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -262,7 +262,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -274,7 +274,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 5a651c7..d2cfc71 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -9,7 +9,7 @@
 ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
-define void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -27,7 +27,7 @@
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -45,7 +45,7 @@
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -63,7 +63,7 @@
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -79,7 +79,7 @@
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -96,7 +96,7 @@
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -113,7 +113,7 @@
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
 ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
@@ -128,7 +128,7 @@
 
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -146,7 +146,7 @@
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -169,7 +169,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -192,7 +192,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -215,7 +215,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
-define void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -238,7 +238,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
-define void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -267,7 +267,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
-define void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -301,7 +301,7 @@
 ; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
 ; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
-define void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -341,7 +341,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -363,7 +363,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -385,7 +385,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -407,7 +407,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -429,7 +429,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -451,7 +451,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -473,7 +473,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -495,7 +495,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -517,7 +517,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -539,7 +539,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -561,7 +561,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -583,7 +583,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -605,7 +605,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -627,7 +627,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -649,7 +649,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -671,7 +671,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -697,7 +697,7 @@
 ; GCN-DAG: v_max_f32
 ; GCN: v_min_f32
 ; GCN: v_max_f32
-define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -716,7 +716,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
-define void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -735,7 +735,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
-define void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -755,7 +755,7 @@
 
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
-define void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -773,7 +773,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
-define void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -796,7 +796,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
-define void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -819,7 +819,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
-define void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -849,7 +849,7 @@
 ; GCN: v_max_f32
 ; GCN: v_min_f32
 ; GCN: v_max_f32
-define void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -874,7 +874,7 @@
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
-define void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -901,7 +901,7 @@
 
 ; GFX9: v_add_f16_e32 v{{[0-9]+}}, 1.0
 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
-define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -938,7 +938,7 @@
 ; VI: v_max_f16
 
 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
-define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 3102ffd..3183f77 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -11,7 +11,7 @@
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -29,7 +29,7 @@
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index 6982ee0..99bc114 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -3,7 +3,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; FUNC-LABEL: @test_fmin_legacy_f64
-define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+define amdgpu_kernel void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
    %r0 = extractelement <4 x double> %reg0, i32 0
    %r1 = extractelement <4 x double> %reg0, i32 1
    %r2 = fcmp uge double %r0, %r1
@@ -14,7 +14,7 @@
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ule_f64
-define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -29,7 +29,7 @@
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ole_f64
-define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -44,7 +44,7 @@
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_olt_f64
-define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -59,7 +59,7 @@
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ult_f64
-define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
index 79acd02..52336f9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -14,7 +14,7 @@
 ; EG: MIN *
 ; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
    %r2 = fcmp uge float %r0, %r1
@@ -34,7 +34,7 @@
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
 
-define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
   store float %val, float addrspace(1)* %out, align 4
@@ -46,7 +46,7 @@
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -65,7 +65,7 @@
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -84,7 +84,7 @@
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -103,7 +103,7 @@
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -122,7 +122,7 @@
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -144,7 +144,7 @@
 
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1
@@ -166,7 +166,7 @@
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -188,7 +188,7 @@
 ; SI-NEXT: v_cndmask_b32
 ; SI-NOT: v_min
 ; SI: s_endpgm
-define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
index 0f929d6..01b2674 100644
--- a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
@@ -9,7 +9,7 @@
 
 ; FUNC-LABEL: @test_fmin_f64
 ; SI: v_min_f64
-define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -18,7 +18,7 @@
 ; FUNC-LABEL: @test_fmin_v2f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
   store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -29,7 +29,7 @@
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
   store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -44,7 +44,7 @@
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
   store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -67,7 +67,7 @@
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
   store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.ll b/llvm/test/CodeGen/AMDGPU/fminnum.ll
index abd2b9d..9e997c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fminnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminnum.ll
@@ -13,7 +13,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -26,7 +26,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -43,7 +43,7 @@
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -69,7 +69,7 @@
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
-define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -113,7 +113,7 @@
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W
-define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -127,7 +127,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -142,7 +142,7 @@
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 ; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
-define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -156,7 +156,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -170,7 +170,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -184,7 +184,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -198,7 +198,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -212,7 +212,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -226,7 +226,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -237,7 +237,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -248,7 +248,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -260,7 +260,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -272,7 +272,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 553ab18..4002712 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -23,7 +23,7 @@
 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 ; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %a11 = fadd fast float %y, -1.0
   %a12 = call float @llvm.fabs.f32(float %a11)
   %a13 = fadd fast float %x, -1.0
@@ -44,7 +44,7 @@
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %mad = fadd fast float %mul2, %y
@@ -59,7 +59,7 @@
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
   %mul2 = fmul fast float %x.abs, 2.0
@@ -72,7 +72,7 @@
 ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
-define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
   %mul2 = fmul fast float %x.abs, 2.0
@@ -87,7 +87,7 @@
 ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %muln2 = fmul fast float %x, -2.0
@@ -101,7 +101,7 @@
 ; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %muln2 = fmul fast float %x, -3.0
@@ -119,7 +119,7 @@
 ; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -146,7 +146,7 @@
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -166,7 +166,7 @@
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -185,7 +185,7 @@
 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
 
-define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -203,7 +203,7 @@
 ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -219,7 +219,7 @@
 ; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 3ccd09b..75977ad 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -11,7 +11,7 @@
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16(
+define amdgpu_kernel void @fmul_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -31,7 +31,7 @@
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16_imm_a(
+define amdgpu_kernel void @fmul_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -50,7 +50,7 @@
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16_imm_b(
+define amdgpu_kernel void @fmul_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -83,7 +83,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16(
+define amdgpu_kernel void @fmul_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -110,7 +110,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16_imm_a(
+define amdgpu_kernel void @fmul_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -135,7 +135,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16_imm_b(
+define amdgpu_kernel void @fmul_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.ll b/llvm/test/CodeGen/AMDGPU/fmul.ll
index 8228fd0..125de7a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.ll
@@ -6,7 +6,7 @@
 ; GCN: v_mul_f32
 
 ; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fmul float %a, %b
   store float %0, float addrspace(1)* %out
@@ -19,7 +19,7 @@
 
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fmul <2 x float> %a, %b
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -36,7 +36,7 @@
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -49,7 +49,7 @@
 ; GCN: v_mul_f32
 ; GCN-NOT: v_mul_f32
 ; GCN: s_endpgm
-define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
   %y = fmul float %x, 2.0
   %z = fmul float %y, 3.0
   store float %z, float addrspace(1)* %out
@@ -61,7 +61,7 @@
 ; GCN-NOT: v_mul_f32
 ; GCN-NOT: v_mad_f32
 ; GCN: s_endpgm
-define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
   %y = fmul float %x, 3.0
   %z = fmul float %y, 2.0
   store float %z, float addrspace(1)* %out
@@ -75,7 +75,7 @@
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 ; GCN-NOT: v_mul_f32
-define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
   %a = fmul float %x, 5.0
   %b = fsub float -0.0, %a
   %c = fmul float %b, %y
diff --git a/llvm/test/CodeGen/AMDGPU/fmul64.ll b/llvm/test/CodeGen/AMDGPU/fmul64.ll
index 3c222ea..f14233f2 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul64.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}fmul_f64:
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@
 ; FUNC-LABEL: {{^}}fmul_v2f64:
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                         <2 x double> addrspace(1)* %in2) {
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
    %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -29,7 +29,7 @@
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                         <4 x double> addrspace(1)* %in2) {
    %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
    %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 477ae3d..9b71341 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -16,7 +16,7 @@
 ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                          half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
   %r0 = load half, half addrspace(1)* %in1
   %r1 = load half, half addrspace(1)* %in2
@@ -34,7 +34,7 @@
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -56,7 +56,7 @@
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -82,7 +82,7 @@
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                             half addrspace(1)* %in1,
                             half addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -111,7 +111,7 @@
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                             half addrspace(1)* %in1,
                             half addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -134,7 +134,7 @@
 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
 ; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -180,7 +180,7 @@
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -202,7 +202,7 @@
 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -231,7 +231,7 @@
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -261,7 +261,7 @@
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -291,7 +291,7 @@
 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -323,7 +323,7 @@
 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -355,7 +355,7 @@
 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -388,7 +388,7 @@
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -419,7 +419,7 @@
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -447,7 +447,7 @@
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
index e4b1053..fb605dd 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -25,7 +25,7 @@
 
 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                          float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
   %r0 = load float, float addrspace(1)* %in1
   %r1 = load float, float addrspace(1)* %in2
@@ -45,7 +45,7 @@
 
 ; GCN-DENORM-STRICT: v_mul_f32_e32
 ; GCN-DENORM-STRICT: v_add_f32_e32
-define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                            float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
   %r0 = load volatile float, float addrspace(1)* %in1
   %r1 = load volatile float, float addrspace(1)* %in2
@@ -71,7 +71,7 @@
 
 ; SI-DENORM buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -100,7 +100,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -132,7 +132,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -167,7 +167,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -196,7 +196,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -225,7 +225,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -256,7 +256,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -286,7 +286,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -318,7 +318,7 @@
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -353,7 +353,7 @@
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -387,7 +387,7 @@
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -422,7 +422,7 @@
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -460,7 +460,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -496,7 +496,7 @@
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -532,7 +532,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -563,7 +563,7 @@
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
index f5e64b3..86e91e0 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -7,7 +7,7 @@
 
 ; GCN-LABEL: {{^}}fmuladd_f64:
 ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                          double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -22,7 +22,7 @@
 
 ; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                            double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -44,7 +44,7 @@
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -72,7 +72,7 @@
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -94,7 +94,7 @@
 ; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
 
 ; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
@@ -117,7 +117,7 @@
 ; GCN-STRICT: v_add_f64
 
 ; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
                                       double addrspace(1)* %in1,
                                       double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -139,7 +139,7 @@
 ; GCN-STRICT: v_add_f64
 
 ; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
                                       double addrspace(1)* %in1,
                                       double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -158,7 +158,7 @@
 
 ; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast:
 ; GCN: v_fma_f64
-define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
                                  double addrspace(1)* %in1,
                                 double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
index b0123b8..bdd3c04 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -17,7 +17,7 @@
 ; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                          <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
   %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
@@ -37,7 +37,7 @@
 
 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
@@ -61,7 +61,7 @@
 
 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
@@ -86,7 +86,7 @@
 
 ; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
                             <2 x half> addrspace(1)* %in1,
                             <2 x half> addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index 5423fad..4ff3bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -13,41 +13,41 @@
 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
 
 
-define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
 entry:
   %0 = call float @llvm.nearbyint.f32(float %in)
   store float %0, float addrspace(1)* %out
   ret void
 }
 
-define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
   store <2 x float> %0, <2 x float> addrspace(1)* %out
   ret void
 }
 
-define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
 entry:
   %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
   store <4 x float> %0, <4 x float> addrspace(1)* %out
   ret void
 }
 
-define void @nearbyint_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @nearbyint_f64(double addrspace(1)* %out, double %in) {
 entry:
   %0 = call double @llvm.nearbyint.f64(double %in)
   store double %0, double addrspace(1)* %out
   ret void
 }
-define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 entry:
   %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
   store <2 x double> %0, <2 x double> addrspace(1)* %out
   ret void
 }
 
-define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 entry:
   %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
   store <4 x double> %0, <4 x double> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index acd4f7e..1c0e9a2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -14,7 +14,7 @@
 
 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -35,7 +35,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -62,7 +62,7 @@
 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[ADD]]
 ; GCN: buffer_store_dword [[NEG_ADD]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -87,7 +87,7 @@
 
 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -111,7 +111,7 @@
 
 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -135,7 +135,7 @@
 
 ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -164,7 +164,7 @@
 ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -192,7 +192,7 @@
 ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -218,7 +218,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -239,7 +239,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -261,7 +261,7 @@
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MUL1]]
-define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -282,7 +282,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -302,7 +302,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -322,7 +322,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -345,7 +345,7 @@
 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -368,7 +368,7 @@
 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -394,7 +394,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -412,7 +412,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -428,7 +428,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -444,7 +444,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -460,7 +460,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -476,7 +476,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -494,7 +494,7 @@
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -516,7 +516,7 @@
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MAX0]]
 ; GCN-NEXT: buffer_store_dword [[MUL1]]
-define void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -541,7 +541,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -559,7 +559,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -575,7 +575,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -591,7 +591,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -607,7 +607,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -623,7 +623,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -641,7 +641,7 @@
 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -663,7 +663,7 @@
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MAX0]]
 ; GCN-NEXT: buffer_store_dword [[MUL1]]
-define void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -693,7 +693,7 @@
 
 ; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -717,7 +717,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -748,7 +748,7 @@
 
 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -776,7 +776,7 @@
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -803,7 +803,7 @@
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -830,7 +830,7 @@
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -858,7 +858,7 @@
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -886,7 +886,7 @@
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -916,7 +916,7 @@
 ; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -946,7 +946,7 @@
 ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -979,7 +979,7 @@
 
 ; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1009,7 +1009,7 @@
 
 ; GCN: buffer_store_dword [[NEG_MAD]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1035,7 +1035,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1051,7 +1051,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1070,7 +1070,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
 ; GCN: buffer_store_dword [[FNEG_A]]
-define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1090,7 +1090,7 @@
 ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
-define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1110,7 +1110,7 @@
 ; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1126,7 +1126,7 @@
 
 ; FIXME: Source modifiers not folded for f16->f32
 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1140,7 +1140,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1162,7 +1162,7 @@
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1178,7 +1178,7 @@
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1197,7 +1197,7 @@
 ; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
-define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1217,7 +1217,7 @@
 ; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dwordx2 [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1236,7 +1236,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1252,7 +1252,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1271,7 +1271,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
 ; GCN: buffer_store_dword [[NEG]]
 ; GCN: buffer_store_dword [[CVT]]
-define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1290,7 +1290,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_short [[RESULT]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1310,7 +1310,7 @@
 ; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
 ; GCN: buffer_store_short [[RESULT]]
 ; GCN: buffer_store_dword [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1333,7 +1333,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1349,7 +1349,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1368,7 +1368,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1388,7 +1388,7 @@
 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1411,7 +1411,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1432,7 +1432,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1453,7 +1453,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1476,7 +1476,7 @@
 ; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1497,7 +1497,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1517,7 +1517,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1537,7 +1537,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1560,7 +1560,7 @@
 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1583,7 +1583,7 @@
 ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1610,7 +1610,7 @@
 ; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
 ; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1626,7 +1626,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1646,7 +1646,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1673,7 +1673,7 @@
 
 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1693,7 +1693,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1713,7 +1713,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1735,7 +1735,7 @@
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
-define void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1758,7 +1758,7 @@
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
-define void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1791,7 +1791,7 @@
 ; GCN: buffer_store_dword [[MUL1]]
 
 ; GCN: buffer_store_dword [[MUL0]]
-define void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1827,7 +1827,7 @@
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
 ; GCN: ; use [[MUL]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1856,7 +1856,7 @@
 ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
 ; GCN: ; use [[NEG]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1888,7 +1888,7 @@
 ; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
 ; GCN-NEXT:	buffer_store_dword [[FMA0]]
 ; GCN-NEXT:	buffer_store_dword [[FMA1]]
-define void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1920,7 +1920,7 @@
 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
 ; GCN-NEXT:	buffer_store_dword [[MUL0]]
 ; GCN-NEXT:	buffer_store_dword [[MUL1]]
-define void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1951,7 +1951,7 @@
 
 ; GCN:	buffer_store_dword [[FMA0]]
 ; GCN-NEXT:	buffer_store_dword [[MUL1]]
-define void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1990,7 +1990,7 @@
 
 ; GCN: buffer_store_dword [[MUL1]]
 ; GCN-NEXT:	buffer_store_dword [[MUL2]]
-define void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -2025,7 +2025,7 @@
 
 ; GCN: buffer_store_dwordx2 [[MUL0]]
 ; GCN: buffer_store_dwordx2 [[MUL1]]
-define void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -2058,7 +2058,7 @@
 ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
 ; GCN: buffer_store_dword [[FMA0]]
-define void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -2088,7 +2088,7 @@
 ; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
 ; GCN: buffer_store_dword [[FMA0]]
 ; GCN: buffer_store_dword [[MUL1]]
-define void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index beeda4c..555764c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -9,7 +9,7 @@
 
 ; GFX89-NOT: _and
 ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
-define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.0, %fabs
   %fadd = fadd half %y, %fsub
@@ -27,7 +27,7 @@
 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
 ; GFX89-NOT: [[MUL]]
 ; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
-define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.0, %fabs
   %fmul = fmul half %y, %fsub
@@ -41,7 +41,7 @@
 
 ; GCN-LABEL: {{^}}fneg_fabs_free_f16:
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
   %bc = bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
   %fsub = fsub half -0.0, %fabs
@@ -51,7 +51,7 @@
 
 ; GCN-LABEL: {{^}}fneg_fabs_f16:
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
+define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   %fsub = fsub half -0.0, %fabs
   store half %fsub, half addrspace(1)* %out, align 2
@@ -60,7 +60,7 @@
 
 ; GCN-LABEL: {{^}}v_fneg_fabs_f16:
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %val = load half, half addrspace(1)* %in, align 2
   %fabs = call half @llvm.fabs.f16(half %val)
   %fsub = fsub half -0.0, %fabs
@@ -76,7 +76,7 @@
 ; CIVI: flat_store_dword
 
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
-define void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out
@@ -95,7 +95,7 @@
 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
 
 ; GCN: flat_store_dwordx2
-define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs
   store <4 x half> %fsub, <4 x half> addrspace(1)* %out
@@ -113,7 +113,7 @@
 
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
-define void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0>
@@ -125,7 +125,7 @@
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
 ; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]]
 ; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]]
-define void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
@@ -136,7 +136,7 @@
 ; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16:
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
-define void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0>
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index d16e83f..85f5440 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
 ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
-define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
   %fadd = fadd double %y, %fsub
@@ -14,7 +14,7 @@
   ret void
 }
 
-define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
   %x = load double, double addrspace(1)* %xptr, align 8
   %y = load double, double addrspace(1)* %xptr, align 8
   %fabs = call double @llvm.fabs.f64(double %x)
@@ -26,7 +26,7 @@
 
 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
-define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
   %fmul = fmul double %y, %fsub
@@ -35,7 +35,7 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_free_f64:
-define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fabs = call double @llvm.fabs.f64(double %bc)
   %fsub = fsub double -0.000000e+00, %fabs
@@ -46,7 +46,7 @@
 ; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64:
 ; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fabs = call double @fabs(double %bc)
   %fsub = fsub double -0.000000e+00, %fabs
@@ -62,7 +62,7 @@
 ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
 ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
-define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
   %fabs = call double @llvm.fabs.f64(double %in)
   %fsub = fsub double -0.000000e+00, %fabs
   store double %fsub, double addrspace(1)* %out, align 8
@@ -74,7 +74,7 @@
 ; GCN-NOT: 0x80000000
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
   %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
   store <2 x double> %fsub, <2 x double> addrspace(1)* %out
@@ -88,7 +88,7 @@
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
   %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
   store <4 x double> %fsub, <4 x double> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 9ee1171..a0cf37b 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
 ; SI-NOT: and
 ; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
   %fsub = fsub float -0.000000e+00, %fabs
   %fadd = fadd float %y, %fsub
@@ -17,7 +17,7 @@
 ; SI-NOT: and
 ; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}}
 ; SI-NOT: and
-define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
   %fsub = fsub float -0.000000e+00, %fabs
   %fmul = fmul float %y, %fsub
@@ -35,7 +35,7 @@
 ; R600: -PV
 
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -49,7 +49,7 @@
 ; R600: -PV
 
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -59,7 +59,7 @@
 
 ; FUNC-LABEL: {{^}}fneg_fabs_f32:
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   %fsub = fsub float -0.000000e+00, %fabs
   store float %fsub, float addrspace(1)* %out, align 4
@@ -68,7 +68,7 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %val = load float, float addrspace(1)* %in, align 4
   %fabs = call float @llvm.fabs.f32(float %val)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -86,7 +86,7 @@
 ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
   store <2 x float> %fsub, <2 x float> addrspace(1)* %out
@@ -99,7 +99,7 @@
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
   store <4 x float> %fsub, <4 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index d0c6d3d..07a23cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -4,7 +4,7 @@
 
 ; FIXME: Should be able to do scalar op
 ; GCN-LABEL: {{^}}s_fneg_f16:
-define void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 {
+define amdgpu_kernel void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 {
   %fneg = fsub half -0.0, %in
   store half %fneg, half addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@
 ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
 ; SI: buffer_store_short [[XOR]]
-define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
@@ -34,7 +34,7 @@
 ; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
 ; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
-define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
+define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
   %bc = bitcast i16 %in to half
   %fsub = fsub half -0.0, %bc
   store half %fsub, half addrspace(1)* %out
@@ -52,7 +52,7 @@
 
 ; VI-NOT: [[NEG_VALUE]]
 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   %fsub = fsub half -0.0, %val
   %fmul = fmul half %fsub, %val
@@ -73,7 +73,7 @@
 ; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
 
 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
-define void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
   store <2 x half> %fneg, <2 x half> addrspace(1)* %out
   ret void
@@ -82,7 +82,7 @@
 ; GCN-LABEL: {{^}}v_fneg_v2f16:
 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
-define void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
@@ -98,7 +98,7 @@
 
 ; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]]
-define void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
   %bc = bitcast i32 %in to <2 x half>
   %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc
   store <2 x half> %fsub, <2 x half> addrspace(1)* %out
@@ -120,7 +120,7 @@
 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
 
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
-define void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val
   %fmul = fmul <2 x half> %fsub, %val
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
index b7080f4..9b4b4d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}fneg_f64:
 ; GCN: v_xor_b32
-define void @fneg_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double %in) {
   %fneg = fsub double -0.000000e+00, %in
   store double %fneg, double addrspace(1)* %out
   ret void
@@ -12,7 +12,7 @@
 ; FUNC-LABEL: {{^}}fneg_v2f64:
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
+define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
   %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
   store <2 x double> %fneg, <2 x double> addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
+define amdgpu_kernel void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
   %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
   store <4 x double> %fneg, <4 x double> addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@
 
 ; FUNC-LABEL: {{^}}fneg_free_f64:
 ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fsub = fsub double 0.0, %bc
   store double %fsub, double addrspace(1)* %out
@@ -52,7 +52,7 @@
 ; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN-NOT: xor
 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
   %fsub = fsub double -0.0, %in
   %fmul = fmul double %fsub, %in
   store double %fmul, double addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 007c6dc..d1eabfb 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -6,7 +6,7 @@
 ; R600: -PV
 
 ; GCN: v_xor_b32
-define void @s_fneg_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) {
   %fneg = fsub float -0.000000e+00, %in
   store float %fneg, float addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@
 
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
   %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
   store <2 x float> %fneg, <2 x float> addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
   %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
   store <4 x float> %fneg, <4 x float> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@
 
 ; R600-NOT: XOR
 ; R600: -KC0[2].Z
-define void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fsub = fsub float 0.0, %bc
   store float %fsub, float addrspace(1)* %out
@@ -66,7 +66,7 @@
 
 ; R600-NOT: XOR
 ; R600: -PV.W
-define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fsub = fsub float -0.0, %bc
   store float %fsub, float addrspace(1)* %out
@@ -78,7 +78,7 @@
 ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: xor
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
   %fsub = fsub float -0.0, %in
   %fmul = fmul float %fsub, %in
   store float %fmul, float addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index b7ffaed..cbc4297 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -9,7 +9,7 @@
 ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
 ; SI-NOT: v_cmp
 ; SI: s_endpgm
-define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
   %ext = zext i1 %cmp to i32
@@ -20,7 +20,7 @@
 ; SI-LABEL: {{^}}test_not_isinf_pattern_0:
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp ueq float %fabs, 0x7FF0000000000000
   %ext = zext i1 %cmp to i32
@@ -31,7 +31,7 @@
 ; SI-LABEL: {{^}}test_not_isinf_pattern_1:
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp oeq float %fabs, 0xFFF0000000000000
   %ext = zext i1 %cmp to i32
@@ -45,7 +45,7 @@
 ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
 ; SI-NOT: v_cmp
 ; SI: s_endpgm
-define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -59,7 +59,7 @@
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_0:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp une float %x.fabs, 0xFFF0000000000000
@@ -73,7 +73,7 @@
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_1:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %ninf = fcmp une float %x, 0x7FF0000000000000
   %and = and i1 %ord, %ninf
@@ -86,7 +86,7 @@
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_2:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %y) #1
   %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -100,7 +100,7 @@
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_3:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp uno float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -114,7 +114,7 @@
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_4:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp one float %x.fabs, 0x7FF0000000000000
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index 01bc53f..ce04136 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -14,7 +14,7 @@
 ; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]]
 ; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]]
 ; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]]
-define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
   %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index a9f493b..70f0c0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -8,7 +8,7 @@
 ; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
   %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
   store double %cvt, double addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 3e426e3..2c6b1cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -12,7 +12,7 @@
 ; EG: MEM_RAT MSKOR
 ; EG: VTX_READ_32
 ; EG: FLT32_TO_FLT16
-define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
index 1537d67..a7cddd0 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -6,7 +6,7 @@
 
 ; FUNC-LABEL: @fp_to_sint_f64_i32
 ; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
   %result = fptosi double %in to i32
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@
 ; FUNC-LABEL: @fp_to_sint_v2f64_v2i32
 ; SI: v_cvt_i32_f64_e32
 ; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
   %result = fptosi <2 x double> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -26,7 +26,7 @@
 ; SI: v_cvt_i32_f64_e32
 ; SI: v_cvt_i32_f64_e32
 ; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
   %result = fptosi <4 x double> %in to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -47,7 +47,7 @@
 ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
 ; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
 ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
   %val = load double, double addrspace(1)* %gep, align 8
@@ -58,7 +58,7 @@
 
 ; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}}
-define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %conv = fptosi double %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@
 
 ; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}|
-define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %in.fabs = call double @llvm.fabs.f64(double %in)
   %conv = fptosi double %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index a2fa7a1..630a718 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -8,7 +8,7 @@
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; SI: v_cvt_i32_f32_e32
 ; SI: s_endpgm
-define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
   %conv = fptosi float %in to i32
   store i32 %conv, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@
 
 ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs:
 ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
-define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
   %in.fabs = call float @llvm.fabs.f32(float %in)
   %conv = fptosi float %in.fabs to i32
   store i32 %conv, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; SI: v_cvt_i32_f32_e32
 ; SI: v_cvt_i32_f32_e32
-define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptosi <2 x float> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@
 ; SI: v_cvt_i32_f32_e32
 ; SI: v_cvt_i32_f32_e32
 ; SI: v_cvt_i32_f32_e32
-define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float>, <4 x float> addrspace(1) * %in
   %result = fptosi <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -76,7 +76,7 @@
 
 ; Check that the compiler doesn't crash with a "cannot select" error
 ; SI: s_endpgm
-define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
 entry:
   %0 = fptosi float %in to i64
   store i64 %0, i64 addrspace(1)* %out
@@ -128,7 +128,7 @@
 ; EG-DAG: CNDE_INT
 
 ; SI: s_endpgm
-define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
   %conv = fptosi <2 x float> %x to <2 x i64>
   store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
   ret void
@@ -221,7 +221,7 @@
 ; EG-DAG: CNDE_INT
 
 ; SI: s_endpgm
-define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
   %conv = fptosi <4 x float> %x to <4 x i64>
   store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
   ret void
@@ -233,7 +233,7 @@
 ; EG: AND_INT
 ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y,
 ; EG-NEXT: -1082130432(-1.000000e+00)
-define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %conv = fptosi float %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -241,7 +241,7 @@
 
 ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
 ; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}|
-define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %in.fabs = call float @llvm.fabs.f32(float %in)
   %conv = fptosi float %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
@@ -251,7 +251,7 @@
 ; FUNC-LABEL: {{^}}fp_to_sint_f32_i16:
 ; GCN: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_short [[VAL]]
-define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
   %sint = fptosi float %in to i16
   store i16 %sint, i16 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
index d5bc416..4f597eb 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -6,7 +6,7 @@
 
 ; SI-LABEL: {{^}}fp_to_uint_i32_f64:
 ; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
   %cast = fptoui double %in to i32
   store i32 %cast, i32 addrspace(1)* %out, align 4
   ret void
@@ -15,7 +15,7 @@
 ; SI-LABEL: @fp_to_uint_v2i32_v2f64
 ; SI: v_cvt_u32_f64_e32
 ; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
   %cast = fptoui <2 x double> %in to <2 x i32>
   store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -26,7 +26,7 @@
 ; SI: v_cvt_u32_f64_e32
 ; SI: v_cvt_u32_f64_e32
 ; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
   %cast = fptoui <4 x double> %in to <4 x i32>
   store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8
   ret void
@@ -47,7 +47,7 @@
 ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
 ; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
 ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
   %val = load double, double addrspace(1)* %gep, align 8
@@ -57,14 +57,14 @@
 }
 
 ; SI-LABEL: @fp_to_uint_v2i64_v2f64
-define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
   %cast = fptoui <2 x double> %in to <2 x i64>
   store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16
   ret void
 }
 
 ; SI-LABEL: @fp_to_uint_v4i64_v4f64
-define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
   %cast = fptoui <4 x double> %in to <4 x i64>
   store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32
   ret void
@@ -72,7 +72,7 @@
 
 ; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}}
-define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %conv = fptoui double %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -80,7 +80,7 @@
 
 ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}|
-define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %in.fabs = call double @llvm.fabs.f64(double %in)
   %conv = fptoui double %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index cbff9f2..fdb1580 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -9,7 +9,7 @@
 
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: s_endpgm
-define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
   %conv = fptoui float %in to i32
   store i32 %conv, i32 addrspace(1)* %out
   ret void
@@ -21,7 +21,7 @@
 
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: v_cvt_u32_f32_e32
-define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptoui <2 x float> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -37,7 +37,7 @@
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: v_cvt_u32_f32_e32
 
-define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float>, <4 x float> addrspace(1) * %in
   %result = fptoui <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -68,7 +68,7 @@
 ; EG-DAG: CNDE_INT
 
 ; GCN: s_endpgm
-define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
   %conv = fptoui float %x to i64
   store i64 %conv, i64 addrspace(1)* %out
   ret void
@@ -119,7 +119,7 @@
 ; EG-DAG: CNDE_INT
 
 ; GCN: s_endpgm
-define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
   %conv = fptoui <2 x float> %x to <2 x i64>
   store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
   ret void
@@ -212,7 +212,7 @@
 ; EG-DAG: CNDE_INT
 
 ; GCN: s_endpgm
-define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
   %conv = fptoui <4 x float> %x to <4 x i64>
   store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
   ret void
@@ -224,7 +224,7 @@
 
 ; EG: AND_INT
 ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0,
-define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %conv = fptoui float %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -232,7 +232,7 @@
 
 ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
 ; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}|
-define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %in.fabs = call float @llvm.fabs.f32(float %in)
   %conv = fptoui float %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
@@ -246,7 +246,7 @@
 ; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_short [[VAL]]
-define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
   %uint = fptoui float %in to i16
   store i16 %uint, i16 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 1f0cc15..4b03ada 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -7,7 +7,7 @@
 ; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_dword v[[R_F32]]
 ; GCN: s_endpgm
-define void @fpext_f16_to_f32(
+define amdgpu_kernel void @fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) #0 {
 entry:
@@ -23,7 +23,7 @@
 ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fpext_f16_to_f64(
+define amdgpu_kernel void @fpext_f16_to_f64(
     double addrspace(1)* %r,
     half addrspace(1)* %a) #0 {
 entry:
@@ -41,7 +41,7 @@
 ; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fpext_v2f16_to_v2f32(
+define amdgpu_kernel void @fpext_v2f16_to_v2f32(
     <2 x float> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) #0 {
 entry:
@@ -61,7 +61,7 @@
 ; GCN: v_cvt_f64_f32_e32
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @fpext_v2f16_to_v2f64(
+define amdgpu_kernel void @fpext_v2f16_to_v2f64(
     <2 x double> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -73,7 +73,7 @@
 
 ; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32:
 ; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
-define void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) {
+define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) {
 entry:
   %a.trunc = trunc i32 %a to i16
   %a.val = bitcast i16 %a.trunc to half
@@ -85,7 +85,7 @@
 ; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32:
 ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
 ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]]
-define void @fneg_fpext_f16_to_f32(
+define amdgpu_kernel void @fneg_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -99,7 +99,7 @@
 ; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32:
 ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
 ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]|
-define void @fabs_fpext_f16_to_f32(
+define amdgpu_kernel void @fabs_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -113,7 +113,7 @@
 ; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32:
 ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
 ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]|
-define void @fneg_fabs_fpext_f16_to_f32(
+define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -135,7 +135,7 @@
 
 ; GCN: store_dword [[CVT]]
 ; GCN: store_short [[XOR]]
-define void @fneg_multi_use_fpext_f16_to_f32(
+define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -159,7 +159,7 @@
 
 ; GCN: buffer_store_dword [[CVTA_NEG]]
 ; GCN: buffer_store_short [[MUL]]
-define void @fneg_multi_foldable_use_fpext_f16_to_f32(
+define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -181,7 +181,7 @@
 
 ; GCN: store_dword [[CVT]]
 ; GCN: store_short [[XOR]]
-define void @fabs_multi_use_fpext_f16_to_f32(
+define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -205,7 +205,7 @@
 
 ; GCN: buffer_store_dword [[ABS_A]]
 ; GCN: buffer_store_short [[MUL]]
-define void @fabs_multi_foldable_use_fpext_f16_to_f32(
+define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -227,7 +227,7 @@
 
 ; GCN: buffer_store_dword [[CVT]]
 ; GCN: buffer_store_short [[OR]]
-define void @fabs_fneg_multi_use_fpext_f16_to_f32(
+define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -252,7 +252,7 @@
 
 ; GCN: buffer_store_dword [[FABS_FNEG]]
 ; GCN: buffer_store_short [[MUL]]
-define void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
+define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
     float addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.ll b/llvm/test/CodeGen/AMDGPU/fpext.ll
index 6dc84b0..b11e2ea 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}fpext_f32_to_f64:
 ; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
   %result = fpext float %in to double
   store double %result, double addrspace(1)* %out
   ret void
@@ -12,7 +12,7 @@
 ; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64:
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
   %result = fpext <2 x float> %in to <2 x double>
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
@@ -22,7 +22,7 @@
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
+define amdgpu_kernel void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
   %result = fpext <3 x float> %in to <3 x double>
   store <3 x double> %result, <3 x double> addrspace(1)* %out
   ret void
@@ -33,7 +33,7 @@
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
   %result = fpext <4 x float> %in to <4 x double>
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
+define amdgpu_kernel void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
   %result = fpext <8 x float> %in to <8 x double>
   store <8 x double> %result, <8 x double> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 71f56d7..2c25a22 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -7,7 +7,7 @@
 ; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_I16]]
 ; GCN: s_endpgm
-define void @fptosi_f16_to_i16(
+define amdgpu_kernel void @fptosi_f16_to_i16(
     i16 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -23,7 +23,7 @@
 ; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fptosi_f16_to_i32(
+define amdgpu_kernel void @fptosi_f16_to_i32(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -40,7 +40,7 @@
 ; GCN: buffer_load_ushort
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: s_endpgm
-define void @fptosi_f16_to_i64(
+define amdgpu_kernel void @fptosi_f16_to_i64(
     i64 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -62,7 +62,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
 ; GCN: buffer_store_dword v[[R_V2_I16]]
 ; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i16(
+define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
     <2 x i16> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -80,7 +80,7 @@
 ; GCN: v_cvt_i32_f32_e32
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i32(
+define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -98,7 +98,7 @@
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i64(
+define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
     <2 x i64> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index a687662..10f59c8 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -8,7 +8,7 @@
 ; VI:  v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_I16]]
 ; GCN: s_endpgm
-define void @fptoui_f16_to_i16(
+define amdgpu_kernel void @fptoui_f16_to_i16(
     i16 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -24,7 +24,7 @@
 ; GCN: v_cvt_u32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fptoui_f16_to_i32(
+define amdgpu_kernel void @fptoui_f16_to_i32(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -41,7 +41,7 @@
 ; GCN: buffer_load_ushort
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: s_endpgm
-define void @fptoui_f16_to_i64(
+define amdgpu_kernel void @fptoui_f16_to_i64(
     i64 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -64,7 +64,7 @@
 ; GCN:     v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
 ; GCN:     buffer_store_dword v[[R_V2_I16]]
 ; GCN:     s_endpgm
-define void @fptoui_v2f16_to_v2i16(
+define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
     <2 x i16> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -82,7 +82,7 @@
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @fptoui_v2f16_to_v2i32(
+define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -100,7 +100,7 @@
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: s_endpgm
-define void @fptoui_v2f16_to_v2i64(
+define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
     <2 x i64> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 0719486..ef6d778 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -8,7 +8,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fptrunc_f32_to_f16(
+define amdgpu_kernel void @fptrunc_f32_to_f16(
     half addrspace(1)* %r,
     float addrspace(1)* %a) {
 entry:
@@ -24,7 +24,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fptrunc_f64_to_f16(
+define amdgpu_kernel void @fptrunc_f64_to_f16(
     half addrspace(1)* %r,
     double addrspace(1)* %a) {
 entry:
@@ -48,7 +48,7 @@
 
 ; GCN:     buffer_store_dword v[[R_V2_F16]]
 ; GCN:     s_endpgm
-define void @fptrunc_v2f32_to_v2f16(
+define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x float> addrspace(1)* %a) {
 entry:
@@ -74,7 +74,7 @@
 ; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
-define void @fptrunc_v2f64_to_v2f16(
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x double> addrspace(1)* %a) {
 entry:
@@ -89,7 +89,7 @@
 ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fneg_fptrunc_f32_to_f16(
+define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
     half addrspace(1)* %r,
     float addrspace(1)* %a) {
 entry:
@@ -105,7 +105,7 @@
 ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fabs_fptrunc_f32_to_f16(
+define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
     half addrspace(1)* %r,
     float addrspace(1)* %a) {
 entry:
@@ -121,7 +121,7 @@
 ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]|
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fneg_fabs_fptrunc_f32_to_f16(
+define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
     half addrspace(1)* %r,
     float addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 0c7b674..d9c5b7e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
 ; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -14,7 +14,7 @@
 ; GCN-NOT: v_cvt
 ; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]]
 ; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]]
-define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to half
   %result_i16 = bitcast half %result to i16
   store i16 %result_i16, i16 addrspace(1)* %out
@@ -24,7 +24,7 @@
 ; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
   %result = fptrunc <2 x double> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -35,7 +35,7 @@
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
   %result = fptrunc <4 x double> %in to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
+define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
   %result = fptrunc <8 x double> %in to <8 x float>
   store <8 x float> %result, <8 x float> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
index 0651dce..7a5bcff 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
@@ -27,7 +27,7 @@
 ; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
 
 ; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %floor.x = call double @llvm.floor.f64(double %x)
   %fract = fsub double %x, %floor.x
@@ -54,7 +54,7 @@
 ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]]
 
 ; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %neg.x = fsub double -0.0, %x
   %floor.neg.x = call double @llvm.floor.f64(double %neg.x)
@@ -82,7 +82,7 @@
 ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]|
 
 ; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %abs.x = call double @llvm.fabs.f64(double %x)
   %neg.abs.x = fsub double -0.0, %abs.x
@@ -98,7 +98,7 @@
 ; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
 ; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]]
 ; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]]
-define void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %floor.x = call double @llvm.floor.f64(double %x)
   %fract = fsub double %x, %floor.x
diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll
index 4e1a503..207fe28 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.ll
@@ -14,7 +14,7 @@
 ; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
 
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %floor.x = call float @llvm.floor.f32(float %x)
   %fract = fsub float %x, %floor.x
@@ -29,7 +29,7 @@
 ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
 
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %x.neg = fsub float -0.0, %x
   %floor.x.neg = call float @llvm.floor.f32(float %x.neg)
@@ -45,7 +45,7 @@
 ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
 
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %abs.x = call float @llvm.fabs.f32(float %x)
   %neg.abs.x = fsub float -0.0, %abs.x
@@ -61,7 +61,7 @@
 
 ; GCN-UNSAFE: buffer_store_dword [[FLOOR]]
 ; GCN-UNSAFE: buffer_store_dword [[FRACT]]
-define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %floor.x = call float @llvm.floor.f32(float %x)
   %fract = fsub float %x, %floor.x
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 58b8308..9778069 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -15,7 +15,7 @@
 ; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                       float addrspace(1)* %in2) #0 {
    %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
    %r0 = load float, float addrspace(1)* %in1, align 4
@@ -33,7 +33,7 @@
 ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
 ; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                              float addrspace(1)* %in2) #1 {
    %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
    %r0 = load float, float addrspace(1)* %in1, align 4
@@ -54,7 +54,7 @@
 ; GCN: v_add_f64
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) #0 {
    %r0 = load double, double addrspace(1)* %in1, align 8
    %r1 = load double, double addrspace(1)* %in2, align 8
@@ -70,7 +70,7 @@
 ; CI: v_trunc_f64_e32
 ; GCN: v_fma_f64
 ; GCN: s_endpgm
-define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                              double addrspace(1)* %in2) #1 {
    %r0 = load double, double addrspace(1)* %in1, align 8
    %r1 = load double, double addrspace(1)* %in2, align 8
@@ -79,7 +79,7 @@
    ret void
 }
 
-define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                         <2 x float> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
    %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
@@ -89,7 +89,7 @@
    ret void
 }
 
-define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                         <4 x float> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
    %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
@@ -99,7 +99,7 @@
    ret void
 }
 
-define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                         <2 x double> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index ed04043..453d8fb 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
 ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %r0 = load double, double addrspace(1)* %in
   %r1 = call double @llvm.sqrt.f64(double %r0)
   store double %r1, double addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64:
 ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
+define amdgpu_kernel void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
   %r0 = load double, double addrspace(1)* %in
   %r1 = call double @llvm.sqrt.f64(double %r0)
   store double %r1, double addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.ll
index b6526b8..a0fd341 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.ll
@@ -7,7 +7,7 @@
 
 ; FUNC-LABEL: {{^}}v_safe_fsqrt_f32:
 ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %r0 = load float, float addrspace(1)* %in
   %r1 = call float @llvm.sqrt.f32(float %r0)
   store float %r1, float addrspace(1)* %out
@@ -16,7 +16,7 @@
 
 ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32:
 ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
+define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
   %r0 = load float, float addrspace(1)* %in
   %r1 = call float @llvm.sqrt.f32(float %r0)
   store float %r1, float addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
 entry:
   %fdiv = call float @llvm.sqrt.f32(float %in)
   store float %fdiv, float addrspace(1)* %out
@@ -44,7 +44,7 @@
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
 ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -65,7 +65,7 @@
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
 ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-define void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
 entry:
   %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
   store <4 x float> %fdiv, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@
 ; FUNC-LABEL: {{^}}elim_redun_check_neg0:
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
 entry:
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp olt float %in, -0.000000e+00
@@ -87,7 +87,7 @@
 ; FUNC-LABEL: {{^}}elim_redun_check_pos0:
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
 entry:
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp olt float %in, 0.000000e+00
@@ -99,7 +99,7 @@
 ; FUNC-LABEL: {{^}}elim_redun_check_ult:
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
 entry:
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp ult float %in, -0.000000e+00
@@ -112,7 +112,7 @@
 ; GCN: v_sqrt_f32_e32
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
   %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
@@ -125,7 +125,7 @@
 ; GCN: v_sqrt_f32_e32
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
   %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index 178bf80..972a8bb 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -12,7 +12,7 @@
 ; GFX89:  v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fsub_f16(
+define amdgpu_kernel void @fsub_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -32,7 +32,7 @@
 ; GFX89:  v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fsub_f16_imm_a(
+define amdgpu_kernel void @fsub_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -50,7 +50,7 @@
 ; GFX89:  v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fsub_f16_imm_b(
+define amdgpu_kernel void @fsub_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -89,7 +89,7 @@
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fsub_v2f16(
+define amdgpu_kernel void @fsub_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -123,7 +123,7 @@
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fsub_v2f16_imm_a(
+define amdgpu_kernel void @fsub_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -154,7 +154,7 @@
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fsub_v2f16_imm_b(
+define amdgpu_kernel void @fsub_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.ll b/llvm/test/CodeGen/AMDGPU/fsub.ll
index de3ab69..e7a92d9 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}v_fsub_f32:
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -17,7 +17,7 @@
 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
 
 ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
   %sub = fsub float %a, %b
   store float %sub, float addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@
 
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
   %sub = fsub <2 x float> %a, %b
   store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -45,7 +45,7 @@
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
   %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
@@ -60,7 +60,7 @@
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: s_endpgm
-define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
+define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
   %result = fsub <4 x float> %a, %b
   store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -69,7 +69,7 @@
 ; FUNC-LABEL: {{^}}v_fneg_fsub_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
-define void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -82,7 +82,7 @@
 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI-NOT: xor
-define void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -95,7 +95,7 @@
 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI-NOT: xor
-define void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -111,7 +111,7 @@
 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
-define void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -123,7 +123,7 @@
 
 ; FUNC-LABEL: {{^}}v_fsub_0_nsz_attribute_f32:
 ; SI-NOT: v_sub
-define void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %a = load float, float addrspace(1)* %in, align 4
   %result = fsub float %a, 0.0
   store float %result, float addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fsub64.ll b/llvm/test/CodeGen/AMDGPU/fsub64.ll
index 4c9c5dd..1b0879d 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub64.ll
@@ -5,7 +5,7 @@
 
 ; SI-LABEL: {{^}}fsub_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -16,7 +16,7 @@
 
 ; SI-LABEL: {{^}}fsub_fabs_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
-define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                            double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -28,7 +28,7 @@
 
 ; SI-LABEL: {{^}}fsub_fabs_inv_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                                double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -40,7 +40,7 @@
 
 ; SI-LABEL: {{^}}s_fsub_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double %a, %b
   store double %sub, double addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@
 
 ; SI-LABEL: {{^}}s_fsub_imm_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0
-define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double 4.0, %a
   store double %sub, double addrspace(1)* %out
   ret void
@@ -56,7 +56,7 @@
 
 ; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0
-define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double %a, 4.0
   store double %sub, double addrspace(1)* %out
   ret void
@@ -64,7 +64,7 @@
 
 ; SI-LABEL: {{^}}s_fsub_self_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
+define amdgpu_kernel void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
   %sub = fsub double %a, %a
   store double %sub, double addrspace(1)* %out
   ret void
@@ -73,7 +73,7 @@
 ; SI-LABEL: {{^}}fsub_v2f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
+define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
   %sub = fsub <2 x double> %a, %b
   store <2 x double> %sub, <2 x double> addrspace(1)* %out
   ret void
@@ -84,7 +84,7 @@
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
   %a = load <4 x double>, <4 x double> addrspace(1)* %in
   %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr
@@ -98,7 +98,7 @@
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
+define amdgpu_kernel void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
   %result = fsub <4 x double> %a, %b
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
index c4138ad..1f72ec6 100644
--- a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -13,7 +13,7 @@
 ; CI: v_trunc_f64
 ; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
 ; SI: s_endpgm
-define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %x = load double, double addrspace(1)* %in, align 8
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out, align 8
@@ -36,7 +36,7 @@
 ; SI-DAG: cndmask_b32
 ; SI-DAG: cndmask_b32
 ; SI: s_endpgm
-define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -45,7 +45,7 @@
 ; FUNC-LABEL: {{^}}ftrunc_v2f64:
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -55,7 +55,7 @@
 ; FIXME-CI: v_trunc_f64_e32
 ; FIXME-CI: v_trunc_f64_e32
 ; FIXME-CI: v_trunc_f64_e32
-; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
@@ -66,7 +66,7 @@
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -104,7 +104,7 @@
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.ll
index d071839..b5ad01e 100644
--- a/llvm/test/CodeGen/AMDGPU/ftrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/ftrunc.ll
@@ -12,7 +12,7 @@
 ; FUNC-LABEL: {{^}}ftrunc_f32:
 ; EG: TRUNC
 ; SI: v_trunc_f32_e32
-define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) {
   %y = call float @llvm.trunc.f32(float %x) nounwind readnone
   store float %y, float addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@
 ; EG: TRUNC
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
   store <2 x float> %y, <2 x float> addrspace(1)* %out
   ret void
@@ -36,7 +36,7 @@
 ; FIXME-SI: v_trunc_f32_e32
 ; FIXME-SI: v_trunc_f32_e32
 ; FIXME-SI: v_trunc_f32_e32
-; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+; define amdgpu_kernel void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
 ;   %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
 ;   store <3 x float> %y, <3 x float> addrspace(1)* %out
 ;   ret void
@@ -51,7 +51,7 @@
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
   store <4 x float> %y, <4 x float> addrspace(1)* %out
   ret void
@@ -74,7 +74,7 @@
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
   store <8 x float> %y, <8 x float> addrspace(1)* %out
   ret void
@@ -113,7 +113,7 @@
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
   store <16 x float> %y, <16 x float> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
index f964636..7fb47e0 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
-define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
+define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
 ; CHECK-LABEL: {{^}}use_gep_address_space:
 ; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
 ; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
@@ -17,7 +17,7 @@
 ; SI: s_or_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
-define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
+define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
   %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
   store i32 99, i32 addrspace(3)* %p
   ret void
@@ -39,7 +39,7 @@
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CHECK: s_endpgm
-define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
+define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
   %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -60,7 +60,7 @@
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CHECK: s_endpgm
-define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
+define amdgpu_kernel void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
   %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
   %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll
index 5a18d42..80acfcc 100644
--- a/llvm/test/CodeGen/AMDGPU/global-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll
@@ -26,7 +26,7 @@
 ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4
 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4
 
-define void @private_test(i32 %index, float addrspace(1)* %out) {
+define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
   %ptr = getelementptr [4 x float], [4 x float] addrspace(2)* @private1, i32 0, i32 %index
   %val = load float, float addrspace(2)* %ptr
   store float %val, float addrspace(1)* %out
@@ -40,7 +40,7 @@
 ; HSA: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
 ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4
-define void @available_externally_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1
   %val = load i32, i32 addrspace(2)* %ptr
   store i32 %val, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/global-directive.ll b/llvm/test/CodeGen/AMDGPU/global-directive.ll
index 450b7d3..ce89e39 100644
--- a/llvm/test/CodeGen/AMDGPU/global-directive.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-directive.ll
@@ -5,7 +5,7 @@
 
 ; SI:	.globl	foo
 ; SI: {{^}}foo:
-define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
index 2c7c02d..19e592f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -7,7 +7,7 @@
 ; SI: buffer_load_ushort
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@
 ; SI: buffer_load_sshort
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@
 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
 ; SI: buffer_load_ushort
 ; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -38,7 +38,7 @@
 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
 ; SI: buffer_load_sshort
 ; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -47,7 +47,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -56,7 +56,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -65,7 +65,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -74,7 +74,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -83,7 +83,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -92,7 +92,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -101,7 +101,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -110,7 +110,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -119,7 +119,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -128,7 +128,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -146,7 +146,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -157,7 +157,7 @@
 ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -168,7 +168,7 @@
 ; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
 ; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0
-define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -177,7 +177,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -186,7 +186,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -195,7 +195,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -204,7 +204,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -213,7 +213,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -222,7 +222,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -231,7 +231,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -240,7 +240,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -258,7 +258,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -267,7 +267,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -276,7 +276,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -285,7 +285,7 @@
 
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -294,7 +294,7 @@
 
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
index 00be6e4..ae6dd54 100644
--- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -19,7 +19,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @private_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -33,7 +33,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @internal_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -50,7 +50,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @available_externally_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -67,7 +67,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @linkonce_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -84,7 +84,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @weak_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -101,7 +101,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @common_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -118,7 +118,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @extern_weak_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -135,7 +135,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @linkonce_odr_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -152,7 +152,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @weak_odr_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -169,7 +169,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @external_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -186,7 +186,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @external_w_init_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 909ceb55..6928bed 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_offset:
 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
 ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -25,7 +25,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd
 ; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_add
-define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
 
@@ -36,7 +36,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
 ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -47,7 +47,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -59,7 +59,7 @@
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -70,7 +70,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32:
 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -79,7 +79,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret:
 ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -89,7 +89,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -100,7 +100,7 @@
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -110,7 +110,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_offset:
 ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -120,7 +120,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
 ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -131,7 +131,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -143,7 +143,7 @@
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -154,7 +154,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32:
 ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -163,7 +163,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret:
 ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -173,7 +173,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -184,7 +184,7 @@
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -194,7 +194,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
 ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -204,7 +204,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
 ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -215,7 +215,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -227,7 +227,7 @@
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -238,7 +238,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32:
 ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -247,7 +247,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
 ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -257,7 +257,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -268,7 +268,7 @@
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -278,7 +278,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_offset:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -288,7 +288,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
 ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -299,7 +299,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -311,7 +311,7 @@
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -322,7 +322,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -331,7 +331,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret:
 ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -341,7 +341,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -352,7 +352,7 @@
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -362,7 +362,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
 ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -372,7 +372,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
 ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -383,7 +383,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -395,7 +395,7 @@
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -406,7 +406,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32:
 ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -415,7 +415,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
 ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -425,7 +425,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -436,7 +436,7 @@
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -446,7 +446,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_offset:
 ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -456,7 +456,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
 ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -467,7 +467,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -479,7 +479,7 @@
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -490,7 +490,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32:
 ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -499,7 +499,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret:
 ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -509,7 +509,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -520,7 +520,7 @@
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -530,7 +530,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
 ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -540,7 +540,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
 ; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -551,7 +551,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -563,7 +563,7 @@
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -574,7 +574,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32:
 ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -583,7 +583,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -593,7 +593,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -604,7 +604,7 @@
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -614,7 +614,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_offset:
 ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -624,7 +624,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
 ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -635,7 +635,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -647,7 +647,7 @@
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -658,7 +658,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32:
 ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -667,7 +667,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret:
 ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -677,7 +677,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -688,7 +688,7 @@
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -698,7 +698,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
 ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -708,7 +708,7 @@
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -720,7 +720,7 @@
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 
 ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -733,7 +733,7 @@
 
 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -744,7 +744,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32:
 ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -753,7 +753,7 @@
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
 ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -763,7 +763,7 @@
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -774,7 +774,7 @@
 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_swap [[RET:v[0-9]+]],  v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -784,7 +784,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -794,7 +794,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
 ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -807,7 +807,7 @@
 ; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 
 ; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -819,7 +819,7 @@
 ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -831,7 +831,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
 ; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
@@ -840,7 +840,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
 ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
@@ -851,7 +851,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
 ; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -862,7 +862,7 @@
 ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -873,7 +873,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -883,7 +883,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
 ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -894,7 +894,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -906,7 +906,7 @@
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -917,7 +917,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32:
 ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -926,7 +926,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
 ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -936,7 +936,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -947,7 +947,7 @@
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -959,7 +959,7 @@
 ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
   %val = load atomic i32, i32 addrspace(1)* %gep  seq_cst, align 4
@@ -971,7 +971,7 @@
 ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
   store i32 %val, i32 addrspace(1)* %out
@@ -982,7 +982,7 @@
 ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -995,7 +995,7 @@
 ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
   %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
@@ -1006,7 +1006,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32_offset:
 ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   store atomic i32 %in, i32 addrspace(1)* %gep  seq_cst, align 4
@@ -1016,7 +1016,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32:
 ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
   ret void
@@ -1025,7 +1025,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
 ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -1036,7 +1036,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
 ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index f66c6c7..56520b7 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
 ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -24,7 +24,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
 ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -36,7 +36,7 @@
 ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -47,7 +47,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64:
 ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -56,7 +56,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_ret:
 ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -66,7 +66,7 @@
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64:
 ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -77,7 +77,7 @@
 ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -87,7 +87,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i64_offset:
 ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -97,7 +97,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
 ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -108,7 +108,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
 ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -120,7 +120,7 @@
 ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -131,7 +131,7 @@
 
 ; GCN-LABEL: {{^}}atomic_and_i64:
 ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -140,7 +140,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_ret:
 ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -150,7 +150,7 @@
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64:
 ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -161,7 +161,7 @@
 ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -171,7 +171,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_offset:
 ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -181,7 +181,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
 ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -192,7 +192,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
 ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -204,7 +204,7 @@
 ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -215,7 +215,7 @@
 
 ; GCN-LABEL: {{^}}atomic_sub_i64:
 ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -224,7 +224,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret:
 ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -234,7 +234,7 @@
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
 ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -245,7 +245,7 @@
 ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -255,7 +255,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i64_offset:
 ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -265,7 +265,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
 ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -276,7 +276,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
 ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -288,7 +288,7 @@
 ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -299,7 +299,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i64:
 ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -308,7 +308,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_ret:
 ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -318,7 +318,7 @@
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64:
 ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -329,7 +329,7 @@
 ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -339,7 +339,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_offset:
 ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -349,7 +349,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
 ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -360,7 +360,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
 ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -372,7 +372,7 @@
 ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -383,7 +383,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umax_i64:
 ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -392,7 +392,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret:
 ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -402,7 +402,7 @@
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
 ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -413,7 +413,7 @@
 ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -423,7 +423,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i64_offset:
 ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -433,7 +433,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
 ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -444,7 +444,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
 ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -456,7 +456,7 @@
 ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -467,7 +467,7 @@
 
 ; GCN-LABEL: {{^}}atomic_min_i64:
 ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -476,7 +476,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_ret:
 ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -486,7 +486,7 @@
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64:
 ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -497,7 +497,7 @@
 ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -507,7 +507,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_offset:
 ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -517,7 +517,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
 ; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -528,7 +528,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
 ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -540,7 +540,7 @@
 ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -551,7 +551,7 @@
 
 ; GCN-LABEL: {{^}}atomic_umin_i64:
 ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -560,7 +560,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret:
 ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -570,7 +570,7 @@
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
 ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -581,7 +581,7 @@
 ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -591,7 +591,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i64_offset:
 ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -601,7 +601,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
 ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -612,7 +612,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
 ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -624,7 +624,7 @@
 ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -635,7 +635,7 @@
 
 ; GCN-LABEL: {{^}}atomic_or_i64:
 ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -644,7 +644,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_ret:
 ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -654,7 +654,7 @@
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64:
 ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -665,7 +665,7 @@
 ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -675,7 +675,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
 ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -685,7 +685,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -696,7 +696,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
 ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -708,7 +708,7 @@
 ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -719,7 +719,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64:
 ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -728,7 +728,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
 ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -738,7 +738,7 @@
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
 ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -749,7 +749,7 @@
 ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]],  v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -759,7 +759,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_offset:
 ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -769,7 +769,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
 ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -780,7 +780,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
 ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -792,7 +792,7 @@
 ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -803,7 +803,7 @@
 
 ; GCN-LABEL: {{^}}atomic_xor_i64:
 ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -812,7 +812,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret:
 ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -822,7 +822,7 @@
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
 ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -833,7 +833,7 @@
 ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -851,7 +851,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_offset:
 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -861,7 +861,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
 ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940
 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-define void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
   %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -871,7 +871,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
 ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -884,7 +884,7 @@
 ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 
 ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -896,7 +896,7 @@
 ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -908,7 +908,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64:
 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
   ret void
@@ -917,7 +917,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret:
 ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
@@ -928,7 +928,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
 ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -939,7 +939,7 @@
 ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -952,7 +952,7 @@
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
   %val = load atomic i64, i64 addrspace(1)* %gep  seq_cst, align 8
@@ -964,7 +964,7 @@
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
   %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
   store i64 %val, i64 addrspace(1)* %out
@@ -975,7 +975,7 @@
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -988,7 +988,7 @@
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
   %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
@@ -999,7 +999,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i64_offset:
 ; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   store atomic i64 %in, i64 addrspace(1)* %gep  seq_cst, align 8
@@ -1009,7 +1009,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i64:
 ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
-define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
 entry:
   store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8
   ret void
@@ -1018,7 +1018,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i64_addr64_offset:
 ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -1029,7 +1029,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i64_addr64:
 ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index d07843e..0903542 100644
--- a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -15,7 +15,7 @@
 ; EG: @float_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @float(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) {
 entry:
   %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
   %1 = load float, float addrspace(2)* %0
@@ -33,7 +33,7 @@
 ; EG: @i32_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @i32(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
   %1 = load i32, i32 addrspace(2)* %0
@@ -53,7 +53,7 @@
 ; EG: @struct_foo_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
   %load = load i32, i32 addrspace(2)* %gep, align 4
   store i32 %load, i32 addrspace(1)* %out, align 4
@@ -72,7 +72,7 @@
 ; EG: @array_v1_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
   %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
   store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
@@ -84,7 +84,7 @@
 ; EG: VTX_READ_32
 ; EG: @float_gv
 ; EG-NOT: MOVA_INT
-define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
+define amdgpu_kernel void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
 entry:
   %0 = icmp eq i32 0, %a
   br i1 %0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll b/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll
index 2b5af75..e641d72 100644
--- a/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll
@@ -13,7 +13,7 @@
 
 ; CHECK-LABEL: lds_no_offset:
 ; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4
-define void @lds_no_offset() {
+define amdgpu_kernel void @lds_no_offset() {
 entry:
   %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1
   store i32 0, i32 addrspace(3)* %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 4b48950..82a6c10 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -8,7 +8,7 @@
 ; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
 ; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]]
 ; GCN: buffer_store_short [[CVT]]
-define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
   store half %arg, half addrspace(1)* %out
   ret void
 }
@@ -20,7 +20,7 @@
 ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: s_endpgm
-define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
   store <2 x half> %arg, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -34,7 +34,7 @@
 ; GCN-DAG: buffer_store_short
 ; GCN-NOT: buffer_store
 ; GCN: s_endpgm
-define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
   store <3 x half> %arg, <3 x half> addrspace(1)* %out
   ret void
 }
@@ -46,33 +46,33 @@
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
   store <4 x half> %arg, <4 x half> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_v8f16_arg:
-define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
   store <8 x half> %arg, <8 x half> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_v2f16_arg:
-define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
   %fpext = fpext <2 x half> %in to <2 x float>
   store <2 x float> %fpext, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
-define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
   %ext = fpext half %arg to float
   store float %ext, float addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
-define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
   %ext = fpext <2 x half> %arg to <2 x float>
   store <2 x float> %ext, <2 x float> addrspace(1)* %out
   ret void
@@ -90,14 +90,14 @@
 ; GCN-DAG: buffer_store_dword
 ; GCN-DAG: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
   %ext = fpext <3 x half> %arg to <3 x float>
   store <3 x float> %ext, <3 x float> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
-define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
   %ext = fpext <4 x half> %arg to <4 x float>
   store <4 x float> %ext, <4 x float> addrspace(1)* %out
   ret void
@@ -124,7 +124,7 @@
 
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
   %ext = fpext <8 x half> %arg to <8 x float>
   store <8 x float> %ext, <8 x float> addrspace(1)* %out
   ret void
@@ -138,7 +138,7 @@
 ; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]]
 ; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
   %ext = fpext half %arg to double
   store double %ext, double addrspace(1)* %out
   ret void
@@ -152,7 +152,7 @@
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN: s_endpgm
-define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
   %ext = fpext <2 x half> %arg to <2 x double>
   store <2 x double> %ext, <2 x double> addrspace(1)* %out
   ret void
@@ -169,7 +169,7 @@
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN: s_endpgm
-define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
   %ext = fpext <3 x half> %arg to <3 x double>
   store <3 x double> %ext, <3 x double> addrspace(1)* %out
   ret void
@@ -189,7 +189,7 @@
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN: s_endpgm
-define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
   %ext = fpext <4 x half> %arg to <4 x double>
   store <4 x double> %ext, <4 x double> addrspace(1)* %out
   ret void
@@ -227,7 +227,7 @@
 ; GCN-DAG: v_cvt_f64_f32_e32
 
 ; GCN: s_endpgm
-define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
   %ext = fpext <8 x half> %arg to <8 x double>
   store <8 x double> %ext, <8 x double> addrspace(1)* %out
   ret void
@@ -236,7 +236,7 @@
 ; GCN-LABEL: {{^}}global_load_store_f16:
 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
 ; GCN: buffer_store_short [[TMP]]
-define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   store half %val, half addrspace(1)* %out
   ret void
@@ -245,7 +245,7 @@
 ; GCN-LABEL: {{^}}global_load_store_v2f16:
 ; GCN: buffer_load_dword [[TMP:v[0-9]+]]
 ; GCN: buffer_store_dword [[TMP]]
-define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   store <2 x half> %val, <2 x half> addrspace(1)* %out
   ret void
@@ -254,7 +254,7 @@
 ; GCN-LABEL: {{^}}global_load_store_v4f16:
 ; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx2 [[TMP]]
-define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
   %val = load <4 x half>, <4 x half> addrspace(1)* %in
   store <4 x half> %val, <4 x half> addrspace(1)* %out
   ret void
@@ -264,7 +264,7 @@
 ; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
 ; GCN: s_endpgm
-define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
   %val = load <8 x half>, <8 x half> addrspace(1)* %in
   store <8 x half> %val, <8 x half> addrspace(1)* %out
   ret void
@@ -274,7 +274,7 @@
 ; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
 ; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
 ; GCN: buffer_store_dword [[CVT]]
-define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   %cvt = fpext half %val to float
   store float %cvt, float addrspace(1)* %out
@@ -289,7 +289,7 @@
 ; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
 ; GCN: s_endpgm
-define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %cvt = fpext <2 x half> %val to <2 x float>
   store <2 x float> %cvt, <2 x float> addrspace(1)* %out
@@ -297,7 +297,7 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
-define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
   %val = load <3 x half>, <3 x half> addrspace(1)* %in
   %cvt = fpext <3 x half> %val to <3 x float>
   store <3 x float> %cvt, <3 x float> addrspace(1)* %out
@@ -305,7 +305,7 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
-define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
   %val = load <4 x half>, <4 x half> addrspace(1)* %in
   %cvt = fpext <4 x half> %val to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out
@@ -313,7 +313,7 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
-define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
   %val = load <8 x half>, <8 x half> addrspace(1)* %in
   %cvt = fpext <8 x half> %val to <8 x float>
   store <8 x float> %cvt, <8 x float> addrspace(1)* %out
@@ -347,7 +347,7 @@
 ; GCN: buffer_store_dwordx4
 
 ; GCN: s_endpgm
-define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
   %val = load <16 x half>, <16 x half> addrspace(1)* %in
   %cvt = fpext <16 x half> %val to <16 x float>
   store <16 x float> %cvt, <16 x float> addrspace(1)* %out
@@ -359,7 +359,7 @@
 ; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
 ; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
 ; GCN: buffer_store_dwordx2 [[CVT1]]
-define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   %cvt = fpext half %val to double
   store double %cvt, double addrspace(1)* %out
@@ -375,7 +375,7 @@
 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %cvt = fpext <2 x half> %val to <2 x double>
   store <2 x double> %cvt, <2 x double> addrspace(1)* %out
@@ -413,7 +413,7 @@
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
 ; GCN: s_endpgm
-define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
   %val = load <3 x half>, <3 x half> addrspace(1)* %in
   %cvt = fpext <3 x half> %val to <3 x double>
   store <3 x double> %cvt, <3 x double> addrspace(1)* %out
@@ -421,7 +421,7 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
-define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
   %val = load <4 x half>, <4 x half> addrspace(1)* %in
   %cvt = fpext <4 x half> %val to <4 x double>
   store <4 x double> %cvt, <4 x double> addrspace(1)* %out
@@ -429,7 +429,7 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
-define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
   %val = load <8 x half>, <8 x half> addrspace(1)* %in
   %cvt = fpext <8 x half> %val to <8 x double>
   store <8 x double> %cvt, <8 x double> addrspace(1)* %out
@@ -437,7 +437,7 @@
 }
 
 ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
-define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
   %val = load <16 x half>, <16 x half> addrspace(1)* %in
   %cvt = fpext <16 x half> %val to <16 x double>
   store <16 x double> %cvt, <16 x double> addrspace(1)* %out
@@ -448,7 +448,7 @@
 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
 ; GCN: buffer_store_short [[CVT]]
-define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %val = load float, float addrspace(1)* %in
   %cvt = fptrunc float %val to half
   store half %cvt, half addrspace(1)* %out
@@ -463,7 +463,7 @@
 ; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
 ; GCN-DAG: buffer_store_dword [[PACKED]]
 ; GCN: s_endpgm
-define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
   %val = load <2 x float>, <2 x float> addrspace(1)* %in
   %cvt = fptrunc <2 x float> %val to <2 x half>
   store <2 x half> %cvt, <2 x half> addrspace(1)* %out
@@ -479,7 +479,7 @@
 ; GCN: buffer_store_short
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %val = load <3 x float>, <3 x float> addrspace(1)* %in
   %cvt = fptrunc <3 x float> %val to <3 x half>
   store <3 x half> %cvt, <3 x half> addrspace(1)* %out
@@ -494,7 +494,7 @@
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %val = load <4 x float>, <4 x float> addrspace(1)* %in
   %cvt = fptrunc <4 x float> %val to <4 x half>
   store <4 x half> %cvt, <4 x half> addrspace(1)* %out
@@ -514,7 +514,7 @@
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
   %val = load <8 x float>, <8 x float> addrspace(1)* %in
   %cvt = fptrunc <8 x float> %val to <8 x half>
   store <8 x half> %cvt, <8 x half> addrspace(1)* %out
@@ -545,7 +545,7 @@
 ; GCN-DAG: buffer_store_dwordx4
 ; GCN-DAG: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
   %val = load <16 x float>, <16 x float> addrspace(1)* %in
   %cvt = fptrunc <16 x float> %val to <16 x half>
   store <16 x half> %cvt, <16 x half> addrspace(1)* %out
@@ -560,7 +560,7 @@
 ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
    %add = fadd half %a, %b
    store half %add, half addrspace(1)* %out, align 4
    ret void
@@ -570,7 +570,7 @@
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
   %add = fadd <2 x half> %a, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
   ret void
@@ -582,7 +582,7 @@
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
   %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
   %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
@@ -601,7 +601,7 @@
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
   %add = fadd <8 x half> %a, %b
   store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
   ret void
@@ -610,7 +610,7 @@
 ; GCN-LABEL: {{^}}test_bitcast_from_half:
 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
 ; GCN: buffer_store_short [[TMP]]
-define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
   %val = load half, half addrspace(1)* %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, i16 addrspace(1)* %out
@@ -620,7 +620,7 @@
 ; GCN-LABEL: {{^}}test_bitcast_to_half:
 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
 ; GCN: buffer_store_short [[TMP]]
-define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %val = load i16, i16 addrspace(1)* %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, half addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll b/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll
index 631d6de..45efe9b 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll
@@ -4,7 +4,7 @@
 ; unsupported device.
 
 ; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
-define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
   store float 0.0, float addrspace(1)* %out0
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index ecb825a..b1901cf 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -4,7 +4,7 @@
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -14,7 +14,7 @@
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -24,7 +24,7 @@
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -34,7 +34,7 @@
 ; GCN: float_mode = 48
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -44,7 +44,7 @@
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -54,7 +54,7 @@
 ; GCN: float_mode = 0
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -64,7 +64,7 @@
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 0
 ; GCN: enable_ieee_mode = 1
-define void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
+define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-globals.ll b/llvm/test/CodeGen/AMDGPU/hsa-globals.ll
index 2820b30..2ec57a40 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -9,7 +9,7 @@
 @internal_readonly = internal unnamed_addr addrspace(2) constant i32 0
 @external_readonly = unnamed_addr addrspace(2) constant i32 0
 
-define void @test() {
+define amdgpu_kernel void @test() {
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll b/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll
index 1999dc3..6007938 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll
@@ -3,7 +3,7 @@
 @internal_group = internal addrspace(3) global i32 undef
 @external_group = addrspace(3) global i32 undef
 
-define void @test() {
+define amdgpu_kernel void @test() {
 entry:
   store i32 0, i32 addrspace(3)* @internal_group
   store i32 0, i32 addrspace(3)* @external_group
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index e85db65..f6bf0b0 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -5,7 +5,7 @@
 ; SI-LABEL: {{^}}br_implicit_def:
 ; SI: BB#0:
 ; SI-NEXT: s_cbranch_scc1
-define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
+define amdgpu_kernel void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
 bb:
   br i1 undef, label %bb1, label %bb2
 
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
index d491277..b160af8 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -10,7 +10,7 @@
 ; SI: s_and_saveexec_b64
 ; SI: s_xor_b64
 ; SI: s_endpgm
-define void @br_i1_phi(i32 %arg) {
+define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
   %tidig = call i32 @llvm.r600.read.tidig.x() #0
   %cmp = trunc i32 %tidig to i1
diff --git a/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
index c218e19..d501be5 100644
--- a/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
+++ b/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
@@ -2,7 +2,7 @@
 
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
   %1 = load i8, i8 addrspace(1)* %in
   %2 = uitofp i8 %1 to double
   %3 = fptrunc double %2 to float
diff --git a/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
index 60e59a5..12cc440 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
@@ -6,7 +6,7 @@
 ;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;CHECK-NOT: SETNE_INT
 
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32, i32 addrspace(1)* %in
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
index c3dad2d..99c2138 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -8,7 +8,7 @@
 ; GCN-LABEL: {{^}}i16_eq:
 ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -26,7 +26,7 @@
 ; GCN-LABEL: {{^}}i16_ne:
 ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -44,7 +44,7 @@
 ; GCN-LABEL: {{^}}i16_ugt:
 ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -62,7 +62,7 @@
 ; GCN-LABEL: {{^}}i16_uge:
 ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -80,7 +80,7 @@
 ; GCN-LABEL: {{^}}i16_ult:
 ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -98,7 +98,7 @@
 ; GCN-LABEL: {{^}}i16_ule:
 ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -117,7 +117,7 @@
 ; GCN-LABEL: {{^}}i16_sgt:
 ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -135,7 +135,7 @@
 ; GCN-LABEL: {{^}}i16_sge:
 ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -153,7 +153,7 @@
 ; GCN-LABEL: {{^}}i16_slt:
 ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -171,7 +171,7 @@
 ; GCN-LABEL: {{^}}i16_sle:
 ; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -190,7 +190,7 @@
 ; GCN-LABEL: {{^}}i16_eq_v_s:
 ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -206,7 +206,7 @@
 ; GCN-LABEL: {{^}}i16_ne_v_s:
 ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -222,7 +222,7 @@
 ; GCN-LABEL: {{^}}i16_ugt_v_s:
 ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -238,7 +238,7 @@
 ; GCN-LABEL: {{^}}i16_uge_v_s:
 ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -254,7 +254,7 @@
 ; GCN-LABEL: {{^}}i16_ult_v_s:
 ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -270,7 +270,7 @@
 ; GCN-LABEL: {{^}}i16_ule_v_s:
 ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -286,7 +286,7 @@
 ; GCN-LABEL: {{^}}i16_sgt_v_s:
 ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -302,7 +302,7 @@
 ; GCN-LABEL: {{^}}i16_sge_v_s:
 ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -318,7 +318,7 @@
 ; GCN-LABEL: {{^}}i16_slt_v_s:
 ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -334,7 +334,7 @@
 ; GCN-LABEL: {{^}}i16_sle_v_s:
 ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/icmp64.ll b/llvm/test/CodeGen/AMDGPU/icmp64.ll
index 33ad0c9..3af7427 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp64.ll
@@ -3,7 +3,7 @@
 
 ; SI-LABEL: {{^}}test_i64_eq:
 ; SI: v_cmp_eq_u64
-define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp eq i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -12,7 +12,7 @@
 
 ; SI-LABEL: {{^}}test_i64_ne:
 ; SI: v_cmp_ne_u64
-define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ne i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -21,7 +21,7 @@
 
 ; SI-LABEL: {{^}}test_i64_slt:
 ; SI: v_cmp_lt_i64
-define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp slt i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -30,7 +30,7 @@
 
 ; SI-LABEL: {{^}}test_i64_ult:
 ; SI: v_cmp_lt_u64
-define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ult i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -39,7 +39,7 @@
 
 ; SI-LABEL: {{^}}test_i64_sle:
 ; SI: v_cmp_le_i64
-define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sle i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -48,7 +48,7 @@
 
 ; SI-LABEL: {{^}}test_i64_ule:
 ; SI: v_cmp_le_u64
-define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ule i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -57,7 +57,7 @@
 
 ; SI-LABEL: {{^}}test_i64_sgt:
 ; SI: v_cmp_gt_i64
-define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sgt i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -66,7 +66,7 @@
 
 ; SI-LABEL: {{^}}test_i64_ugt:
 ; SI: v_cmp_gt_u64
-define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -75,7 +75,7 @@
 
 ; SI-LABEL: {{^}}test_i64_sge:
 ; SI: v_cmp_ge_i64
-define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sge i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -84,7 +84,7 @@
 
 ; SI-LABEL: {{^}}test_i64_uge:
 ; SI: v_cmp_ge_u64
-define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp uge i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/image-attributes.ll b/llvm/test/CodeGen/AMDGPU/image-attributes.ll
index 5906b2f..53d61e6 100644
--- a/llvm/test/CodeGen/AMDGPU/image-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-attributes.ll
@@ -7,7 +7,7 @@
 ; FUNC-LABEL: {{^}}width_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].Z
-define void @width_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @width_2d (%opencl.image2d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
@@ -20,7 +20,7 @@
 ; FUNC-LABEL: {{^}}width_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].Z
-define void @width_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @width_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -37,7 +37,7 @@
 ; FUNC-LABEL: {{^}}height_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].W
-define void @height_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @height_2d (%opencl.image2d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
@@ -50,7 +50,7 @@
 ; FUNC-LABEL: {{^}}height_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].W
-define void @height_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @height_3d (%opencl.image3d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -67,7 +67,7 @@
 ; FUNC-LABEL: {{^}}depth_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].X
-define void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -84,7 +84,7 @@
 ; FUNC-LABEL: {{^}}data_type_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Y
-define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
                            i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
@@ -97,7 +97,7 @@
 ; FUNC-LABEL: {{^}}data_type_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Y
-define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
                                      i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
@@ -114,7 +114,7 @@
 ; FUNC-LABEL: {{^}}channel_order_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Z
-define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
                                i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
@@ -127,7 +127,7 @@
 ; FUNC-LABEL: {{^}}channel_order_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Z
-define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
                                          i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
@@ -146,7 +146,7 @@
 ; FUNC-LABEL: {{^}}image_arg_2nd:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[4].Z
-define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
+define amdgpu_kernel void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
                             i32 %x,
                             %opencl.image2d_t addrspace(1)* %in2,
                             i32 addrspace(1)* %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/image-resource-id.ll b/llvm/test/CodeGen/AMDGPU/image-resource-id.ll
index d4cf349..dac7c7d 100644
--- a/llvm/test/CodeGen/AMDGPU/image-resource-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-resource-id.ll
@@ -7,7 +7,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only
+define amdgpu_kernel void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
@@ -21,7 +21,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only
+define amdgpu_kernel void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
@@ -37,7 +37,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only
+define amdgpu_kernel void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
@@ -51,7 +51,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only
+define amdgpu_kernel void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
@@ -67,7 +67,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -82,7 +82,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only
                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -97,7 +97,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -112,7 +112,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only
                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -129,7 +129,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                             %opencl.image2d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -144,7 +144,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                             %opencl.image2d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -159,7 +159,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                             %opencl.image3d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -174,7 +174,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                             %opencl.image3d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -191,7 +191,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
                             %opencl.image2d_t addrspace(1)* %in3, ; read_only
                             i32 addrspace(1)* %out) {
@@ -208,7 +208,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
                             %opencl.image3d_t addrspace(1)* %in3, ; read_only
                             i32 addrspace(1)* %out) {
@@ -226,7 +226,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                             %opencl.image3d_t addrspace(1)* %in2, ; write_only
                             %opencl.image2d_t addrspace(1)* %in3, ; write_only
                             i32 addrspace(1)* %out) {
@@ -243,7 +243,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                             %opencl.image2d_t addrspace(1)* %in2, ; write_only
                             %opencl.image3d_t addrspace(1)* %in3, ; write_only
                             i32 addrspace(1)* %out) {
@@ -261,7 +261,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                              %opencl.image3d_t addrspace(1)* %in2, ; read_only
                              %opencl.image2d_t addrspace(1)* %in3, ; read_only
                              i32 addrspace(1)* %out) {
@@ -277,7 +277,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                              %opencl.image2d_t addrspace(1)* %in2, ; read_only
                              %opencl.image3d_t addrspace(1)* %in3, ; read_only
                              i32 addrspace(1)* %out) {
@@ -293,7 +293,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                              %opencl.image3d_t addrspace(1)* %in2, ; read_only
                              %opencl.image2d_t addrspace(1)* %in3, ; write_only
                              i32 addrspace(1)* %out) {
@@ -309,7 +309,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                              %opencl.image2d_t addrspace(1)* %in2, ; read_only
                              %opencl.image3d_t addrspace(1)* %in3, ; write_only
                              i32 addrspace(1)* %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index b2c873a..c2668a0 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}i64_imm_inline_lo:
 ; GCN: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
-define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
 entry:
   store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
   ret void
@@ -15,7 +15,7 @@
 ; GCN-LABEL: {{^}}i64_imm_inline_hi:
 ; GCN: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
-define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
 entry:
   store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
   ret void
@@ -25,7 +25,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
   store i64 -9223372036854775808, i64 addrspace(1) *%out
   ret void
 }
@@ -33,7 +33,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
   store i32 -2147483648, i32 addrspace(1)* %out
   ret void
 }
@@ -41,7 +41,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_0.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
   store float 0.0, float addrspace(1)* %out
   ret void
 }
@@ -49,7 +49,7 @@
 ; GCN-LABEL: {{^}}store_imm_neg_0.0_f32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
   store float -0.0, float addrspace(1)* %out
   ret void
 }
@@ -57,7 +57,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_0.5_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
   store float 0.5, float addrspace(1)* %out
   ret void
 }
@@ -65,7 +65,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
   store float -0.5, float addrspace(1)* %out
   ret void
 }
@@ -73,7 +73,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_1.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
   store float 1.0, float addrspace(1)* %out
   ret void
 }
@@ -81,7 +81,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
   store float -1.0, float addrspace(1)* %out
   ret void
 }
@@ -89,7 +89,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_2.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
   store float 2.0, float addrspace(1)* %out
   ret void
 }
@@ -97,7 +97,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
   store float -2.0, float addrspace(1)* %out
   ret void
 }
@@ -105,7 +105,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_4.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
   store float 4.0, float addrspace(1)* %out
   ret void
 }
@@ -113,7 +113,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
   store float -4.0, float addrspace(1)* %out
   ret void
 }
@@ -123,7 +123,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e22f983{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0.15915494{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
   store float 0x3FC45F3060000000, float addrspace(1)* %out
   ret void
 }
@@ -131,7 +131,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f32:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbe22f983{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
   store float 0xBFC45F3060000000, float addrspace(1)* %out
   ret void
 }
@@ -139,7 +139,7 @@
 ; GCN-LABEL: {{^}}store_literal_imm_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
 ; GCN: buffer_store_dword [[REG]]
-define void @store_literal_imm_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) {
   store float 4096.0, float addrspace(1)* %out
   ret void
 }
@@ -148,7 +148,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -158,7 +158,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.5
   store float %y, float addrspace(1)* %out
   ret void
@@ -168,7 +168,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -0.5
   store float %y, float addrspace(1)* %out
   ret void
@@ -178,7 +178,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 1.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -188,7 +188,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -1.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -198,7 +198,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 2.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -208,7 +208,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -2.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -218,7 +218,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 4.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -228,7 +228,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -4.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -238,7 +238,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
 ; GCN: buffer_store_dword [[REG]]
-define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %x = load float, float addrspace(1)* %in
   %y = fadd float %x, 0.5
   store float %y, float addrspace(1)* %out
@@ -249,7 +249,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
 ; GCN: buffer_store_dword [[REG]]
-define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %x = load float, float addrspace(1)* %in
   %y = fadd float %x, 1024.0
   store float %y, float addrspace(1)* %out
@@ -260,7 +260,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36a0000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -270,7 +270,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36b0000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -280,7 +280,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 16
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36e0000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -290,7 +290,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xffffffffe0000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -300,7 +300,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xffffffffc0000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -310,7 +310,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -16
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xfffffffe00000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -320,7 +320,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 63
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36ff800000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -330,7 +330,7 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 64
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x3700000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -342,7 +342,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -353,7 +353,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.5
   store double %y, double addrspace(1)* %out
   ret void
@@ -364,7 +364,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -0.5
   store double %y, double addrspace(1)* %out
   ret void
@@ -375,7 +375,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 1.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -386,7 +386,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -1.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -397,7 +397,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 2.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -408,7 +408,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -2.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -419,7 +419,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 4.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -430,7 +430,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -4.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -445,7 +445,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
 ; VI: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x3fc45f306dc9c882
   store double %y, double addrspace(1)* %out
   ret void
@@ -455,7 +455,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xbfc45f306dc9c882
   store double %y, double addrspace(1)* %out
   ret void
@@ -466,7 +466,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000001
   store double %y, double addrspace(1)* %out
   ret void
@@ -477,7 +477,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000002
   store double %y, double addrspace(1)* %out
   ret void
@@ -488,7 +488,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000010
   store double %y, double addrspace(1)* %out
   ret void
@@ -499,7 +499,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xffffffffffffffff
   store double %y, double addrspace(1)* %out
   ret void
@@ -510,7 +510,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffffe
   store double %y, double addrspace(1)* %out
   ret void
@@ -521,7 +521,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffff0
   store double %y, double addrspace(1)* %out
   ret void
@@ -532,7 +532,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x000000000000003F
   store double %y, double addrspace(1)* %out
   ret void
@@ -543,7 +543,7 @@
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000040
   store double %y, double addrspace(1)* %out
   ret void
@@ -554,7 +554,7 @@
 ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
 ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
   store double 0.0, double addrspace(1)* %out
   ret void
 }
@@ -564,7 +564,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
   store double -0.0, double addrspace(1)* %out
   ret void
 }
@@ -573,7 +573,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
   store double 0.5, double addrspace(1)* %out
   ret void
 }
@@ -582,7 +582,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
   store double -0.5, double addrspace(1)* %out
   ret void
 }
@@ -591,7 +591,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
   store double 1.0, double addrspace(1)* %out
   ret void
 }
@@ -600,7 +600,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
   store double -1.0, double addrspace(1)* %out
   ret void
 }
@@ -609,7 +609,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
   store double 2.0, double addrspace(1)* %out
   ret void
 }
@@ -618,7 +618,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
   store double -2.0, double addrspace(1)* %out
   ret void
 }
@@ -627,7 +627,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
   store double 4.0, double addrspace(1)* %out
   ret void
 }
@@ -636,7 +636,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
   store double -4.0, double addrspace(1)* %out
   ret void
 }
@@ -645,7 +645,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) {
   store double 0x3fc45f306dc9c882, double addrspace(1)* %out
   ret void
 }
@@ -654,7 +654,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
   store double 0xbfc45f306dc9c882, double addrspace(1)* %out
   ret void
 }
@@ -663,7 +663,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_literal_imm_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) {
   store double 4096.0, double addrspace(1)* %out
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index 2e73eb0..e42d587 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -7,7 +7,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
   store volatile i16 -32768, i16 addrspace(1)* %out
   ret void
 }
@@ -15,7 +15,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_0.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
   store half 0.0, half addrspace(1)* %out
   ret void
 }
@@ -24,7 +24,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
   store half -0.0, half addrspace(1)* %out
   ret void
 }
@@ -32,7 +32,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_0.5_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
   store half 0.5, half addrspace(1)* %out
   ret void
 }
@@ -41,7 +41,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
   store half -0.5, half addrspace(1)* %out
   ret void
 }
@@ -49,7 +49,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_1.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
   store half 1.0, half addrspace(1)* %out
   ret void
 }
@@ -58,7 +58,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
   store half -1.0, half addrspace(1)* %out
   ret void
 }
@@ -66,7 +66,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_2.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
   store half 2.0, half addrspace(1)* %out
   ret void
 }
@@ -75,7 +75,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
   store half -2.0, half addrspace(1)* %out
   ret void
 }
@@ -83,7 +83,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_4.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
   store half 4.0, half addrspace(1)* %out
   ret void
 }
@@ -92,7 +92,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
   store half -4.0, half addrspace(1)* %out
   ret void
 }
@@ -101,7 +101,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
   store half 0xH3118, half addrspace(1)* %out
   ret void
 }
@@ -110,7 +110,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
   store half 0xHB118, half addrspace(1)* %out
   ret void
 }
@@ -118,7 +118,7 @@
 ; GCN-LABEL: {{^}}store_literal_imm_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00
 ; GCN: buffer_store_short [[REG]]
-define void @store_literal_imm_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) {
   store half 4096.0, half addrspace(1)* %out
   ret void
 }
@@ -127,7 +127,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -137,7 +137,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0.5
   store half %y, half addrspace(1)* %out
   ret void
@@ -147,7 +147,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -0.5
   store half %y, half addrspace(1)* %out
   ret void
@@ -157,7 +157,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 1.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -167,7 +167,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -1.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -177,7 +177,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 2.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -187,7 +187,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -2.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -197,7 +197,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 4.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -207,7 +207,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -4.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -217,7 +217,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %x = load half, half addrspace(1)* %in
   %y = fadd half %x, 0.5
   store half %y, half addrspace(1)* %out
@@ -228,7 +228,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %x = load half, half addrspace(1)* %in
   %y = fadd half %x, 1024.0
   store half %y, half addrspace(1)* %out
@@ -239,7 +239,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0001
   store half %y, half addrspace(1)* %out
   ret void
@@ -249,7 +249,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0002
   store half %y, half addrspace(1)* %out
   ret void
@@ -259,7 +259,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0010
   store half %y, half addrspace(1)* %out
   ret void
@@ -269,7 +269,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xHFFFF
   store half %y, half addrspace(1)* %out
   ret void
@@ -279,7 +279,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xHFFFE
   store half %y, half addrspace(1)* %out
   ret void
@@ -289,7 +289,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xHFFF0
   store half %y, half addrspace(1)* %out
   ret void
@@ -299,7 +299,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH003F
   store half %y, half addrspace(1)* %out
   ret void
@@ -309,7 +309,7 @@
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0040
   store half %y, half addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index c1ed395..6ba502a 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
   store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
   ret void
 }
@@ -14,7 +14,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -22,7 +22,7 @@
 ; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -30,7 +30,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -38,7 +38,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -46,7 +46,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -54,7 +54,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -62,7 +62,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -70,7 +70,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -78,7 +78,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -86,7 +86,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -94,7 +94,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -102,7 +102,7 @@
 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -110,7 +110,7 @@
 ; GCN-LABEL: {{^}}store_literal_imm_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
 ; GCN: buffer_store_dword [[REG]]
-define void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
   store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -126,7 +126,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0.0, half 0.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -143,7 +143,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0.5, half 0.5>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -160,7 +160,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -0.5, half -0.5>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -177,7 +177,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 1.0, half 1.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -194,7 +194,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -1.0, half -1.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -211,7 +211,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 2.0, half 2.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -228,7 +228,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -2.0, half -2.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -245,7 +245,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 4.0, half 4.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -262,7 +262,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -4.0, half -4.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -280,7 +280,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %x = load <2 x half>, <2 x half> addrspace(1)* %in
   %y = fadd <2 x half> %x, <half 0.5, half 0.5>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
@@ -301,7 +301,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %x = load <2 x half>, <2 x half> addrspace(1)* %in
   %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
@@ -319,7 +319,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -336,7 +336,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -353,7 +353,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -370,7 +370,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xHFFFF, half 0xHFFFF>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -387,7 +387,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xHFFFE, half 0xHFFFE>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -404,7 +404,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xHFFF0, half 0xHFFF0>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -421,7 +421,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
@@ -438,7 +438,7 @@
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]]
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
   store <2 x half> %y, <2 x half> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
index 877956b..8e207a3 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -10,7 +10,7 @@
 ; CHECK: s_mov_b32 m0, [[IN]]
 ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
 ; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
   store <4 x float> %ins, <4 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index f83363d..b18ae35 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -19,7 +19,7 @@
 ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %idx = add i32 %in, 1
   %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %idx
@@ -44,7 +44,7 @@
 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
+define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
 entry:
   %idx = add i32 %in, 1
   %vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4>
@@ -66,7 +66,7 @@
 ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
   store float %elt, float addrspace(1)* %out
@@ -84,7 +84,7 @@
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
@@ -105,7 +105,7 @@
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %or = or <4 x i32> %vec0, %vec1
@@ -137,7 +137,7 @@
 
 ; IDXMODE: s_set_gpr_idx_off
 ; GCN: buffer_store_dword [[RESULT]]
-define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
@@ -147,7 +147,7 @@
 }
 
 ; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
-define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %value = extractelement <4 x i32> %ld, i32 undef
@@ -159,7 +159,7 @@
 ; GCN-DAG: buffer_load_dwordx4
 ; MOVREL-DAG: s_mov_b32 m0,
 ; MOVREL: v_movreld_b32
-define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %ld = load <4 x i32>, <4  x i32> addrspace(1)* %in
   %value = insertelement <4 x i32> %ld, i32 5, i32 undef
@@ -178,7 +178,7 @@
 
 ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
 ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
-define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 %in, 1
   %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
@@ -197,7 +197,7 @@
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
   store <4 x float> %0, <4 x float> addrspace(1)* %out
@@ -213,7 +213,7 @@
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
@@ -233,7 +233,7 @@
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = insertelement <4 x i32> %vec, i32 5, i32 %index
@@ -270,7 +270,7 @@
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword
-define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
@@ -305,7 +305,7 @@
 ; GCN: s_cbranch_execnz
 
 ; IDXMODE: s_set_gpr_idx_off
-define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -16
@@ -375,7 +375,7 @@
 
 ; GCN: buffer_store_dword [[MOVREL0]]
 ; GCN: buffer_store_dword [[MOVREL1]]
-define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
@@ -450,7 +450,7 @@
 ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
 
 ; GCN: buffer_store_dword [[INS0]]
-define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
@@ -499,7 +499,7 @@
 ; GCN: [[ENDBB]]:
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @extract_adjacent_blocks(i32 %arg) #0 {
+define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {
 bb:
   %tmp = icmp eq i32 %arg, 0
   br i1 %tmp, label %bb1, label %bb4
@@ -549,7 +549,7 @@
 ; GCN: [[ENDBB]]:
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
 bb:
   %tmp = icmp eq i32 %arg, 0
   br i1 %tmp, label %bb1, label %bb4
@@ -610,7 +610,7 @@
 ; GCN: ds_write_b32
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
-define void @multi_same_block(i32 %arg) #0 {
+define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
 bb:
   %tmp1 = add i32 %arg, -16
   %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1
@@ -637,7 +637,7 @@
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %offset = add i32 %idx, 3
@@ -658,7 +658,7 @@
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %offset = add i32 %idx, 4
@@ -681,7 +681,7 @@
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %idx.shl = shl i32 %idx.in, 2
@@ -702,7 +702,7 @@
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
@@ -729,7 +729,7 @@
 ; IDXMODE: s_set_gpr_idx_idx
 ; IDXMODE: v_mov_b32_e32
 ; GCN: s_cbranch_execnz [[REGLOOP]]
-define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
 bb:
   br label %bb2
 
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
index a4acfa6..b6dea01 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -20,7 +20,7 @@
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b64
 ; CI-PROMOTE: ds_read_b64
-define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load double, double addrspace(1)* %in, align 8
   %array = alloca [8 x double], align 8
   %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b
@@ -51,7 +51,7 @@
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write2_b64
 ; CI-PROMOTE: ds_read2_b64
-define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
   %array = alloca [4 x <2 x double>], align 16
   %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b
@@ -77,7 +77,7 @@
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b64
 ; CI-PROMOTE: ds_read_b64
-define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %array = alloca [8 x i64], align 8
   %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b
@@ -109,7 +109,7 @@
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write2_b64
 ; CI-PROMOTE: ds_read2_b64
-define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %array = alloca [4 x <2 x i64>], align 16
   %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
index 990f335..7cee8a4 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
@@ -2,7 +2,7 @@
 ; REQUIRES: asserts
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
 
-define void @inf_loop_irreducible_cfg() nounwind {
+define amdgpu_kernel void @inf_loop_irreducible_cfg() nounwind {
 entry:
   br label %block
 
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 3e0b695..7348275 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -7,7 +7,7 @@
 ; SI: buffer_store_dword [[REG]]
 ; SI: s_waitcnt vmcnt(0) expcnt(0)
 ; SI: s_branch BB0_1
-define void @infinite_loop(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
 entry:
   br label %for.body
 
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index db1a0c6..85eb163 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: {{^}}inline_asm:
 ; CHECK: s_endpgm
 ; CHECK: s_endpgm
-define void @inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @inline_asm(i32 addrspace(1)* %out) {
 entry:
   store i32 5, i32 addrspace(1)* %out
   call void asm sideeffect "s_endpgm", ""()
@@ -25,7 +25,7 @@
 ; Make sure inline assembly is treated as divergent.
 ; CHECK: s_mov_b32 s{{[0-9]+}}, 0
 ; CHECK: s_and_saveexec_b64
-define void @branch_on_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
 	%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
 	%cmp = icmp eq i32 %zero, 0
 	br i1 %cmp, label %if, label %endif
@@ -44,7 +44,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
   %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in)
   store i64 %sgpr, i64 addrspace(1)* %out
   ret void
@@ -52,7 +52,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm:
 ; CHECK: codeLenInByte = 12
-define void @code_size_inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "v_nop_e64", ""()
   ret void
@@ -61,7 +61,7 @@
 ; All inlineasm instructions are assumed to be the maximum size
 ; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
 ; CHECK: codeLenInByte = 12
-define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "v_nop_e32", ""()
   ret void
@@ -69,7 +69,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "
     v_nop_e64
@@ -80,7 +80,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "
     v_nop_e64
@@ -92,7 +92,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "", ""()
   ret void
@@ -100,7 +100,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment", ""()
   ret void
@@ -108,7 +108,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "
 ; comment", ""()
@@ -117,7 +117,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment
 ", ""()
@@ -126,7 +126,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; first comment ; second comment", ""()
   ret void
@@ -134,7 +134,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; first comment;second comment", ""()
   ret void
@@ -142,7 +142,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment
     v_nop_e64 ; inline comment
@@ -157,7 +157,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "v_nop_e64 ; inline comment
 ; separate comment
@@ -171,7 +171,7 @@
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment
     v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
diff --git a/llvm/test/CodeGen/AMDGPU/inline-calls.ll b/llvm/test/CodeGen/AMDGPU/inline-calls.ll
index 4541a90..f8821f3 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-calls.ll
@@ -11,7 +11,7 @@
 
 ; CHECK: {{^}}kernel:
 ; CHECK-NOT: call
-define void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @func(i32 1)
   store i32 %tmp0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; CHECK: {{^}}kernel2:
 ; CHECK-NOT: call
-define void @kernel2(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel2(i32 addrspace(1)* %out) {
 entry:
   call void @kernel(i32 addrspace(1)* %out)
   ret void
@@ -31,7 +31,7 @@
 
 ; CHECK: {{^}}kernel3:
 ; CHECK-NOT: call
-define void @kernel3(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel3(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @func_alias(i32 1)
   store i32 %tmp0, i32 addrspace(1)* %out
@@ -43,7 +43,7 @@
 
 ; CHECK: {{^}}kernel4:
 ; CHECK-NOT: call
-define void @kernel4(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel4(i32 addrspace(1)* %out) {
 entry:
   call void @kernel_alias(i32 addrspace(1)* %out)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index 1bcbd14..941a1b9 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -10,7 +10,7 @@
 ; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
 ; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
 
-define void @inline_reg_constraints(i32 addrspace(1)* %ptr) {
+define amdgpu_kernel void @inline_reg_constraints(i32 addrspace(1)* %ptr) {
 entry:
   %v32 = tail call i32 asm sideeffect "flat_load_dword   $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
   %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
@@ -27,7 +27,7 @@
 ; GCN: s_mov_b32 m0, -1
 ; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; GCN: ; use [[COPY_M0]]
-define void @inline_sreg_constraint_m0() {
+define amdgpu_kernel void @inline_sreg_constraint_m0() {
   %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
   tail call void asm sideeffect "; use $0", "s"(i32 %m0)
   ret void
@@ -36,7 +36,7 @@
 ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32:
 ; GCN: s_mov_b32 [[REG:s[0-9]+]], 32
 ; GCN: ; use [[REG]]
-define void @inline_sreg_constraint_imm_i32() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_i32() {
   tail call void asm sideeffect "; use $0", "s"(i32 32)
   ret void
 }
@@ -44,7 +44,7 @@
 ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32:
 ; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0
 ; GCN: ; use [[REG]]
-define void @inline_sreg_constraint_imm_f32() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {
   tail call void asm sideeffect "; use $0", "s"(float 1.0)
   ret void
 }
@@ -54,7 +54,7 @@
 ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
 ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
 ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
-define void @inline_sreg_constraint_imm_i64() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {
   tail call void asm sideeffect "; use $0", "s"(i64 -4)
   ret void
 }
@@ -63,7 +63,7 @@
 ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
 ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
-define void @inline_sreg_constraint_imm_f64() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
   tail call void asm sideeffect "; use $0", "s"(double 1.0)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll
index 75f3158..15e57fe 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}s_input_output_i16:
 ; SICI: error: couldn't allocate output register for constraint 's'
 ; SICI: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i16() #0 {
+define amdgpu_kernel void @s_input_output_i16() #0 {
   %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i16 %v) #0
   ret void
@@ -14,7 +14,7 @@
 ; GCN-LABEL: {{^}}v_input_output_i16:
 ; SICI: error: couldn't allocate output register for constraint 'v'
 ; SICI: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_i16() #0 {
+define amdgpu_kernel void @v_input_output_i16() #0 {
   %v = tail call i16 asm sideeffect "v_mov_b32 $0, -1", "=v"() #0
   tail call void asm sideeffect "; use $0", "v"(i16 %v)
   ret void
@@ -23,7 +23,7 @@
 ; GCN-LABEL: {{^}}s_input_output_f16:
 ; SICI: error: couldn't allocate output register for constraint 's'
 ; SICI: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_f16() #0 {
+define amdgpu_kernel void @s_input_output_f16() #0 {
   %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() #0
   tail call void asm sideeffect "; use $0", "s"(half %v)
   ret void
@@ -32,7 +32,7 @@
 ; GCN-LABEL: {{^}}v_input_output_f16:
 ; SICI: error: couldn't allocate output register for constraint 'v'
 ; SICI: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_f16() #0 {
+define amdgpu_kernel void @v_input_output_f16() #0 {
   %v = tail call half asm sideeffect "v_mov_b32 $0, -1", "=v"() #0
   tail call void asm sideeffect "; use $0", "v"(half %v)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
index 2eb21f0..c1d67ba 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
@@ -3,7 +3,7 @@
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i8() {
+define amdgpu_kernel void @s_input_output_i8() {
   %v = tail call i8 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i8 %v)
   ret void
@@ -11,7 +11,7 @@
 
 ; GCN: error: couldn't allocate output register for constraint 'v'
 ; GCN: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_i8() {
+define amdgpu_kernel void @v_input_output_i8() {
   %v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"()
   tail call void asm sideeffect "; use $0", "v"(i8 %v)
   ret void
@@ -19,7 +19,7 @@
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i128() {
+define amdgpu_kernel void @s_input_output_i128() {
   %v = tail call i128 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i128 %v)
   ret void
@@ -27,7 +27,7 @@
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v8f16() {
+define amdgpu_kernel void @s_input_output_v8f16() {
   %v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(<8 x half> %v)
   ret void
@@ -36,7 +36,7 @@
 ; CI: error: couldn't allocate output register for constraint 's'
 ; CI: error: couldn't allocate input reg for constraint 's'
 ; VI-NOT: error
-define void @s_input_output_f16() {
+define amdgpu_kernel void @s_input_output_f16() {
   %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(half %v)
   ret void
@@ -44,7 +44,7 @@
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v2f16() {
+define amdgpu_kernel void @s_input_output_v2f16() {
   %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(<2 x half> %v)
   ret void
@@ -52,7 +52,7 @@
 
 ; GCN: error: couldn't allocate output register for constraint 'v'
 ; GCN: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_v2f16() {
+define amdgpu_kernel void @v_input_output_v2f16() {
   %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"()
   tail call void asm sideeffect "; use $0", "v"(<2 x half> %v)
   ret void
@@ -61,7 +61,7 @@
 ; CI: error: couldn't allocate output register for constraint 's'
 ; CI: error: couldn't allocate input reg for constraint 's'
 ; VI-NOT: error
-define void @s_input_output_i16() {
+define amdgpu_kernel void @s_input_output_i16() {
   %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i16 %v)
   ret void
@@ -69,14 +69,14 @@
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v2i16() {
+define amdgpu_kernel void @s_input_output_v2i16() {
   %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v)
   ret void
 }
 
 ; FIXME: Crash in codegen prepare
-; define void @s_input_output_i3() {
+; define amdgpu_kernel void @s_input_output_i3() {
 ;   %v = tail call i3 asm sideeffect "s_mov_b32 $0, -1", "=s"()
 ;   tail call void asm sideeffect "; use $0", "s"(i3 %v)
 ;   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
index 23a0ed3..3c6c7e1 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}inline_asm_input_v2i16:
 ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
-define void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 {
+define amdgpu_kernel void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 {
 entry:
   %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0
   store i32 %val, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; GCN-LABEL: {{^}}inline_asm_input_v2f16:
 ; GCN: s_mov_b32 s0, s{{[0-9]+}}
-define void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 {
 entry:
   %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0
   store i32 %val, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; GCN-LABEL: {{^}}inline_asm_output_v2i16:
 ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
-define void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 {
 entry:
   %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0
   store <2 x i16> %val, <2 x i16> addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; GCN-LABEL: {{^}}inline_asm_output_v2f16:
 ; GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
-define void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
 entry:
   %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0
   store <2 x half> %val, <2 x half> addrspace(1)* %out
@@ -38,7 +38,7 @@
 
 ; GCN-LABEL: {{^}}inline_asm_packed_v2i16:
 ; GCN: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 {
+define amdgpu_kernel void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 {
 entry:
   %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0
   store <2 x i16> %val, <2 x i16> addrspace(1)* %out
@@ -47,7 +47,7 @@
 
 ; GCN-LABEL: {{^}}inline_asm_packed_v2f16:
 ; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 {
+define amdgpu_kernel void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 {
 entry:
   %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0
   store <2 x half> %val, <2 x half> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
index 4a5e886..e895f27 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
@@ -6,7 +6,7 @@
 
 ; Make sure this doesn't crash
 ; CHECK-LABEL: test:
-define void @test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i64 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca [16 x i32]
   %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 97ef5ce..76aad64 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -18,56 +18,56 @@
 ; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
-define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4f32_1:
-define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4f32_2:
-define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4f32_3:
-define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4i32_0:
-define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 999, i32 0
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v3f32_1:
-define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v3f32_2:
-define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v3f32_3:
-define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
@@ -86,7 +86,7 @@
 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
-define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
   store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -97,7 +97,7 @@
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 ; GCN-DAG: buffer_store_dword v
-define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
@@ -107,7 +107,7 @@
 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
 ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
-define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -117,7 +117,7 @@
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
   store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -129,7 +129,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
   store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -138,7 +138,7 @@
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
 ; GCN: v_movreld_b32
 ; GCN: buffer_store_dwordx2
-define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -148,7 +148,7 @@
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 ; GCN-DAG: buffer_store_dword v
-define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
   store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
   ret void
@@ -159,7 +159,7 @@
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -169,7 +169,7 @@
 ; GCN: v_movreld_b32
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
   store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
   ret void
@@ -181,21 +181,21 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
   store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
-define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
-define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
   store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
   ret void
@@ -222,7 +222,7 @@
 ; GCN: buffer_load_dwordx2
 
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
-define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
   store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
   ret void
@@ -242,7 +242,7 @@
 ; GCN-TONGA: buffer_load_ushort
 
 ; GCN: buffer_store_short v{{[0-9]+}}, off
-define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
   store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
   ret void
@@ -265,7 +265,7 @@
 
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
-define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
   store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
   ret void
@@ -291,21 +291,21 @@
 ; GCN-TONGA: buffer_load_dword
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, off
-define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
-define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
   store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
-define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
   store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
   ret void
@@ -314,7 +314,7 @@
 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
 ; the compiler doesn't crash.
 ; GCN-LABEL: {{^}}insert_split_bb:
-define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
 entry:
   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
   %1 = icmp eq i32 %a, 0
@@ -361,7 +361,7 @@
 
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
   store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -374,14 +374,14 @@
 
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
   store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
-define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
   store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
   ret void
@@ -411,7 +411,7 @@
 ; GCN: s_endpgm
 ; GCN: ScratchSize: 64
 
-define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
   store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -438,7 +438,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
 ; GCN: ScratchSize: 128
-define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index b794584..a3f82b8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -10,7 +10,7 @@
 
 ; GFX9-NOT: lshr
 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]]
-define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -28,7 +28,7 @@
 ; GFX9-NOT: [[ELT0]]
 ; GFX9-NOT: [[VEC]]
 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
-define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -48,7 +48,7 @@
 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
 ; GFX9-DAG: ; use [[ELT1]]
-define void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt1 = extractelement <2 x i16> %vec, i32 1
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
@@ -68,7 +68,7 @@
 ; GFX9-NOT: [[ELT0]]
 ; GFX9-NOT: [[VEC]]
 ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
-define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
@@ -87,7 +87,7 @@
 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]]
 ; GFX9: ; use [[ELT1]]
-define void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
@@ -112,7 +112,7 @@
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
 ; GFX9: ; use [[ELT_HI]]
 ; GFX9: ; use [[VEC_HI]]
-define void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
@@ -136,7 +136,7 @@
 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
 
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7
-define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -152,7 +152,7 @@
 
 ; GCN-NOT: lshr
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
-define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -166,7 +166,7 @@
 
 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]]
-define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
   store <2 x half> %vecins, <2 x half> addrspace(1)* %out
@@ -181,7 +181,7 @@
 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
 
 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500
-define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
   store <2 x half> %vecins, <2 x half> addrspace(1)* %out
@@ -197,7 +197,7 @@
 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -221,7 +221,7 @@
 ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -244,7 +244,7 @@
 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -266,7 +266,7 @@
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -283,7 +283,7 @@
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -305,7 +305,7 @@
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -325,7 +325,7 @@
 ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -345,7 +345,7 @@
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -362,7 +362,7 @@
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -383,7 +383,7 @@
 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
   %idx = load volatile i32, i32 addrspace(2)* %idx.ptr
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
@@ -399,7 +399,7 @@
 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -424,7 +424,7 @@
 
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -451,7 +451,7 @@
 
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
index e97cb1b..1479303 100644
--- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -4,17 +4,17 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec  %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9
 
 --- |
-  define void @div_fmas() { ret void }
-  define void @s_getreg() { ret void }
-  define void @s_setreg() { ret void }
-  define void @vmem_gt_8dw_store() { ret void }
-  define void @readwrite_lane() { ret void }
-  define void @rfe() { ret void }
-  define void @s_mov_fed_b32() { ret void }
-  define void @s_movrel() { ret void }
-  define void @v_interp() { ret void }
+  define amdgpu_kernel void @div_fmas() { ret void }
+  define amdgpu_kernel void @s_getreg() { ret void }
+  define amdgpu_kernel void @s_setreg() { ret void }
+  define amdgpu_kernel void @vmem_gt_8dw_store() { ret void }
+  define amdgpu_kernel void @readwrite_lane() { ret void }
+  define amdgpu_kernel void @rfe() { ret void }
+  define amdgpu_kernel void @s_mov_fed_b32() { ret void }
+  define amdgpu_kernel void @s_movrel() { ret void }
+  define amdgpu_kernel void @v_interp() { ret void }
 
-  define void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) {
+  define amdgpu_kernel void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) {
   entry:
     %A.addr = alloca i32 addrspace(1)*, align 4
     store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/internalize.ll b/llvm/test/CodeGen/AMDGPU/internalize.ll
index 5a66699..968b1d3 100644
--- a/llvm/test/CodeGen/AMDGPU/internalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/internalize.ll
@@ -8,14 +8,14 @@
 @gvar_used = addrspace(1) global i32 undef, align 4
 
 ; Function Attrs: alwaysinline nounwind
-define void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
+define amdgpu_kernel void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
 entry:
   store i32 1, i32 addrspace(1)* %out
   ret void
 }
 
 ; Function Attrs: alwaysinline nounwind
-define void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
+define amdgpu_kernel void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
 entry:
   store i32 %tid, i32 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index c29434f..31f2fbc 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %stof
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 45a0610..5cd965d 100644
--- a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -10,7 +10,7 @@
 ; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
 ; GCN: buffer_store_dword [[K]], [[PTR]]
-define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
   %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
   %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
   store i16 123, i16 addrspace(1)* %ptr, align 4
@@ -22,7 +22,7 @@
 ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
 ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]:
-define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
   %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
   %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
   store i16 123, i16 addrspace(1)* %ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
index 66182d0..bc1dafe 100644
--- a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
@@ -1,7 +1,7 @@
 # RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
 --- |
 
-  define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
 
diff --git a/llvm/test/CodeGen/AMDGPU/kcache-fold.ll b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll
index f9a0d0e..37dd977 100644
--- a/llvm/test/CodeGen/AMDGPU/kcache-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: {{^}}main1:
 ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
-define void @main1() #0 {
+define amdgpu_kernel void @main1() #0 {
 main_body:
   %tmp = load <4 x float>, <4 x float> addrspace(8)* null
   %tmp7 = extractelement <4 x float> %tmp, i32 0
@@ -54,7 +54,7 @@
 
 ; CHECK: {{^}}main2:
 ; CHECK-NOT: MOV
-define void @main2() #0 {
+define amdgpu_kernel void @main2() #0 {
 main_body:
   %tmp = load <4 x float>, <4 x float> addrspace(8)* null
   %tmp7 = extractelement <4 x float> %tmp, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
index aaa63f7..8e358ef 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -5,7 +5,7 @@
 
 ; CHECK-LABEL: {{^}}no_args:
 ; CHECK: ScratchSize: 5{{$}}
-define void @no_args() {
+define amdgpu_kernel void @no_args() {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
@@ -13,7 +13,7 @@
 
 ; CHECK-LABEL: {{^}}force_align32:
 ; CHECK: ScratchSize: 5{{$}}
-define void @force_align32(<8 x i32>) {
+define amdgpu_kernel void @force_align32(<8 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
@@ -21,7 +21,7 @@
 
 ; CHECK-LABEL: {{^}}force_align64:
 ; CHECK: ScratchSize: 5{{$}}
-define void @force_align64(<16 x i32>) {
+define amdgpu_kernel void @force_align64(<16 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
@@ -29,7 +29,7 @@
 
 ; CHECK-LABEL: {{^}}force_align128:
 ; CHECK: ScratchSize: 5{{$}}
-define void @force_align128(<32 x i32>) {
+define amdgpu_kernel void @force_align128(<32 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
@@ -37,7 +37,7 @@
 
 ; CHECK-LABEL: {{^}}force_align256:
 ; CHECK: ScratchSize: 5{{$}}
-define void @force_align256(<64 x i32>) {
+define amdgpu_kernel void @force_align256(<64 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 95a6831..6fa26cb 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -17,7 +17,7 @@
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
   %0 = zext i8 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -36,7 +36,7 @@
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
   %0 = zext i8 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -55,7 +55,7 @@
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
 entry:
   %0 = sext i8 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -75,7 +75,7 @@
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
   %0 = zext i16 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -94,7 +94,7 @@
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
   %0 = zext i16 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -113,7 +113,7 @@
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
 entry:
   %0 = sext i16 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -126,7 +126,7 @@
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
-define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
 entry:
   store i32 %in, i32 addrspace(1)* %out, align 4
   ret void
@@ -138,7 +138,7 @@
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
-define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
 entry:
   store float %in, float addrspace(1)* %out, align 4
   ret void
@@ -152,7 +152,7 @@
 ; MESA-GCN: buffer_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
   ret void
@@ -166,7 +166,7 @@
 ; MESA-GCN: buffer_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
   ret void
@@ -179,7 +179,7 @@
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
   ret void
@@ -192,7 +192,7 @@
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
-define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
 entry:
   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
   ret void
@@ -209,7 +209,7 @@
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
   ret void
@@ -226,7 +226,7 @@
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
   ret void
@@ -239,7 +239,7 @@
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
 entry:
   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
   ret void
@@ -253,7 +253,7 @@
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
 entry:
   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
   ret void
@@ -273,7 +273,7 @@
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
   ret void
@@ -293,7 +293,7 @@
 ; HSA-GCN: flat_load_ushort
 ; HSA-GCN: flat_load_ushort
 ; HSA-GCN: flat_load_ushort
-define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
   ret void
@@ -308,7 +308,7 @@
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -323,7 +323,7 @@
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
   ret void
@@ -354,7 +354,7 @@
 ; HSA-GCN: float_load_ubyte
 ; HSA-GCN: float_load_ubyte
 ; HSA-GCN: float_load_ubyte
-define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
   ret void
@@ -386,7 +386,7 @@
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
   ret void
@@ -405,7 +405,7 @@
 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
-define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
   ret void
@@ -422,7 +422,7 @@
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
-define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
   ret void
@@ -478,7 +478,7 @@
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
   ret void
@@ -534,7 +534,7 @@
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
   ret void
@@ -561,7 +561,7 @@
 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
   ret void
@@ -588,7 +588,7 @@
 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
   ret void
@@ -599,7 +599,7 @@
 ; MESA-GCN: s_load_dwordx2
 ; MESA-GCN: buffer_store_dwordx2
 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -611,7 +611,7 @@
 ; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
 ; MESA-GCN: buffer_store_dwordx2
 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
+define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
   store double %in, double addrspace(1)* %out
   ret void
@@ -621,7 +621,7 @@
 ; XGCN: s_load_dwordx2
 ; XGCN: s_load_dwordx2
 ; XGCN: buffer_store_dwordx2
-; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
 ;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
 ;   ret void
 ; }
@@ -631,7 +631,7 @@
 ; SI: v_and_b32_e32
 ; SI: buffer_store_byte
 ; SI: s_endpgm
-define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   store i1 %x, i1 addrspace(1)* %out, align 1
   ret void
 }
@@ -640,7 +640,7 @@
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -650,7 +650,7 @@
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
@@ -660,7 +660,7 @@
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i32
   store i32 %ext, i32addrspace(1)* %out, align 4
   ret void
@@ -672,7 +672,7 @@
 ; SI: v_ashrrev_i32
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
index d2b5ad4..4af37d8 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -48,7 +48,7 @@
 
 ; Scratch size = alloca size + emergency stack slot
 ; ALL: ; ScratchSize: 32772
-define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
   %large = alloca [8192 x i32], align 4
   %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
   store volatile i32 %x, i32* %gep
diff --git a/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
index 9975b1b..c46d68e 100644
--- a/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
@@ -4,7 +4,7 @@
 
 @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
 
-define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
+define amdgpu_kernel void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
   %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4
   %mul12 = mul nsw i32 %val, 7
   br i1 undef, label %exit, label %bb
diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
index 93b3373..13dd705 100644
--- a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -4,7 +4,7 @@
 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
 
-define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -26,7 +26,7 @@
 
 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
 
-define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -48,7 +48,7 @@
 
 ; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
 
-define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -71,7 +71,7 @@
 ; ALL-LABEL: @occupancy_0(
 ; CI-NOT: alloca [5 x i32]
 ; SI: alloca [5 x i32]
-define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -94,7 +94,7 @@
 ; ALL-LABEL: @occupancy_max(
 ; CI-NOT: alloca [5 x i32]
 ; SI: alloca [5 x i32]
-define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -118,7 +118,7 @@
 ; CI-LABEL: @occupancy_6(
 ; SI: alloca
 ; CI-NOT: alloca
-define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [42 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -142,7 +142,7 @@
 
 ; ALL-LABEL: @occupancy_6_over(
 ; ALL: alloca [43 x i8]
-define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [43 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -168,7 +168,7 @@
 ; CI-LABEL: @occupancy_8(
 ; SI: alloca
 ; CI-NOT: alloca
-define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [32 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -192,7 +192,7 @@
 
 ; ALL-LABEL: @occupancy_8_over(
 ; ALL: alloca [33 x i8]
-define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [33 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -218,7 +218,7 @@
 ; CI-LABEL: @occupancy_9(
 ; SI: alloca
 ; CI-NOT: alloca
-define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 entry:
   %stack = alloca [28 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -242,7 +242,7 @@
 
 ; ALL-LABEL: @occupancy_9_over(
 ; ALL: alloca [29 x i8]
-define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 entry:
   %stack = alloca [29 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
index 9933458..c23dea2 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -15,7 +15,7 @@
 
 ; HSA-LABEL: {{^}}test_no_round_size_1:
 ; HSA: workgroup_group_segment_byte_size = 38
-define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
@@ -34,7 +34,7 @@
 ; HSA-LABEL: {{^}}test_round_size_2:
 ; HSA: workgroup_group_segment_byte_size = 86
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
@@ -50,7 +50,7 @@
 ; HSA-LABEL: {{^}}test_round_size_2_align_8:
 ; HSA: workgroup_group_segment_byte_size = 86
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -65,7 +65,7 @@
 ; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
 ; HSA: workgroup_group_segment_byte_size = 38
 ; HSA: group_segment_alignment = 4
-define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
 
@@ -78,7 +78,7 @@
 ; HSA-LABEL: {{^}}test_round_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
-define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
   ret void
@@ -88,7 +88,7 @@
 ; HSA-LABEL: {{^}}test_high_align_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
-define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
+define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
   ret void
@@ -98,7 +98,7 @@
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
 ; HSA: workgroup_group_segment_byte_size = 212
 ; HSA: group_segment_alignment = 4
-define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
@@ -114,7 +114,7 @@
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
 ; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
-define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
@@ -142,7 +142,7 @@
 ; HSA-LABEL: {{^}}test_round_size_3_order0:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
@@ -163,7 +163,7 @@
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
@@ -184,7 +184,7 @@
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
 ; HSA: workgroup_group_segment_byte_size = 150
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -205,7 +205,7 @@
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
 ; HSA: workgroup_group_segment_byte_size = 118
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -226,7 +226,7 @@
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
 ; HSA: workgroup_group_segment_byte_size = 142
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
@@ -247,7 +247,7 @@
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
 ; HSA: workgroup_group_segment_byte_size = 126
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll
index 9875814..254673d 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll
@@ -5,7 +5,7 @@
 
 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
 
-define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
+define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
  %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
   %ld = load i32, i32 addrspace(3)* %gep
   store i32 %ld, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
index 078d633..1b3eeed 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -18,7 +18,7 @@
 
 ; GCN: BB0_3:
 ; GCN-NEXT: s_endpgm
-define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
+define amdgpu_kernel void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
index 6ff6fc3..fff2a92 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
@@ -10,7 +10,7 @@
 ; reads and writes are bundled together in the same instruction.
 
 ; CHECK: {{^}}lds_crash:
-define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = load i32, i32 addrspace(3)* %in
   ; This block needs to be > 115 ISA instructions to hit the bug,
diff --git a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
index abe472e4..8b7e9e6 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -10,7 +10,7 @@
 
 @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
 
-define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
 entry:
   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
   %1 = load i32, i32 addrspace(3)* %0
@@ -88,7 +88,7 @@
 ; CHECK: LDS_READ_RET
 ; CHECK-NOT: ALU clause
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
   %1 = load i32, i32 addrspace(3)* %0
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size.ll b/llvm/test/CodeGen/AMDGPU/lds-size.ll
index 1607713..c65817a 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-size.ll
@@ -14,7 +14,7 @@
 ; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only)
 @lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
-define void @test(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index cb5d73f..53c1c72 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -5,7 +5,7 @@
 
 @lds = addrspace(3) global [256 x i32] zeroinitializer
 
-define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
+define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
  %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
   %ld = load i32, i32 addrspace(3)* %gep
   store i32 %ld, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
index 4244c48..e85a1b6 100644
--- a/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
@@ -11,7 +11,7 @@
 ; CHECK: {{^}}setcc_expand:
 ; CHECK: SET
 ; CHECK-NOT: CND
-define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp eq i32 %in, 5
   br i1 %0, label %IF, label %ENDIF
diff --git a/llvm/test/CodeGen/AMDGPU/literals.ll b/llvm/test/CodeGen/AMDGPU/literals.ll
index 82fbb7f..1c546ba 100644
--- a/llvm/test/CodeGen/AMDGPU/literals.ll
+++ b/llvm/test/CodeGen/AMDGPU/literals.ll
@@ -10,7 +10,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 5
-define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 5, %in
   store i32 %0, i32 addrspace(1)* %out
@@ -27,7 +27,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 1084227584(5.0
-define void @float_literal(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @float_literal(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fadd float 5.0, %in
   store float %0, float addrspace(1)* %out
@@ -41,7 +41,7 @@
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0
 
-define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out
   ret void
@@ -52,7 +52,7 @@
 ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
 ; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
-define void @inline_literal_dot4(float addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_dot4(float addrspace(1)* %out) {
 entry:
   %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store float %0, float addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/liveness.mir b/llvm/test/CodeGen/AMDGPU/liveness.mir
index 112c3f8..48762e3 100644
--- a/llvm/test/CodeGen/AMDGPU/liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/liveness.mir
@@ -8,7 +8,7 @@
 # Should see three distinct value numbers:
 # CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}}
 --- |
-  define void @test0() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
 ...
 ---
 name: test0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
index 77dd4b1..64f0d65 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
@@ -8,7 +8,7 @@
 ; SI: v_bfe_i32
 ; EG: BFE_INT
 ; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac
-define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -17,7 +17,7 @@
 ; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm:
 ; SI: v_bfe_i32
 ; EG: BFE_INT
-define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -26,7 +26,7 @@
 ; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg:
 ; SI: v_bfe_i32
 ; EG: BFE_INT
-define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@
 ; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg:
 ; SI: v_bfe_i32
 ; EG: BFE_INT
-define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -43,7 +43,7 @@
 
 ; FUNC-LABEL: {{^}}v_bfe_print_arg:
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
-define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
+define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
   %load = load i32, i32 addrspace(1)* %src0, align 4
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
@@ -54,7 +54,7 @@
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -64,7 +64,7 @@
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -74,7 +74,7 @@
 ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI: s_endpgm
-define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31)
@@ -88,7 +88,7 @@
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
-define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31)
@@ -100,7 +100,7 @@
 ; SI: buffer_load_dword
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
 ; SI: s_endpgm
-define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
@@ -113,7 +113,7 @@
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -125,7 +125,7 @@
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -137,7 +137,7 @@
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -149,7 +149,7 @@
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -160,7 +160,7 @@
 ; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
@@ -171,7 +171,7 @@
 ; SI-NOT: lshr
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
@@ -184,7 +184,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -196,7 +196,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -208,7 +208,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -220,7 +220,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -232,7 +232,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -244,7 +244,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -256,7 +256,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -268,7 +268,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -280,7 +280,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -292,7 +292,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -304,7 +304,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -316,7 +316,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -328,7 +328,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -340,7 +340,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -352,7 +352,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -364,7 +364,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -376,7 +376,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -388,7 +388,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -400,7 +400,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -412,7 +412,7 @@
 ; SI-NOT: v_ashr
 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
 ; SI: buffer_store_dword [[BFE]],
-define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
   %shl = shl i32 %bfe, 8
@@ -428,7 +428,7 @@
 ; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
 ; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
 ; SI: buffer_store_dword [[TMP2]]
-define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %src = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone
   %div = sdiv i32 %bfe, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
index ee47b14..8cf1e16 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
@@ -8,7 +8,7 @@
 ; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg:
 ; SI: v_bfe_u32
 ; EG: BFE_UINT
-define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -17,7 +17,7 @@
 ; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm:
 ; SI: v_bfe_u32
 ; EG: BFE_UINT
-define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -26,7 +26,7 @@
 ; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg:
 ; SI: v_bfe_u32
 ; EG: BFE_UINT
-define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@
 ; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg:
 ; SI: v_bfe_u32
 ; EG: BFE_UINT
-define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -45,7 +45,7 @@
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -55,7 +55,7 @@
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -65,7 +65,7 @@
 ; SI: buffer_load_ubyte
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %load = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %load to i32
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
@@ -82,7 +82,7 @@
 ; VI-NEXT: v_and_b32_e32
 ; SI-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -97,7 +97,7 @@
 ; SI-NEXT: v_and_b32_e32
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 65535
@@ -111,7 +111,7 @@
 ; SI: v_add_i32
 ; SI: bfe
 ; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -126,7 +126,7 @@
 ; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
 ; SI-NEXT: bfe
 ; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -141,7 +141,7 @@
 ; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
 ; SI-NEXT: bfe
 ; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -155,7 +155,7 @@
 ; SI: v_add_i32
 ; SI-NEXT: bfe
 ; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 65535
@@ -169,14 +169,14 @@
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
 ; SI: s_endpgm
 ; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
-define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8)
@@ -184,7 +184,7 @@
   ret void
 }
 
-define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1)
@@ -199,7 +199,7 @@
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
-define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = lshr i32 %shl, 31
@@ -214,7 +214,7 @@
 ; SI-NOT: shr
 ; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
 ; SI: s_endpgm
-define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = ashr i32 %shl, 31
@@ -227,7 +227,7 @@
 ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI: s_endpgm
-define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31)
@@ -239,7 +239,7 @@
 ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31)
@@ -252,7 +252,7 @@
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
@@ -265,7 +265,7 @@
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -277,7 +277,7 @@
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -289,7 +289,7 @@
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -301,7 +301,7 @@
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -312,7 +312,7 @@
 ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
@@ -323,7 +323,7 @@
 ; SI-NOT: lshr
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
-define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
@@ -336,7 +336,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -348,7 +348,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -360,7 +360,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -372,7 +372,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -384,7 +384,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -396,7 +396,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -408,7 +408,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -420,7 +420,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -432,7 +432,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -444,7 +444,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -456,7 +456,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -468,7 +468,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -480,7 +480,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -492,7 +492,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -504,7 +504,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -516,7 +516,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -528,7 +528,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -540,7 +540,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -552,7 +552,7 @@
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -569,7 +569,7 @@
 ; SI-DAG: buffer_store_dword [[AND]]
 ; SI-DAG: buffer_store_dword [[BFE]]
 ; SI: s_endpgm
-define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
                                             i32 addrspace(1)* %out1,
                                             i32 addrspace(1)* %in) nounwind {
   %src = load i32, i32 addrspace(1)* %in, align 4
@@ -583,7 +583,7 @@
 ; FUNC-LABEL: {{^}}lshr_and:
 ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
 ; SI: buffer_store_dword
-define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
   %b = lshr i32 %a, 6
   %c = and i32 %b, 7
   store i32 %c, i32 addrspace(1)* %out, align 8
@@ -593,7 +593,7 @@
 ; FUNC-LABEL: {{^}}v_lshr_and:
 ; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
 ; SI: buffer_store_dword
-define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %c = lshr i32 %a, %b
   %d = and i32 %c, 7
   store i32 %d, i32 addrspace(1)* %out, align 8
@@ -603,7 +603,7 @@
 ; FUNC-LABEL: {{^}}and_lshr:
 ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
 ; SI: buffer_store_dword
-define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
   %b = and i32 %a, 448
   %c = lshr i32 %b, 6
   store i32 %c, i32 addrspace(1)* %out, align 8
@@ -613,7 +613,7 @@
 ; FUNC-LABEL: {{^}}and_lshr2:
 ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
 ; SI: buffer_store_dword
-define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind {
   %b = and i32 %a, 511
   %c = lshr i32 %b, 6
   store i32 %c, i32 addrspace(1)* %out, align 8
@@ -623,7 +623,7 @@
 ; FUNC-LABEL: {{^}}shl_lshr:
 ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
 ; SI: buffer_store_dword
-define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
   %b = shl i32 %a, 9
   %c = lshr i32 %b, 11
   store i32 %c, i32 addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll
index 23a32dc..2777dd1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}test_export_zeroes:
 ; GCN: exp mrt0 off, off, off, off{{$}}
 ; GCN: exp mrt0 off, off, off, off done{{$}}
-define void @test_export_zeroes() #0 {
+define amdgpu_kernel void @test_export_zeroes() #0 {
 
   call void @llvm.SI.export(i32 0, i32 0, i32 0, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
   call void @llvm.SI.export(i32 0, i32 0, i32 1, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
@@ -21,7 +21,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
-define void @test_export_en_src0() #0 {
+define amdgpu_kernel void @test_export_en_src0() #0 {
   call void @llvm.SI.export(i32 1, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
 }
@@ -32,7 +32,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
-define void @test_export_en_src1() #0 {
+define amdgpu_kernel void @test_export_en_src1() #0 {
   call void @llvm.SI.export(i32 2, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
 }
@@ -43,7 +43,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
-define void @test_export_en_src2() #0 {
+define amdgpu_kernel void @test_export_en_src2() #0 {
   call void @llvm.SI.export(i32 4, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
 }
@@ -54,7 +54,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src3() #0 {
+define amdgpu_kernel void @test_export_en_src3() #0 {
   call void @llvm.SI.export(i32 8, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
 }
@@ -65,7 +65,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
-define void @test_export_en_src0_src1() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1() #0 {
   call void @llvm.SI.export(i32 3, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
 }
@@ -76,7 +76,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
-define void @test_export_en_src0_src2() #0 {
+define amdgpu_kernel void @test_export_en_src0_src2() #0 {
   call void @llvm.SI.export(i32 5, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
 }
@@ -88,7 +88,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src0_src3() #0 {
+define amdgpu_kernel void @test_export_en_src0_src3() #0 {
   call void @llvm.SI.export(i32 9, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 9, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -101,7 +101,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_en_src0_src1_src2_src3() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_src2_src3() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -111,7 +111,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
-define void @test_export_mrt7() #0 {
+define amdgpu_kernel void @test_export_mrt7() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
   ret void
@@ -124,7 +124,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_z() #0 {
+define amdgpu_kernel void @test_export_z() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -137,7 +137,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_null() #0 {
+define amdgpu_kernel void @test_export_null() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -150,7 +150,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved10() #0 {
+define amdgpu_kernel void @test_export_reserved10() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -163,7 +163,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved11() #0 {
+define amdgpu_kernel void @test_export_reserved11() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -176,7 +176,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos0() #0 {
+define amdgpu_kernel void @test_export_pos0() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -189,7 +189,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos3() #0 {
+define amdgpu_kernel void @test_export_pos3() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -202,7 +202,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param0() #0 {
+define amdgpu_kernel void @test_export_param0() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -215,7 +215,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param31() #0 {
+define amdgpu_kernel void @test_export_param31() #0 {
   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
@@ -228,7 +228,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-define void @test_export_vm() #0 {
+define amdgpu_kernel void @test_export_vm() #0 {
   call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index 9c845e8..4a620e0d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -14,7 +14,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
   store i32 %result, i32 addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
-define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
   ret void
 }
@@ -43,7 +43,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
   ret void
@@ -52,7 +52,7 @@
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -61,7 +61,7 @@
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
   store i32 %result, i32 addrspace(1)* %out
@@ -70,7 +70,7 @@
 
 ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32:
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
   ret void
 }
@@ -78,7 +78,7 @@
 ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
   ret void
@@ -88,7 +88,7 @@
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
 ; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
@@ -102,7 +102,7 @@
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
 ; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
@@ -113,7 +113,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
   store i32 %result, i32 addrspace(4)* %out
   ret void
@@ -122,7 +122,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
   store i32 %result, i32 addrspace(4)* %out
@@ -131,7 +131,7 @@
 
 ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32:
 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
   ret void
 }
@@ -139,7 +139,7 @@
 ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
   ret void
@@ -148,7 +148,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
@@ -161,7 +161,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
@@ -173,7 +173,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
   store i64 %result, i64 addrspace(4)* %out
   ret void
@@ -183,7 +183,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
   store i64 %result, i64 addrspace(4)* %out
@@ -194,7 +194,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
   ret void
 }
@@ -203,7 +203,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
   ret void
@@ -213,7 +213,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
@@ -227,7 +227,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
@@ -240,7 +240,7 @@
 ; SI-LABEL: {{^}}atomic_dec_shl_base_lds_0:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]] offset:8
-define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -254,7 +254,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -264,7 +264,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
-define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
   store i64 %result, i64 addrspace(1)* %out
@@ -275,7 +275,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
   ret void
 }
@@ -284,7 +284,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
-define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
   ret void
@@ -294,7 +294,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -304,7 +304,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
   store i64 %result, i64 addrspace(1)* %out
@@ -315,7 +315,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
   ret void
 }
@@ -324,7 +324,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
   ret void
@@ -335,7 +335,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
@@ -350,7 +350,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
@@ -363,7 +363,7 @@
 ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 2209741..a4e3dc3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -14,7 +14,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
   store i32 %result, i32 addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
-define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
   ret void
 }
@@ -43,7 +43,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
   ret void
@@ -52,7 +52,7 @@
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -61,7 +61,7 @@
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
   store i32 %result, i32 addrspace(1)* %out
@@ -70,7 +70,7 @@
 
 ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32:
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
   ret void
 }
@@ -78,7 +78,7 @@
 ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
   ret void
@@ -88,7 +88,7 @@
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
 ; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
@@ -102,7 +102,7 @@
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
 ; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
@@ -115,7 +115,7 @@
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -129,7 +129,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -139,7 +139,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
-define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
   store i64 %result, i64 addrspace(1)* %out
@@ -150,7 +150,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
   ret void
 }
@@ -159,7 +159,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
-define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
   ret void
@@ -169,7 +169,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -179,7 +179,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
   store i64 %result, i64 addrspace(1)* %out
@@ -190,7 +190,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
   ret void
 }
@@ -199,7 +199,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
   ret void
@@ -210,7 +210,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
@@ -225,7 +225,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
@@ -236,7 +236,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
   store i32 %result, i32 addrspace(4)* %out
   ret void
@@ -245,7 +245,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
   store i32 %result, i32 addrspace(4)* %out
@@ -254,7 +254,7 @@
 
 ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32:
 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
   ret void
 }
@@ -262,7 +262,7 @@
 ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
   ret void
@@ -271,7 +271,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
@@ -284,7 +284,7 @@
 ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
@@ -297,7 +297,7 @@
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
@@ -320,7 +320,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
   store i64 %result, i64 addrspace(4)* %out
   ret void
@@ -330,7 +330,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
   store i64 %result, i64 addrspace(4)* %out
@@ -341,7 +341,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
   ret void
 }
@@ -350,7 +350,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
   ret void
@@ -360,7 +360,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
@@ -374,7 +374,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
index 6d9db65..10bea8e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
@@ -8,7 +8,7 @@
 ; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
 ; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
 ; GCN-NEXT: s_endpgm
-define void @test_buffer_wbinvl1() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1() #0 {
   call void @llvm.amdgcn.buffer.wbinvl1()
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
index 7462984..fe60d16 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
@@ -6,7 +6,7 @@
 ; SI-NEXT: ; BB#0:
 ; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
 ; SI-NEXT: s_endpgm
-define void @test_buffer_wbinvl1_sc() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 {
   call void @llvm.amdgcn.buffer.wbinvl1.sc()
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
index 4e0f3c3..061c146 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
@@ -8,7 +8,7 @@
 ; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
 ; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
 ; GCN: s_endpgm
-define void @test_buffer_wbinvl1_vol() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_vol() #0 {
   call void @llvm.amdgcn.buffer.wbinvl1.vol()
; This used to crash in the hazard recognizer

   store i8 0, i8 addrspace(1)* undef, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index 3798c46..f08d4b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -10,7 +10,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @class_f16(
+define amdgpu_kernel void @class_f16(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     i32 addrspace(1)* %b) {
@@ -31,7 +31,7 @@
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_fabs(
+define amdgpu_kernel void @class_f16_fabs(
   i32 addrspace(1)* %r,
   half %a.val,
   i32 %b.val) {
@@ -51,7 +51,7 @@
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_fneg(
+define amdgpu_kernel void @class_f16_fneg(
   i32 addrspace(1)* %r,
   half %a.val,
   i32 %b.val) {
@@ -71,7 +71,7 @@
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_fabs_fneg(
+define amdgpu_kernel void @class_f16_fabs_fneg(
   i32 addrspace(1)* %r,
   half %a.val,
   i32 %b.val) {
@@ -91,7 +91,7 @@
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_1(
+define amdgpu_kernel void @class_f16_1(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
@@ -108,7 +108,7 @@
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_64(
+define amdgpu_kernel void @class_f16_64(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
@@ -126,7 +126,7 @@
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_full_mask(
+define amdgpu_kernel void @class_f16_full_mask(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
@@ -144,7 +144,7 @@
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_nine_bit_mask(
+define amdgpu_kernel void @class_f16_nine_bit_mask(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 668c669..1fcdac5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -14,7 +14,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -29,7 +29,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -45,7 +45,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %a.fneg = fsub float -0.0, %a
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -61,7 +61,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
   %a.fneg.fabs = fsub float -0.0, %a.fabs
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
@@ -76,7 +76,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -104,7 +104,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -118,7 +118,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -132,7 +132,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -150,7 +150,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -170,7 +170,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -190,7 +190,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -205,7 +205,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -221,7 +221,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %a.fneg = fsub double -0.0, %a
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -237,7 +237,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
   %a.fneg.fabs = fsub double -0.0, %a.fabs
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
@@ -249,7 +249,7 @@
 ; SI-LABEL: {{^}}test_class_1_f64:
 ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
 ; SI: s_endpgm
-define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -259,7 +259,7 @@
 ; SI-LABEL: {{^}}test_class_64_f64:
 ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
 ; SI: s_endpgm
-define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -275,7 +275,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -290,7 +290,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -306,7 +306,7 @@
 ; XSI: v_cmp_class_f64_e32 vcc, 1.0,
 ; SI: v_cmp_class_f64_e32 vcc,
 ; SI: s_endpgm
-define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -321,7 +321,7 @@
 ; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
 ; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
 ; SI: s_endpgm
-define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -338,7 +338,7 @@
 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -358,7 +358,7 @@
 ; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -381,7 +381,7 @@
 ; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -416,7 +416,7 @@
 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -436,7 +436,7 @@
 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -456,7 +456,7 @@
 ; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
 ; SI: s_or_b64
 ; SI: s_endpgm
-define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -476,7 +476,7 @@
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -488,7 +488,7 @@
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -500,7 +500,7 @@
 ; SI-NOT: v_cmp_class
 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
 ; SI: buffer_store_dword
-define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
index 410ac59..0543886 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @cos_f16(
+define amdgpu_kernel void @cos_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
index f6495d8..5b9c83c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_cos_f32:
 ; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
   %cos = call float @llvm.amdgcn.cos.f32(float %src) #0
   store float %cos, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
index 22bed45..dadb070 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}test_cubeid:
 ; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
index 565f22c..60c4618 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}test_cubema:
 ; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
index a3ba327..10669cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}test_cubesc:
 ; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
index d3c0f28..b277030 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}test_cubetc:
 ; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 24a8975..b92eb34 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -8,7 +8,7 @@
 ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
 ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
 ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]]
-define void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
@@ -17,7 +17,7 @@
 ; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32:
 ; GCN: s_load_dword [[X:s[0-9]+]]
 ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]]
-define void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
@@ -29,7 +29,7 @@
 ; SI-NEXT: s_endpgm
 ; VI-NEXT: s_endpgm
 ; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-define void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
 ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]]
-define void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -56,7 +56,7 @@
 ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0
-define void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -71,7 +71,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
 ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]]
-define void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -86,7 +86,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]]
-define void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -104,7 +104,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]]
-define void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -122,7 +122,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]]
-define void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -141,7 +141,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]]
-define void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
index 6c09aa5..58250de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
@@ -9,7 +9,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @dispatch_id(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 {
   %tmp0 = call i64 @llvm.amdgcn.dispatch.id()
   store i64 %tmp0, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 2e86252..92208e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}test:
 ; GCN: enable_sgpr_dispatch_ptr = 1
 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
   %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
   %value = load i32, i32 addrspace(2)* %header_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index 6d262cf..e04d9e6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -9,7 +9,7 @@
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16(
+define amdgpu_kernel void @div_fixup_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -30,7 +30,7 @@
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_a(
+define amdgpu_kernel void @div_fixup_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
@@ -49,7 +49,7 @@
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_b(
+define amdgpu_kernel void @div_fixup_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
@@ -68,7 +68,7 @@
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -86,7 +86,7 @@
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_a_imm_b(
+define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %c) {
 entry:
@@ -102,7 +102,7 @@
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_b_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -118,7 +118,7 @@
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_a_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index cc1504f..b8fcacf4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -16,7 +16,7 @@
 ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -24,7 +24,7 @@
 
 ; GCN-LABEL: {{^}}test_div_fixup_f64:
 ; GCN: v_div_fixup_f64
-define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
   %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index d408fe9..a86468b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -20,7 +20,7 @@
 ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -34,7 +34,7 @@
 ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -48,7 +48,7 @@
 ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -62,7 +62,7 @@
 ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -70,7 +70,7 @@
 
 ; GCN-LABEL: {{^}}test_div_fmas_f64:
 ; GCN: v_div_fmas_f64
-define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -79,7 +79,7 @@
 ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
 ; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
   %cmp = icmp eq i32 %i, 0
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@
 ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
 ; SI: s_mov_b64 vcc, 0
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -98,7 +98,7 @@
 ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
 ; SI: s_mov_b64 vcc, -1
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -114,7 +114,7 @@
 ; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
 ; SI: s_endpgm
-define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
@@ -150,7 +150,7 @@
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 8e5c62c..0b4f09a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -11,7 +11,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -31,7 +31,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -51,7 +51,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -71,7 +71,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -91,7 +91,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -109,7 +109,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -127,7 +127,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -145,7 +145,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -163,7 +163,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -181,7 +181,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -199,7 +199,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -217,7 +217,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -236,7 +236,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -250,7 +250,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -265,7 +265,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
@@ -280,7 +280,7 @@
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
@@ -292,7 +292,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %a = load float, float addrspace(1)* %gep.0, align 4
@@ -308,7 +308,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %a = load float, float addrspace(1)* %gep.0, align 4
@@ -326,7 +326,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -349,7 +349,7 @@
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 92d3fc8..4d4e8d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}ds_bpermute:
 ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
   %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
   ret void
@@ -14,7 +14,7 @@
 ; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
 ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
 ; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
   %index = add i32 %base_index, 4
   %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
@@ -24,7 +24,7 @@
 ; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
 ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
   %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
index 6d9c941..1766e28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: {{^}}ds_permute:
 ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CHECK: s_waitcnt lgkmcnt
-define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
   %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
   ret void
@@ -14,7 +14,7 @@
 ; CHECK-LABEL: {{^}}ds_permute_imm_offset:
 ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
 ; CHECK: s_waitcnt lgkmcnt
-define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
   %index = add i32 %base_index, 4
   %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
index ef3cb00..a3a78d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -6,7 +6,7 @@
 ; FUNC-LABEL: {{^}}ds_swizzle:
 ; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100
 ; CHECK: s_waitcnt lgkmcnt
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
   %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
   store i32 %swizzle, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
index a8f6579..b972ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
@@ -8,7 +8,7 @@
 ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2f16:
 ; GCN: exp mrt0 off, off, off, off compr{{$}}
 ; GCN: exp mrt0 off, off, off, off done compr{{$}}
-define void @test_export_compr_zeroes_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_zeroes_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 true, i1 false)
   ret void
@@ -18,7 +18,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], off, off done compr{{$}}
-define void @test_export_compr_en_src0_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_en_src0_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
@@ -27,7 +27,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
 ; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
-define void @test_export_compr_en_src1_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_en_src1_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
@@ -36,7 +36,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
-define void @test_export_compr_en_src0_src1_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
@@ -45,7 +45,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
 ; GCN: exp mrt0 off, [[SRC0]], off, off done compr{{$}}
-define void @test_export_compr_en_invalid2_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_en_invalid2_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
@@ -54,7 +54,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
 ; GCN: exp mrt0 off, [[SRC0]], off, [[SRC1]] done compr{{$}}
-define void @test_export_compr_en_invalid10_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_en_invalid10_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 10, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
@@ -63,7 +63,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0x38003800
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] compr{{$}}
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done compr{{$}}
-define void @test_export_compr_mrt7_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_mrt7_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 true, i1 false)
   ret void
@@ -74,7 +74,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
-define void @test_export_compr_z_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_z_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
@@ -85,7 +85,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
-define void @test_export_compr_vm_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_vm_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 true)
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 true)
   ret void
@@ -94,7 +94,7 @@
 ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2i16:
 ; GCN: exp mrt0 off, off, off, off compr{{$}}
 ; GCN: exp mrt0 off, off, off, off done compr{{$}}
-define void @test_export_compr_zeroes_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_zeroes_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 true, i1 false)
   ret void
@@ -104,7 +104,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
 ; GCN: exp mrt0 [[SRC0]], off, off, off done compr{{$}}
-define void @test_export_compr_en_src0_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_en_src0_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 1, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
 }
@@ -113,7 +113,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
 ; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
-define void @test_export_compr_en_src1_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_en_src1_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 12, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
 }
@@ -122,7 +122,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
-define void @test_export_compr_en_src0_src1_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
 }
@@ -131,7 +131,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VI16:v[0-9]+]], 0x50005
 ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] compr{{$}}
 ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] done compr{{$}}
-define void @test_export_compr_mrt7_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_mrt7_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 true, i1 false)
   ret void
@@ -142,7 +142,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
-define void @test_export_compr_z_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_z_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
@@ -153,7 +153,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
-define void @test_export_compr_vm_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_vm_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 true)
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 true)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 9b5836f..6d2de10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}test_export_zeroes_f32:
 ; GCN: exp mrt0 off, off, off, off{{$}}
 ; GCN: exp mrt0 off, off, off, off done{{$}}
-define void @test_export_zeroes_f32() #0 {
+define amdgpu_kernel void @test_export_zeroes_f32() #0 {
 
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)
@@ -22,7 +22,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
-define void @test_export_en_src0_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -33,7 +33,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
-define void @test_export_en_src1_f32() #0 {
+define amdgpu_kernel void @test_export_en_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -44,7 +44,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
-define void @test_export_en_src2_f32() #0 {
+define amdgpu_kernel void @test_export_en_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -55,7 +55,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src3_f32() #0 {
+define amdgpu_kernel void @test_export_en_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -66,7 +66,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
-define void @test_export_en_src0_src1_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -77,7 +77,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
-define void @test_export_en_src0_src2_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -89,7 +89,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src0_src3_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -102,7 +102,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_en_src0_src1_src2_src3_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -112,7 +112,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
-define void @test_export_mrt7_f32() #0 {
+define amdgpu_kernel void @test_export_mrt7_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false)
   ret void
@@ -125,7 +125,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_z_f32() #0 {
+define amdgpu_kernel void @test_export_z_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -138,7 +138,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_null_f32() #0 {
+define amdgpu_kernel void @test_export_null_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -151,7 +151,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved10_f32() #0 {
+define amdgpu_kernel void @test_export_reserved10_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -164,7 +164,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved11_f32() #0 {
+define amdgpu_kernel void @test_export_reserved11_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -177,7 +177,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos0_f32() #0 {
+define amdgpu_kernel void @test_export_pos0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -190,7 +190,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos3_f32() #0 {
+define amdgpu_kernel void @test_export_pos3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -203,7 +203,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param0_f32() #0 {
+define amdgpu_kernel void @test_export_param0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -216,7 +216,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param31_f32() #0 {
+define amdgpu_kernel void @test_export_param31_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -229,7 +229,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-define void @test_export_vm_f32() #0 {
+define amdgpu_kernel void @test_export_vm_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true)
   ret void
@@ -252,7 +252,7 @@
 ; GCN-LABEL: {{^}}test_export_zeroes_i32:
 ; GCN: exp mrt0 off, off, off, off{{$}}
 ; GCN: exp mrt0 off, off, off, off done{{$}}
-define void @test_export_zeroes_i32() #0 {
+define amdgpu_kernel void @test_export_zeroes_i32() #0 {
 
   call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 true, i1 false)
@@ -267,7 +267,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
-define void @test_export_en_src0_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -278,7 +278,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
-define void @test_export_en_src1_i32() #0 {
+define amdgpu_kernel void @test_export_en_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -289,7 +289,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
-define void @test_export_en_src2_i32() #0 {
+define amdgpu_kernel void @test_export_en_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -300,7 +300,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src3_i32() #0 {
+define amdgpu_kernel void @test_export_en_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -311,7 +311,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
-define void @test_export_en_src0_src1_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -322,7 +322,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
-define void @test_export_en_src0_src2_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -334,7 +334,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src0_src3_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -347,7 +347,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_en_src0_src1_src2_src3_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -357,7 +357,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
-define void @test_export_mrt7_i32() #0 {
+define amdgpu_kernel void @test_export_mrt7_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false)
   ret void
@@ -370,7 +370,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_z_i32() #0 {
+define amdgpu_kernel void @test_export_z_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -383,7 +383,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_null_i32() #0 {
+define amdgpu_kernel void @test_export_null_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -396,7 +396,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved10_i32() #0 {
+define amdgpu_kernel void @test_export_reserved10_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -409,7 +409,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved11_i32() #0 {
+define amdgpu_kernel void @test_export_reserved11_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -422,7 +422,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos0_i32() #0 {
+define amdgpu_kernel void @test_export_pos0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -435,7 +435,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos3_i32() #0 {
+define amdgpu_kernel void @test_export_pos3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -448,7 +448,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param0_i32() #0 {
+define amdgpu_kernel void @test_export_param0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -461,7 +461,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param31_i32() #0 {
+define amdgpu_kernel void @test_export_param31_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -474,7 +474,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-define void @test_export_vm_i32() #0 {
+define amdgpu_kernel void @test_export_vm_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 true)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
index 42a252d..c9993ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
@@ -7,7 +7,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_dynamic_cc:
 ; GCN: s_endpgm
-define void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) {
+define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src0, float %src1, i32 %cc)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs:
 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
   store i64 %result, i64 addrspace(1)* %out
@@ -24,7 +24,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}|
-define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1)
@@ -34,7 +34,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp:
 ; GCN-NOT: v_cmp_eq_f32_e64
-define void @v_fcmp(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -42,7 +42,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq:
 ; GCN: v_cmp_eq_f32_e64
-define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_one:
 ; GCN: v_cmp_neq_f32_e64
-define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -58,7 +58,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ogt:
 ; GCN: v_cmp_gt_f32_e64
-define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oge:
 ; GCN: v_cmp_ge_f32_e64
-define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -74,7 +74,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_olt:
 ; GCN: v_cmp_lt_f32_e64
-define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -82,7 +82,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ole:
 ; GCN: v_cmp_le_f32_e64
-define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -91,7 +91,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ueq:
 ; GCN: v_cmp_nlg_f32_e64
-define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_une:
 ; GCN: v_cmp_neq_f32_e64
-define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -107,7 +107,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ugt:
 ; GCN: v_cmp_nle_f32_e64
-define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -115,7 +115,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_uge:
 ; GCN: v_cmp_nlt_f32_e64
-define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -123,7 +123,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ult:
 ; GCN: v_cmp_nge_f32_e64
-define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -131,7 +131,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ule:
 ; GCN: v_cmp_ngt_f32_e64
-define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -139,7 +139,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_oeq:
 ; GCN: v_cmp_eq_f64_e64
-define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -147,7 +147,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_one:
 ; GCN: v_cmp_neq_f64_e64
-define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -155,7 +155,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ogt:
 ; GCN: v_cmp_gt_f64_e64
-define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -163,7 +163,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_oge:
 ; GCN: v_cmp_ge_f64_e64
-define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -171,7 +171,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_olt:
 ; GCN: v_cmp_lt_f64_e64
-define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -179,7 +179,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ole:
 ; GCN: v_cmp_le_f64_e64
-define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -187,7 +187,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ueq:
 ; GCN: v_cmp_nlg_f64_e64
-define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -195,7 +195,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_une:
 ; GCN: v_cmp_neq_f64_e64
-define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -203,7 +203,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ugt:
 ; GCN: v_cmp_nle_f64_e64
-define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -211,7 +211,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_uge:
 ; GCN: v_cmp_nlt_f64_e64
-define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -219,7 +219,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ult:
 ; GCN: v_cmp_nge_f64_e64
-define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -227,7 +227,7 @@
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ule:
 ; GCN: v_cmp_ngt_f64_e64
-define void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
   store i64 %result, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
index 54d7848..248ee99 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
@@ -8,7 +8,7 @@
 ; CHECK: v_rcp_f32_e32
 ; CHECK: v_mul_f32_e32
 ; CHECK: v_mul_f32_e32
-define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
   %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
   store float %fdiv, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
index 1c22aa1..a4ae37b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}test_fmed3_f16:
 ; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+define amdgpu_kernel void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
   %src0.f16 = trunc i32 %src0.arg to i16
   %src0 = bitcast i16 %src0.f16 to half
   %src1.f16 = trunc i32 %src1.arg to i16
@@ -16,7 +16,7 @@
 
 ; GCN-LABEL: {{^}}test_fmed3_srcmods_f16:
 ; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
-define void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+define amdgpu_kernel void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
   %src0.f16 = trunc i32 %src0.arg to i16
   %src0 = bitcast i16 %src0.f16 to half
   %src1.f16 = trunc i32 %src1.arg to i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
index 010599d..230e625 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}test_fmed3:
 ; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+define amdgpu_kernel void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
   %mad = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
   store float %mad, float addrspace(1)* %out
   ret void
@@ -11,7 +11,7 @@
 
 ; GCN-LABEL: {{^}}test_fmed3_srcmods:
 ; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
-define void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+define amdgpu_kernel void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
   %src0.fneg = fsub float -0.0, %src0
   %src1.fabs = call float @llvm.fabs.f32(float %src1)
   %src2.fabs = call float @llvm.fabs.f32(float %src2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
index d5c1c0a..b47d2db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
@@ -4,7 +4,7 @@
 
 ; GCN-LABEL: {{^}}test_mul_legacy_f32:
 ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -12,7 +12,7 @@
 
 ; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32:
 ; GCN: v_mul_legacy_f32_e32
-define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
   %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -20,7 +20,7 @@
 
 ; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32:
 ; GCN: v_mul_legacy_f32_e32
-define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -28,7 +28,7 @@
 
 ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32:
 ; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}|
-define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %result = call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs)
@@ -40,7 +40,7 @@
 ; GCN-LABEL: {{^}}test_mad_legacy_f32:
 ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_add_f32_e32
-define void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
+define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %add = fadd float %mul, %c
   store float %add, float addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
index d8c1af0..026f690 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fract_f16(
+define amdgpu_kernel void @fract_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
index 53cb607..d4f1c5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: {{^}}v_fract_f32:
 ; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
   %fract = call float @llvm.amdgcn.fract.f32(float %src)
   store float %fract, float addrspace(1)* %out
   ret void
@@ -14,7 +14,7 @@
 
 ; GCN-LABEL: {{^}}v_fract_f64:
 ; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
   %fract = call double @llvm.amdgcn.fract.f64(double %src)
   store double %fract, double addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@
 ; GCN-LABEL: {{^}}v_fract_undef_f32:
 ; GCN-NOT: v_fract_f32
 ; GCN-NOT: store_dword
-define void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
   %fract = call float @llvm.amdgcn.fract.f32(float undef)
   store float %fract, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
index 7521224..dc3eb4c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
@@ -6,7 +6,7 @@
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; VI:  v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_I16]]
-define void @frexp_exp_f16(
+define amdgpu_kernel void @frexp_exp_f16(
     i16 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -21,7 +21,7 @@
 ; VI:  v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; VI:  v_bfe_i32 v[[R_I32:[0-9]+]], v[[R_I16]], 0, 16{{$}}
 ; GCN: buffer_store_dword v[[R_I32]]
-define void @frexp_exp_f16_sext(
+define amdgpu_kernel void @frexp_exp_f16_sext(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; VI:  v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; VI:  v_and_b32_e32 v[[R_I32:[0-9]+]], 0xffff, v[[R_I16]]
 ; GCN: buffer_store_dword v[[R_I32]]
-define void @frexp_exp_f16_zext(
+define amdgpu_kernel void @frexp_exp_f16_zext(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
index 9c49f17..0d68614 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -8,7 +8,7 @@
 
 ; GCN-LABEL: {{^}}s_test_frexp_exp_f32:
 ; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32:
 ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fabs.src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
@@ -25,7 +25,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32:
 ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %fneg.fabs.src = fsub float -0.0, %fabs.src
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fneg.fabs.src)
@@ -35,7 +35,7 @@
 
 ; GCN-LABEL: {{^}}s_test_frexp_exp_f64:
 ; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64:
 ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fabs.src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
@@ -52,7 +52,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64:
 ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %fneg.fabs.src = fsub double -0.0, %fabs.src
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fneg.fabs.src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
index 706537d..722cd44 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @frexp_mant_f16(
+define amdgpu_kernel void @frexp_mant_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
index b8d63de..605dc3d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
@@ -8,7 +8,7 @@
 
 ; GCN-LABEL: {{^}}s_test_frexp_mant_f32:
 ; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
   %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src)
   store float %frexp.mant, float addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32:
 ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src)
   store float %frexp.mant, float addrspace(1)* %out
@@ -25,7 +25,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32:
 ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %fneg.fabs.src = fsub float -0.0, %fabs.src
   %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src)
@@ -35,7 +35,7 @@
 
 ; GCN-LABEL: {{^}}s_test_frexp_mant_f64:
 ; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
   %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src)
   store double %frexp.mant, double addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64:
 ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src)
   store double %frexp.mant, double addrspace(1)* %out
@@ -52,7 +52,7 @@
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64:
 ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %fneg.fabs.src = fsub double -0.0, %fabs.src
   %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
index 6014e2e..d26fab4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -9,7 +9,7 @@
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test0:
 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
-define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
+define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -23,7 +23,7 @@
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test1:
 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
-define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
+define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
 entry:
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
   store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
@@ -51,7 +51,7 @@
 ; Exceeds 16-bit simm limit of s_movk_i32
 ; CHECK-LABEL: {{^}}large_groupstaticsize:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
-define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
+define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
   %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
   store volatile i32 0, i32 addrspace(3)* %gep
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
index 987a41b..aa04af7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
@@ -7,7 +7,7 @@
 ; No crash on invalid input
 ; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc:
 ; GCN: s_endpgm
-define void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) {
+define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i32_eq:
 ; GCN: v_cmp_eq_u32_e64
-define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -23,14 +23,14 @@
 
 ; GCN-LABEL: {{^}}v_icmp:
 ; GCN-NOT: v_cmp_eq_u32_e64
-define void @v_icmp(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 ; GCN-LABEL: {{^}}v_icmp_i32_ne:
 ; GCN: v_cmp_ne_u32_e64
-define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u32_ugt:
 ; GCN: v_cmp_gt_u32_e64
-define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u32_uge:
 ; GCN: v_cmp_ge_u32_e64
-define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -54,7 +54,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u32_ult:
 ; GCN: v_cmp_lt_u32_e64
-define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -62,7 +62,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u32_ule:
 ; GCN: v_cmp_le_u32_e64
-define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -70,7 +70,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i32_sgt:
 ; GCN: v_cmp_gt_i32_e64
-define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -78,7 +78,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i32_sge:
 ; GCN: v_cmp_ge_i32_e64
-define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -86,14 +86,14 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i32_slt:
 ; GCN: v_cmp_lt_i32_e64
-define void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 ; GCN-LABEL: {{^}}v_icmp_i32_sle:
 ; GCN: v_cmp_le_i32_e64
-define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -101,7 +101,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i64_eq:
 ; GCN: v_cmp_eq_u64_e64
-define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -109,7 +109,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i64_ne:
 ; GCN: v_cmp_ne_u64_e64
-define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -117,7 +117,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u64_ugt:
 ; GCN: v_cmp_gt_u64_e64
-define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -125,7 +125,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u64_uge:
 ; GCN: v_cmp_ge_u64_e64
-define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -133,7 +133,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u64_ult:
 ; GCN: v_cmp_lt_u64_e64
-define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -141,7 +141,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_u64_ule:
 ; GCN: v_cmp_le_u64_e64
-define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -149,7 +149,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i64_sgt:
 ; GCN: v_cmp_gt_i64_e64
-define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -157,7 +157,7 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i64_sge:
 ; GCN: v_cmp_ge_i64_e64
-define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -165,14 +165,14 @@
 
 ; GCN-LABEL: {{^}}v_icmp_i64_slt:
 ; GCN: v_cmp_lt_i64_e64
-define void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 ; GCN-LABEL: {{^}}v_icmp_i64_sle:
 ; GCN: v_cmp_le_i64_e64
-define void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
   store i64 %result, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
index a65f422..a9351db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}gather4_v2:
 ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_v2(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; GCN-LABEL: {{^}}gather4:
 ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; GCN-LABEL: {{^}}gather4_cl:
 ; GCN: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@
 
 ; GCN-LABEL: {{^}}gather4_l:
 ; GCN: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@
 
 ; GCN-LABEL: {{^}}gather4_b:
 ; GCN: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@
 
 ; GCN-LABEL: {{^}}gather4_b_cl:
 ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@
 
 ; GCN-LABEL: {{^}}gather4_b_cl_v8:
 ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@
 
 ; GCN-LABEL: {{^}}gather4_lz_v2:
 ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz_v2(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@
 
 ; GCN-LABEL: {{^}}gather4_lz:
 ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -86,7 +86,7 @@
 
 ; GCN-LABEL: {{^}}gather4_o:
 ; GCN: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -95,7 +95,7 @@
 
 ; GCN-LABEL: {{^}}gather4_cl_o:
 ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -104,7 +104,7 @@
 
 ; GCN-LABEL: {{^}}gather4_cl_o_v8:
 ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -113,7 +113,7 @@
 
 ; GCN-LABEL: {{^}}gather4_l_o:
 ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -122,7 +122,7 @@
 
 ; GCN-LABEL: {{^}}gather4_l_o_v8:
 ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -131,7 +131,7 @@
 
 ; GCN-LABEL: {{^}}gather4_b_o:
 ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -140,7 +140,7 @@
 
 ; GCN-LABEL: {{^}}gather4_b_o_v8:
 ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -149,7 +149,7 @@
 
 ; GCN-LABEL: {{^}}gather4_b_cl_o:
 ; GCN: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -158,7 +158,7 @@
 
 ; GCN-LABEL: {{^}}gather4_lz_o:
 ; GCN: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -168,7 +168,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c:
 ; GCN: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -177,7 +177,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_cl:
 ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -186,7 +186,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_cl_v8:
 ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -195,7 +195,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_l:
 ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -204,7 +204,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_l_v8:
 ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -213,7 +213,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_b:
 ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -222,7 +222,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_b_v8:
 ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -231,7 +231,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_b_cl:
 ; GCN: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -240,7 +240,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_lz:
 ; GCN: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -250,7 +250,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_o:
 ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -259,7 +259,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_o_v8:
 ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -268,7 +268,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_cl_o:
 ; GCN: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -277,7 +277,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_l_o:
 ; GCN: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -286,7 +286,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_b_o:
 ; GCN: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -295,7 +295,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_b_cl_o:
 ; GCN: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -304,7 +304,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_lz_o:
 ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -313,7 +313,7 @@
 
 ; GCN-LABEL: {{^}}gather4_c_lz_o_v8:
 ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -322,7 +322,7 @@
 
 ; GCN-LABEL: {{^}}gather4_f32:
 ; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_f32(float addrspace(1)* %out) {
 main_body:
   %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store float %r, float addrspace(1)* %out
@@ -331,7 +331,7 @@
 
 ; GCN-LABEL: {{^}}gather4_v2f32:
 ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define void @gather4_v2f32(<2 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_v2f32(<2 x float> addrspace(1)* %out) {
 main_body:
   %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <2 x float> %r, <2 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
index c7a0461..2e78e2a4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}getlod:
 ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; GCN-LABEL: {{^}}getlod_v2:
 ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; GCN-LABEL: {{^}}getlod_v4:
 ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod_v4(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_getlod_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
index e6412b3..4f90b0a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}sample:
 ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; GCN-LABEL: {{^}}sample_cl:
 ; GCN: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; GCN-LABEL: {{^}}sample_d:
 ; GCN: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@
 
 ; GCN-LABEL: {{^}}sample_d_cl:
 ; GCN: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@
 
 ; GCN-LABEL: {{^}}sample_l:
 ; GCN: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@
 
 ; GCN-LABEL: {{^}}sample_b:
 ; GCN: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@
 
 ; GCN-LABEL: {{^}}sample_b_cl:
 ; GCN: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@
 
 ; GCN-LABEL: {{^}}sample_lz:
 ; GCN: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@
 
 ; GCN-LABEL: {{^}}sample_cd:
 ; GCN: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -84,7 +84,7 @@
 
 ; GCN-LABEL: {{^}}sample_cd_cl:
 ; GCN: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -93,7 +93,7 @@
 
 ; GCN-LABEL: {{^}}sample_c:
 ; GCN: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -102,7 +102,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_cl:
 ; GCN: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -111,7 +111,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_d:
 ; GCN: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -120,7 +120,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_d_cl:
 ; GCN: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -129,7 +129,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_l:
 ; GCN: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -138,7 +138,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_b:
 ; GCN: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -147,7 +147,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_b_cl:
 ; GCN: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -156,7 +156,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_lz:
 ; GCN: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -165,7 +165,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_cd:
 ; GCN: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -174,7 +174,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_cd_cl:
 ; GCN: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -183,7 +183,7 @@
 
 ; GCN-LABEL: {{^}}sample_f32:
 ; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1
-define void @sample_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @sample_f32(float addrspace(1)* %out) {
 main_body:
   %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
   store float %r, float addrspace(1)* %out
@@ -192,7 +192,7 @@
 
 ; GCN-LABEL: {{^}}sample_v2f32:
 ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3
-define void @sample_v2f32(<2 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_v2f32(<2 x float> addrspace(1)* %out) {
 main_body:
   %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <2 x float> %r, <2 x float> addrspace(1)* %out
@@ -201,7 +201,7 @@
 
 ; GCN-LABEL: {{^}}adjust_writemask_sample_0:
 ; GCN: image_sample v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1{{$}}
-define void @adjust_writemask_sample_0(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_0(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -211,7 +211,7 @@
 
 ; GCN-LABEL: {{^}}adjust_writemask_sample_01:
 ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x3{{$}}
-define void @adjust_writemask_sample_01(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_01(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -223,7 +223,7 @@
 
 ; GCN-LABEL: {{^}}adjust_writemask_sample_012:
 ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x7{{$}}
-define void @adjust_writemask_sample_012(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_012(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -237,7 +237,7 @@
 
 ; GCN-LABEL: {{^}}adjust_writemask_sample_12:
 ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x6{{$}}
-define void @adjust_writemask_sample_12(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_12(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   %elt1 = extractelement <4 x float> %r, i32 1
@@ -249,7 +249,7 @@
 
 ; GCN-LABEL: {{^}}adjust_writemask_sample_03:
 ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x9{{$}}
-define void @adjust_writemask_sample_03(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_03(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -261,7 +261,7 @@
 
 ; GCN-LABEL: {{^}}adjust_writemask_sample_13:
 ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xa{{$}}
-define void @adjust_writemask_sample_13(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_13(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   %elt1 = extractelement <4 x float> %r, i32 1
@@ -273,7 +273,7 @@
 
 ; GCN-LABEL: {{^}}adjust_writemask_sample_123:
 ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xe{{$}}
-define void @adjust_writemask_sample_123(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_123(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   %elt1 = extractelement <4 x float> %r, i32 1
@@ -288,7 +288,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_variable_dmask_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) {
+define amdgpu_kernel void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 %dmask, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -300,7 +300,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -311,7 +311,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_cl_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -322,7 +322,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_d_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -333,7 +333,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -344,7 +344,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_l_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -355,7 +355,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_b_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -366,7 +366,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -377,7 +377,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_lz_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -388,7 +388,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -399,7 +399,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
index fd5e138..42d7bc0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}sample:
 ; GCN: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@
 
 ; GCN-LABEL: {{^}}sample_cl:
 ; GCN: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; GCN-LABEL: {{^}}sample_d:
 ; GCN: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@
 
 ; GCN-LABEL: {{^}}sample_d_cl:
 ; GCN: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@
 
 ; GCN-LABEL: {{^}}sample_l:
 ; GCN: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@
 
 ; GCN-LABEL: {{^}}sample_b:
 ; GCN: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@
 
 ; GCN-LABEL: {{^}}sample_b_cl:
 ; GCN: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@
 
 ; GCN-LABEL: {{^}}sample_lz:
 ; GCN: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@
 
 ; GCN-LABEL: {{^}}sample_cd:
 ; GCN: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -84,7 +84,7 @@
 
 ; GCN-LABEL: {{^}}sample_cd_cl:
 ; GCN: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -93,7 +93,7 @@
 
 ; GCN-LABEL: {{^}}sample_c:
 ; GCN: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -102,7 +102,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_cl:
 ; GCN: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -111,7 +111,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_d:
 ; GCN: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -120,7 +120,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_d_cl:
 ; GCN: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -129,7 +129,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_l:
 ; GCN: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -138,7 +138,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_b:
 ; GCN: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -147,7 +147,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_b_cl:
 ; GCN: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -156,7 +156,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_lz:
 ; GCN: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -165,7 +165,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_cd:
 ; GCN: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -174,7 +174,7 @@
 
 ; GCN-LABEL: {{^}}sample_c_cd_cl:
 ; GCN: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -184,7 +184,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -195,7 +195,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -206,7 +206,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_d_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -217,7 +217,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -228,7 +228,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_l_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -239,7 +239,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_b_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -250,7 +250,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -261,7 +261,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_lz_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -272,7 +272,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -283,7 +283,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -294,7 +294,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -305,7 +305,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -316,7 +316,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -327,7 +327,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -338,7 +338,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_l_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -349,7 +349,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -360,7 +360,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -371,7 +371,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_lz_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -382,7 +382,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
@@ -393,7 +393,7 @@
 ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_cl_o_none_enabled:
 ; GCN-NOT: image
 ; GCN-NOT: store
-define void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) {
+define amdgpu_kernel void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %r, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 5d4d4cd..055dddb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -8,7 +8,7 @@
 ; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
 
 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
-define void @test(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
   %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
   %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
@@ -20,7 +20,7 @@
 ; ALL-LABEL: {{^}}test_implicit:
 ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
-define void @test_implicit(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
   %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
@@ -39,7 +39,7 @@
 ; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
 ; MESA: buffer_store_dword [[V_VAL]]
 ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
-define void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
+define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
   %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %arg.ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index 6720cbe..fe211d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -7,7 +7,7 @@
 ; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
 ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]]
 ; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16(
+define amdgpu_kernel void @ldexp_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     i32 addrspace(1)* %b) {
@@ -22,7 +22,7 @@
 ; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
 ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
 ; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16_imm_a(
+define amdgpu_kernel void @ldexp_f16_imm_a(
     half addrspace(1)* %r,
     i32 addrspace(1)* %b) {
   %b.val = load i32, i32 addrspace(1)* %b
@@ -35,7 +35,7 @@
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}}
 ; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16_imm_b(
+define amdgpu_kernel void @ldexp_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
   %a.val = load half, half addrspace(1)* %a
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
index a23defd..1ab4e8b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
@@ -7,7 +7,7 @@
 ; SI-LABEL: {{^}}test_ldexp_f32:
 ; SI: v_ldexp_f32
 ; SI: s_endpgm
-define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
   %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -16,7 +16,7 @@
 ; SI-LABEL: {{^}}test_ldexp_f64:
 ; SI: v_ldexp_f64
 ; SI: s_endpgm
-define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
   %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -24,7 +24,7 @@
 
 ; SI-LABEL: {{^}}test_ldexp_undef_f32:
 ; SI-NOT: v_ldexp_f32
-define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
   %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
index 014369b..bc59989 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_lerp:
 ; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
   %result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
index f78257f..feecd6c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
@@ -7,7 +7,7 @@
 
 ; GCN-LABEL: {{^}}v_log_clamp_f32:
 ; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
   %log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0
   store float %log.clamp, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
index 35fdba8..8baaad1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -7,7 +7,7 @@
 ; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
 ; VI: s_nop 1
 ; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
-define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
   store i32 %tmp0, i32 addrspace(1)* %out
   ret void
@@ -19,7 +19,7 @@
 ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI: s_nop 1
 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
   %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
   store i32 %tmp1, i32 addrspace(1)* %out
@@ -36,7 +36,7 @@
 ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI: s_nop 1
 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
   %cmp = fcmp oeq float %cond, 0.0
   br i1 %cmp, label %if, label %else
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 7c2495e..3a2b87c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
 ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
   %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate:
 ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
   %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
index 04bb97a..a8d03bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
+define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -37,7 +37,7 @@
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
+define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
   %in = load <4 x i32>, <4 x i32> addrspace(1) * %input
 
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
index 83d13ab..dfaac04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_msad_u8:
 ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_msad_u8_non_immediate:
 ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index ece4224..be71225 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
 ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
   %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate:
 ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
   %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index 6bf8715..9200fe7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}test:
 ; GCN: enable_sgpr_queue_ptr = 1
 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
   %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
   %value = load i32, i32 addrspace(2)* %header_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index f0b8e2a..0f1fa15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @rcp_f16(
+define amdgpu_kernel void @rcp_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
index d538614..71db76d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
@@ -7,7 +7,7 @@
 
 ; GCN-LABEL: {{^}}rcp_legacy_f32:
 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float %src) #0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -16,7 +16,7 @@
 ; TODO: Really these should be constant folded
 ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_4.0
 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float 4.0) #0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -24,7 +24,7 @@
 
 ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_100.0
 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float 100.0) #0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -32,7 +32,7 @@
 
 ; GCN-LABEL: {{^}}rcp_legacy_undef_f32:
 ; GCN-NOT: v_rcp_legacy_f32
-define void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float undef)
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 9f89e56..ad2d84b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -8,7 +8,7 @@
 
 ; FUNC-LABEL: {{^}}rcp_undef_f32:
 ; SI-NOT: v_rcp_f32
-define void @rcp_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_undef_f32(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.f32(float undef)
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -17,7 +17,7 @@
 ; FUNC-LABEL: {{^}}rcp_2_f32:
 ; SI-NOT: v_rcp_f32
 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0.5
-define void @rcp_2_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_2_f32(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.f32(float 2.0)
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -26,7 +26,7 @@
 ; FUNC-LABEL: {{^}}rcp_10_f32:
 ; SI-NOT: v_rcp_f32
 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x3dcccccd
-define void @rcp_10_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.f32(float 10.0)
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -36,7 +36,7 @@
 ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -46,7 +46,7 @@
 ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
+define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -54,7 +54,7 @@
 
 ; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32:
 ; SI: v_div_scale_f32
-define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
+define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -63,7 +63,7 @@
 ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
 ; SI: v_sqrt_f32_e32
 ; SI: v_rcp_f32_e32
-define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
   %sqrt = call float @llvm.sqrt.f32(float %src)
   %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
   store float %rcp, float addrspace(1)* %out, align 4
@@ -72,7 +72,7 @@
 
 ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32:
 ; SI: v_rsq_f32_e32
-define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
   %sqrt = call float @llvm.sqrt.f32(float %src)
   %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
   store float %rcp, float addrspace(1)* %out, align 4
@@ -83,7 +83,7 @@
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
   %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -93,7 +93,7 @@
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
   %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -101,7 +101,7 @@
 
 ; FUNC-LABEL: {{^}}rcp_pat_f64:
 ; SI: v_div_scale_f64
-define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
   %rcp = fdiv double 1.0, %src
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -111,7 +111,7 @@
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
   %rcp = fdiv double 1.0, %src
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -121,7 +121,7 @@
 ; SI-NOT: v_rsq_f64_e32
 ; SI: v_sqrt_f64
 ; SI: v_rcp_f64
-define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
   %sqrt = call double @llvm.sqrt.f64(double %src)
   %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
   store double %rcp, double addrspace(1)* %out, align 8
@@ -132,7 +132,7 @@
 ; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
   %sqrt = call double @llvm.sqrt.f64(double %src)
   %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
   store double %rcp, double addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 2569108..9f5c809 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -4,7 +4,7 @@
 
 ; CHECK-LABEL: {{^}}test_readfirstlane:
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 ; CHECK-LABEL: {{^}}test_readfirstlane_imm:
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
-define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -25,7 +25,7 @@
 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
-define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 436ffff..5e892fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -4,7 +4,7 @@
 
 ; CHECK-LABEL: {{^}}test_readlane_sreg:
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
   %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
   store i32 %readlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 ; CHECK-LABEL: {{^}}test_readlane_imm_sreg:
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
-define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
   %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
   store i32 %readlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -25,7 +25,7 @@
 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
-define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
   %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
   store i32 %readlane, i32 addrspace(1)* %out, align 4
@@ -34,7 +34,7 @@
 
 ; CHECK-LABEL: {{^}}test_readlane_imm:
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-define void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
   %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0
   store i32 %readlane, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 5f40e0d..3611047 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -12,7 +12,7 @@
 ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
 ; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]]
 ; VI: buffer_store_dword [[RESULT]]
-define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
   store float %rsq_clamp, float addrspace(1)* %out
   ret void
@@ -30,7 +30,7 @@
 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
 ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
 ; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]]
-define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
   store double %rsq_clamp, double addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_clamp_undef_f32:
 ; SI-NOT: v_rsq_clamp_f32
-define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
   store float %rsq_clamp, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 2022d02..fd48021 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @rsq_f16(
+define amdgpu_kernel void @rsq_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
index 47bd0d8..7f4c2cb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_legacy_f32:
 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 ; TODO: Really these should be constant folded
 ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0
 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0
 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_legacy_undef_f32:
 ; SI-NOT: v_rsq_legacy_f32
-define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float undef)
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index c644288..0ce26d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -6,7 +6,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_f32:
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -15,7 +15,7 @@
 ; TODO: Really these should be constant folded
 ; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
-define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -23,7 +23,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -31,7 +31,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_f64:
 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
   %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0
   store double %rsq, double addrspace(1)* %out, align 4
   ret void
@@ -40,7 +40,7 @@
 ; TODO: Really these should be constant folded
 ; FUNC-LABEL: {{^}}rsq_f64_constant_4.0
 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
   %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0
   store double %rsq, double addrspace(1)* %out, align 4
   ret void
@@ -50,7 +50,7 @@
 ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000
 ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}}
 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
   %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0
   store double %rsq, double addrspace(1)* %out, align 4
   ret void
@@ -58,7 +58,7 @@
 
 ; FUNC-LABEL: {{^}}rsq_undef_f32:
 ; SI-NOT: v_rsq_f32
-define void @rsq_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_undef_f32(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float undef)
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 9559b5a..5f8ca28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -7,7 +7,7 @@
 ; GFX9: flat_store_dword
 ; GFX9-NOT: s_waitcnt
 ; GCN: s_barrier
-define void @test_barrier(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
index ecd4ac6..b488565 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -9,7 +9,7 @@
 ; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7]
 ; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00]
 ; GCN-NEXT: s_endpgm
-define void @test_s_dcache_inv() #0 {
+define amdgpu_kernel void @test_s_dcache_inv() #0 {
   call void @llvm.amdgcn.s.dcache.inv()
   ret void
 }
@@ -18,7 +18,7 @@
 ; GCN-NEXT: ; BB#0:
 ; GCN: s_dcache_inv
 ; GCN: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_inv_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
index 097f35d..a3a5c32 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -9,7 +9,7 @@
 ; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7]
 ; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00]
 ; GCN-NEXT: s_endpgm
-define void @test_s_dcache_inv_vol() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_vol() #0 {
   call void @llvm.amdgcn.s.dcache.inv.vol()
   ret void
 }
@@ -18,7 +18,7 @@
 ; GCN-NEXT: ; BB#0:
 ; GCN-NEXT: s_dcache_inv_vol
 ; GCN: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_inv_vol_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv.vol()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
index 9ecce74..909a85d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -7,7 +7,7 @@
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00]
 ; VI-NEXT: s_endpgm
-define void @test_s_dcache_wb() #0 {
+define amdgpu_kernel void @test_s_dcache_wb() #0 {
   call void @llvm.amdgcn.s.dcache.wb()
   ret void
 }
@@ -16,7 +16,7 @@
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb
 ; VI: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_wb_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
index 943f8c6..217bf97 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -7,7 +7,7 @@
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00]
 ; VI-NEXT: s_endpgm
-define void @test_s_dcache_wb_vol() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_vol() #0 {
   call void @llvm.amdgcn.s.dcache.wb.vol()
   ret void
 }
@@ -16,7 +16,7 @@
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb_vol
 ; VI: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_wb_vol_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb.vol()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
index 72513fc..8f64c50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
@@ -20,7 +20,7 @@
 ; GCN: s_decperflevel 13{{$}}
 ; GCN: s_decperflevel 14{{$}}
 ; GCN: s_decperflevel 15{{$}}
-define void @test_s_decperflevel(i32 %x) #0 {
+define amdgpu_kernel void @test_s_decperflevel(i32 %x) #0 {
   call void @llvm.amdgcn.s.decperflevel(i32 0)
   call void @llvm.amdgcn.s.decperflevel(i32 1)
   call void @llvm.amdgcn.s.decperflevel(i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
index 4304398..906a8a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -4,7 +4,7 @@
 
 ; GCN-LABEL: {{^}}s_getreg_test:
 ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
   %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574)
   %lds_size_bytes = shl i32 %lds_size_64dwords, 8
   store i32 %lds_size_bytes, i32 addrspace(1)* %out
@@ -14,7 +14,7 @@
 ; Call site has additional readnone knowledge.
 ; GCN-LABEL: {{^}}readnone_s_getreg_test:
 ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
   %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #1
   %lds_size_bytes = shl i32 %lds_size_64dwords, 8
   store i32 %lds_size_bytes, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
index 2ae4fc4..49e6e42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
@@ -20,7 +20,7 @@
 ; GCN: s_incperflevel 13{{$}}
 ; GCN: s_incperflevel 14{{$}}
 ; GCN: s_incperflevel 15{{$}}
-define void @test_s_incperflevel(i32 %x) #0 {
+define amdgpu_kernel void @test_s_incperflevel(i32 %x) #0 {
   call void @llvm.amdgcn.s.incperflevel(i32 0)
   call void @llvm.amdgcn.s.incperflevel(i32 1)
   call void @llvm.amdgcn.s.incperflevel(i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
index d8eda10f..6604103 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -10,7 +10,7 @@
 ; GCN-NOT: lgkmcnt
 ; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: _store_dwordx2
-define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.amdgcn.s.memrealtime()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
index ff9d746..6aef769 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -11,7 +11,7 @@
 ; GCN-NOT: lgkmcnt
 ; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2
-define void @test_s_memtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.amdgcn.s.memtime()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
index 870aa48..59c910c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
@@ -20,7 +20,7 @@
 ; GCN: s_sleep 13{{$}}
 ; GCN: s_sleep 14{{$}}
 ; GCN: s_sleep 15{{$}}
-define void @test_s_sleep(i32 %x) #0 {
+define amdgpu_kernel void @test_s_sleep(i32 %x) #0 {
   call void @llvm.amdgcn.s.sleep(i32 0)
   call void @llvm.amdgcn.s.sleep(i32 1)
   call void @llvm.amdgcn.s.sleep(i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
index 3aaed9d..2a3705d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_hi_u8:
 ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_hi_u8_non_immediate:
 ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
index 5438571..c404531 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u16:
 ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u16_non_immediate:
 ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
index 9422d76..1ee876a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u8:
 ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u8_non_immediate:
 ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index c51c1c9..4e232a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
 ; GCN: v_bfe_i32
-define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src1)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -11,7 +11,7 @@
 
 ; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm:
 ; GCN: v_bfe_i32
-define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -19,7 +19,7 @@
 
 ; GCN-LABEL: {{^}}bfe_i32_arg_imm_arg:
 ; GCN: v_bfe_i32
-define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@
 
 ; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg:
 ; GCN: v_bfe_i32
-define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@
 
 ; GCN-LABEL: {{^}}v_bfe_print_arg:
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
-define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 {
+define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 {
   %load = load i32, i32 addrspace(1)* %src0, align 4
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
@@ -45,7 +45,7 @@
 ; GCN-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -54,7 +54,7 @@
 ; GCN-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -64,7 +64,7 @@
 ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31)
@@ -78,7 +78,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31)
@@ -90,7 +90,7 @@
 ; GCN: buffer_load_dword
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
 ; GCN: s_endpgm
-define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
@@ -103,7 +103,7 @@
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -115,7 +115,7 @@
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -127,7 +127,7 @@
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -139,7 +139,7 @@
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -150,7 +150,7 @@
 ; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
@@ -161,7 +161,7 @@
 ; GCN-NOT: lshr
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
@@ -173,7 +173,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -184,7 +184,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -195,7 +195,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -206,7 +206,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -217,7 +217,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -228,7 +228,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -239,7 +239,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -250,7 +250,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -261,7 +261,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -272,7 +272,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -283,7 +283,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -294,7 +294,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -305,7 +305,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -316,7 +316,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -327,7 +327,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -338,7 +338,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -349,7 +349,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -360,7 +360,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -371,7 +371,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1)
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -383,7 +383,7 @@
 ; GCN-NOT: v_ashr
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
 ; GCN: buffer_store_dword [[BFE]],
-define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24)
   %shl = shl i32 %bfe, 8
@@ -399,7 +399,7 @@
 ; GCN: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
 ; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
 ; GCN: buffer_store_dword [[TMP2]]
-define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %src = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16)
   %div = sdiv i32 %bfe, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
index a1030b5..94aeb07 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
@@ -5,7 +5,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsg sendmsg(MSG_INTERRUPT)
-define void @test_interrupt() {
+define amdgpu_kernel void @test_interrupt() {
 body:
   call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0);
   ret void
@@ -15,7 +15,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_gs_emit() {
+define amdgpu_kernel void @test_gs_emit() {
 body:
   call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0);
   ret void
@@ -25,7 +25,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_gs_cut() {
+define amdgpu_kernel void @test_gs_cut() {
 body:
   call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0);
   ret void
@@ -35,7 +35,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_gs_emit_cut() {
+define amdgpu_kernel void @test_gs_emit_cut() {
 body:
   call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0)
   ret void
@@ -45,7 +45,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_gs_done() {
+define amdgpu_kernel void @test_gs_done() {
 body:
   call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
   ret void
@@ -66,7 +66,7 @@
 ; VI-NEXT: s_nop 0
 ; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT)
 ; GCN-NEXT: s_endpgm
-define void @sendmsghalt(i32 inreg %a) #0 {
+define amdgpu_kernel void @sendmsghalt(i32 inreg %a) #0 {
   call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a)
   ret void
 }
@@ -75,7 +75,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsghalt sendmsg(MSG_INTERRUPT)
-define void @test_interrupt_halt() {
+define amdgpu_kernel void @test_interrupt_halt() {
 body:
   call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0)
   ret void
@@ -85,7 +85,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_gs_emit_halt() {
+define amdgpu_kernel void @test_gs_emit_halt() {
 body:
   call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0)
   ret void
@@ -95,7 +95,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_gs_cut_halt() {
+define amdgpu_kernel void @test_gs_cut_halt() {
 body:
   call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0)
   ret void
@@ -105,7 +105,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_gs_emit_cut_halt() {
+define amdgpu_kernel void @test_gs_emit_cut_halt() {
 body:
   call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0)
   ret void
@@ -115,7 +115,7 @@
 ; GCN: s_mov_b32 m0, 0
 ; GCN-NOT: s_mov_b32 m0
 ; GCN: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_gs_done_halt() {
+define amdgpu_kernel void @test_gs_done_halt() {
 body:
   call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index 09f8f48..495e36b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -8,7 +8,7 @@
 ; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]]
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
-define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
   %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
   store i32 %r, i32 addrspace(1)* %out, align 4
   ret void
@@ -18,7 +18,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
-define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
   store i32 %r, i32 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
index fac0e35..4b930bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sin_f16(
+define amdgpu_kernel void @sin_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
index e3692fc..0b7064d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_sin_f32:
 ; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
   %sin = call float @llvm.amdgcn.sin.f32(float %src) #0
   store float %sin, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index caac6dd..e0cec21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -9,7 +9,7 @@
 ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
-define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load double, double addrspace(1)* %aptr, align 8
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone
@@ -22,7 +22,7 @@
 ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
-define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+define amdgpu_kernel void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
   %a = load double, double addrspace(1)* %aptr, align 8
   %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 8a2990a..92e3a10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
 ; GCN: v_bfe_u32
-define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -11,7 +11,7 @@
 
 ; GCN-LABEL: {{^}}bfe_u32_arg_arg_imm:
 ; GCN: v_bfe_u32
-define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -19,7 +19,7 @@
 
 ; GCN-LABEL: {{^}}bfe_u32_arg_imm_arg:
 ; GCN: v_bfe_u32
-define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@
 
 ; GCN-LABEL: {{^}}bfe_u32_imm_arg_arg:
 ; GCN: v_bfe_u32
-define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -36,7 +36,7 @@
 ; GCN-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -45,7 +45,7 @@
 ; GCN-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -55,7 +55,7 @@
 ; GCN: buffer_load_ubyte
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %load = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %load to i32
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
@@ -70,7 +70,7 @@
 ; FIXME: Should be using s_add_i32
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -85,7 +85,7 @@
 ; GCN-NEXT: v_and_b32_e32
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 65535
@@ -99,7 +99,7 @@
 ; GCN: v_add_i32
 ; GCN: bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -114,7 +114,7 @@
 ; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
 ; GCN-NEXT: bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -129,7 +129,7 @@
 ; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
 ; GCN-NEXT: bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
@@ -143,7 +143,7 @@
 ; GCN: v_add_i32
 ; GCN-NEXT: bfe
 ; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 65535
@@ -156,14 +156,14 @@
 ; GCN: buffer_load_dword
 ; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
 ; GCN: s_endpgm
-define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8)
@@ -171,7 +171,7 @@
   ret void
 }
 
-define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1)
@@ -186,7 +186,7 @@
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
-define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = lshr i32 %shl, 31
@@ -201,7 +201,7 @@
 ; GCN-NOT: shr
 ; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
 ; GCN: s_endpgm
-define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = ashr i32 %shl, 31
@@ -214,7 +214,7 @@
 ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31)
@@ -226,7 +226,7 @@
 ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31)
@@ -239,7 +239,7 @@
 ; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
@@ -252,7 +252,7 @@
 ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -264,7 +264,7 @@
 ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -276,7 +276,7 @@
 ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -288,7 +288,7 @@
 ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -299,7 +299,7 @@
 ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shr = ashr i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
@@ -310,7 +310,7 @@
 ; GCN-NOT: lshr
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shr = lshr i32 %x, 31
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
@@ -323,7 +323,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -335,7 +335,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -347,7 +347,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -359,7 +359,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -371,7 +371,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -383,7 +383,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -395,7 +395,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -407,7 +407,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -419,7 +419,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -431,7 +431,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -443,7 +443,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -455,7 +455,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -467,7 +467,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -479,7 +479,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -491,7 +491,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -503,7 +503,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -515,7 +515,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -527,7 +527,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -539,7 +539,7 @@
 ; GCN: buffer_store_dword [[VREG]],
 ; GCN: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
@@ -556,7 +556,7 @@
 ; GCN-DAG: buffer_store_dword [[AND]]
 ; GCN-DAG: buffer_store_dword [[BFE]]
 ; GCN: s_endpgm
-define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
                                             i32 addrspace(1)* %out1,
                                             i32 addrspace(1)* %in) #0 {
   %src = load i32, i32 addrspace(1)* %in, align 4
@@ -570,7 +570,7 @@
 ; GCN-LABEL: {{^}}lshr_and:
 ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
 ; GCN: buffer_store_dword
-define void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
   %b = lshr i32 %a, 6
   %c = and i32 %b, 7
   store i32 %c, i32 addrspace(1)* %out, align 8
@@ -580,7 +580,7 @@
 ; GCN-LABEL: {{^}}v_lshr_and:
 ; GCN: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
 ; GCN: buffer_store_dword
-define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = lshr i32 %a, %b
   %d = and i32 %c, 7
   store i32 %d, i32 addrspace(1)* %out, align 8
@@ -590,7 +590,7 @@
 ; GCN-LABEL: {{^}}and_lshr:
 ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
 ; GCN: buffer_store_dword
-define void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
   %b = and i32 %a, 448
   %c = lshr i32 %b, 6
   store i32 %c, i32 addrspace(1)* %out, align 8
@@ -600,7 +600,7 @@
 ; GCN-LABEL: {{^}}and_lshr2:
 ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
 ; GCN: buffer_store_dword
-define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
   %b = and i32 %a, 511
   %c = lshr i32 %b, 6
   store i32 %c, i32 addrspace(1)* %out, align 8
@@ -610,7 +610,7 @@
 ; GCN-LABEL: {{^}}shl_lshr:
 ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
 ; GCN: buffer_store_dword
-define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
   %b = shl i32 %a, 9
   %c = lshr i32 %b, 11
   store i32 %c, i32 addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
index e851797..e305f8e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
@@ -4,7 +4,7 @@
 ; GCN-DAG: ; wave barrier
 ; GCN-NOT: s_barrier
 
-define void @test_wave_barrier() #0 {
+define amdgpu_kernel void @test_wave_barrier() #0 {
 entry:
   call void @llvm.amdgcn.wave.barrier() #1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
index 58529b8..349e7f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -34,7 +34,7 @@
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -61,7 +61,7 @@
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workgroup.id.y()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -96,7 +96,7 @@
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workgroup.id.z()
   store i32 %id, i32 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 1f18173..8b80998 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -18,7 +18,7 @@
 
 ; ALL-NOT: v0
 ; ALL: {{buffer|flat}}_store_dword {{.*}}v0
-define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -33,7 +33,7 @@
 
 ; ALL-NOT: v1
 ; ALL: {{buffer|flat}}_store_dword {{.*}}v1
-define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.y()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@
 
 ; ALL-NOT: v2
 ; ALL: {{buffer|flat}}_store_dword {{.*}}v2
-define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %id, i32 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 6d5147b..ad6534b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -12,7 +12,7 @@
 ; VI:  v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @ceil_f16(
+define amdgpu_kernel void @ceil_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @ceil_v2f16(
+define amdgpu_kernel void @ceil_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 5807263..60c0721 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -13,7 +13,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @cos_f16(
+define amdgpu_kernel void @cos_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -53,7 +53,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @cos_v2f16(
+define amdgpu_kernel void @cos_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.ll
index eb7dcbb..bd89502 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.ll
@@ -11,7 +11,7 @@
 ;SI: v_cos_f32
 ;SI-NOT: v_cos_f32
 
-define void @test(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %x) #1 {
    %cos = call float @llvm.cos.f32(float %x)
    store float %cos, float addrspace(1)* %out
    ret void
@@ -29,7 +29,7 @@
 ;SI: v_cos_f32
 ;SI-NOT: v_cos_f32
 
-define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
+define amdgpu_kernel void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
    %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx)
    store <4 x float> %cos, <4 x float> addrspace(1)* %out
    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index 8b0854c..c4a76de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -9,7 +9,7 @@
 
 ; CHECK: buffer_store_dword
 ; CHECK: s_endpgm
-define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
+define amdgpu_kernel void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
 entry:
   tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14
   store i32 123, i32 addrspace(1)* %globalptr_arg, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
index 5b87e89..586c064 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
@@ -12,7 +12,7 @@
 ; VI:  v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @exp2_f16(
+define amdgpu_kernel void @exp2_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @exp2_v2f16(
+define amdgpu_kernel void @exp2_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index e10b101..387dc3b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -11,7 +11,7 @@
 ;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;SI: v_exp_f32
 
-define void @test(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) {
 entry:
    %0 = call float @llvm.exp2.f32(float %in)
    store float %0, float addrspace(1)* %out
@@ -34,7 +34,7 @@
 ;SI: v_exp_f32
 ;SI: v_exp_f32
 
-define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -68,7 +68,7 @@
 ;SI: v_exp_f32
 ;SI: v_exp_f32
 ;SI: v_exp_f32
-define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
   store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 3a5f86d..a62cab9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -12,7 +12,7 @@
 ; VI:  v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @floor_f16(
+define amdgpu_kernel void @floor_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @floor_v2f16(
+define amdgpu_kernel void @floor_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 9a9b87d..a677eb8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -16,7 +16,7 @@
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16(
+define amdgpu_kernel void @fma_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -42,7 +42,7 @@
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16_imm_a(
+define amdgpu_kernel void @fma_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
@@ -65,7 +65,7 @@
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16_imm_b(
+define amdgpu_kernel void @fma_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
@@ -88,7 +88,7 @@
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16_imm_c(
+define amdgpu_kernel void @fma_f16_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -130,7 +130,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16(
+define amdgpu_kernel void @fma_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -165,7 +165,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16_imm_a(
+define amdgpu_kernel void @fma_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b,
     <2 x half> addrspace(1)* %c) {
@@ -203,7 +203,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16_imm_b(
+define amdgpu_kernel void @fma_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %c) {
@@ -241,7 +241,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16_imm_c(
+define amdgpu_kernel void @fma_v2f16_imm_c(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index d1d618b..9d01571 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -24,7 +24,7 @@
 ; VI-DENORM: buffer_store_short [[RESULT]]
 
 ; GCN: s_endpgm
-define void @fmuladd_f16(
+define amdgpu_kernel void @fmuladd_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -54,7 +54,7 @@
 ; VI-DENORM: buffer_store_short [[RESULT]]
 
 ; GCN: s_endpgm
-define void @fmuladd_f16_imm_a(
+define amdgpu_kernel void @fmuladd_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
@@ -83,7 +83,7 @@
 
 
 ; GCN: s_endpgm
-define void @fmuladd_f16_imm_b(
+define amdgpu_kernel void @fmuladd_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
@@ -135,7 +135,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmuladd_v2f16(
+define amdgpu_kernel void @fmuladd_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll
index a534ee0..a12ebcb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll
@@ -12,7 +12,7 @@
 ; VI:  v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @log2_f16(
+define amdgpu_kernel void @log2_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @log2_v2f16(
+define amdgpu_kernel void @log2_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index c75e785..b9d593e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -11,7 +11,7 @@
 ;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;SI: v_log_f32
 
-define void @test(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) {
 entry:
    %0 = call float @llvm.log2.f32(float %in)
    store float %0, float addrspace(1)* %out
@@ -34,7 +34,7 @@
 ;SI: v_log_f32
 ;SI: v_log_f32
 
-define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -68,7 +68,7 @@
 ;SI: v_log_f32
 ;SI: v_log_f32
 ;SI: v_log_f32
-define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
   store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 86ee8f1..4c93c77 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -14,7 +14,7 @@
 ; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @maxnum_f16(
+define amdgpu_kernel void @maxnum_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -34,7 +34,7 @@
 ; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @maxnum_f16_imm_a(
+define amdgpu_kernel void @maxnum_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -52,7 +52,7 @@
 ; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @maxnum_f16_imm_b(
+define amdgpu_kernel void @maxnum_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -87,7 +87,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @maxnum_v2f16(
+define amdgpu_kernel void @maxnum_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -115,7 +115,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @maxnum_v2f16_imm_a(
+define amdgpu_kernel void @maxnum_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -141,7 +141,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @maxnum_v2f16_imm_b(
+define amdgpu_kernel void @maxnum_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
index 009338d..7b4db55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -80,7 +80,7 @@
 ; SI-DAG: ds_write_b8
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
@@ -125,7 +125,7 @@
 ; SI-DAG: ds_write_b16
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
@@ -144,7 +144,7 @@
 ; SI: ds_write2_b32
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
@@ -161,7 +161,7 @@
 ; SI: ds_write2_b64
 
 ; SI-DAG: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
@@ -238,7 +238,7 @@
 ; SI-DAG: buffer_store_byte
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
@@ -281,7 +281,7 @@
 ; SI-DAG: buffer_store_short
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
@@ -294,7 +294,7 @@
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
@@ -307,7 +307,7 @@
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
@@ -320,7 +320,7 @@
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
@@ -340,7 +340,7 @@
 ; SI-DAG: s_load_dwordx2
 ; SI-DAG: buffer_store_dwordx4
 ; SI-DAG: buffer_store_dwordx4
-define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
+define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
   %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)*
   call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 4, i1 false)
   ret void
@@ -365,7 +365,7 @@
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
-define void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
+define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
   %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)*
   call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 1, i1 false)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index a69d99c..d058256 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -14,7 +14,7 @@
 ; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @minnum_f16(
+define amdgpu_kernel void @minnum_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -34,7 +34,7 @@
 ; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @minnum_f16_imm_a(
+define amdgpu_kernel void @minnum_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -52,7 +52,7 @@
 ; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @minnum_f16_imm_b(
+define amdgpu_kernel void @minnum_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -86,7 +86,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @minnum_v2f16(
+define amdgpu_kernel void @minnum_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -117,7 +117,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @minnum_v2f16_imm_a(
+define amdgpu_kernel void @minnum_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -143,7 +143,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @minnum_v2f16_imm_b(
+define amdgpu_kernel void @minnum_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
index 4db29c5..de8a477 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
@@ -2,7 +2,7 @@
 
 declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone
 
-define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
+define amdgpu_kernel void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
   %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
   %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
   %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
index e4e6dd8..93caafb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
@@ -2,7 +2,7 @@
 
 ; EG-LABEL: {{^}}test_group_barrier:
 ; EG: GROUP_BARRIER
-define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_group_barrier(i32 addrspace(1)* %out) #0 {
 entry:
   %tmp = call i32 @llvm.r600.read.tidig.x()
   %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index a5b07e0..82c4260 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -14,7 +14,7 @@
 
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -44,7 +44,7 @@
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -59,7 +59,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
 ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) {
 entry:
   %x = call i32 @llvm.r600.read.local.size.x() #0
   %y = call i32 @llvm.r600.read.local.size.y() #0
@@ -78,7 +78,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xz(i32 addrspace(1)* %out) {
 entry:
   %x = call i32 @llvm.r600.read.local.size.x() #0
   %z = call i32 @llvm.r600.read.local.size.z() #0
@@ -98,7 +98,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_yz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) {
 entry:
   %y = call i32 @llvm.r600.read.local.size.y() #0
   %z = call i32 @llvm.r600.read.local.size.z() #0
@@ -121,7 +121,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xyz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) {
 entry:
   %x = call i32 @llvm.r600.read.local.size.x() #0
   %y = call i32 @llvm.r600.read.local.size.y() #0
@@ -138,7 +138,7 @@
 ; GCN-NOT: 0xffff
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) {
 entry:
   %size = call i32 @llvm.r600.read.local.size.x() #0
   %shl = shl i32 %size, 16
@@ -153,7 +153,7 @@
 ; GCN-NOT: 0xffff
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) {
 entry:
   %size = call i32 @llvm.r600.read.local.size.y() #0
   %shl = shl i32 %size, 16
@@ -168,7 +168,7 @@
 ; GCN-NOT: 0xffff
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) {
 entry:
   %size = call i32 @llvm.r600.read.local.size.z() #0
   %shl = shl i32 %size, 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
index 1c6e795..90d076d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
@@ -4,7 +4,7 @@
 
 ; EG-LABEL: {{^}}rsq_clamped_f32:
 ; EG: RECIPSQRT_CLAMPED
-define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
+define amdgpu_kernel void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src)
   store float %rsq_clamped, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
index 1d6bff0..d9177b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
@@ -4,7 +4,7 @@
 
 ; EG-LABEL: {{^}}recipsqrt.ieee_f32:
 ; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
   %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone
   store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 ; TODO: Really these should be constant folded
 ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0
 ; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
   %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone
   store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@
 
 ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0
 ; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
   %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone
   store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll
index 409037f..9804491 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll
@@ -17,7 +17,7 @@
 ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
 ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
 
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
 bb:
   %addr = load <4 x float>, <4 x float> addrspace(1)* %in
   %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index c8c6e28..8f6007c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -12,7 +12,7 @@
 ; VI:  v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @rint_f16(
+define amdgpu_kernel void @rint_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @rint_v2f16(
+define amdgpu_kernel void @rint_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll
index c63fb17..30ce8ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll
@@ -11,7 +11,7 @@
 ; SI: v_cndmask_b32
 ; SI: v_cndmask_b32
 ; SI: s_endpgm
-define void @rint_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @rint_f64(double addrspace(1)* %out, double %in) {
 entry:
   %0 = call double @llvm.rint.f64(double %in)
   store double %0, double addrspace(1)* %out
@@ -21,7 +21,7 @@
 ; FUNC-LABEL: {{^}}rint_v2f64:
 ; CI: v_rndne_f64_e32
 ; CI: v_rndne_f64_e32
-define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 entry:
   %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
   store <2 x double> %0, <2 x double> addrspace(1)* %out
@@ -33,7 +33,7 @@
 ; CI: v_rndne_f64_e32
 ; CI: v_rndne_f64_e32
 ; CI: v_rndne_f64_e32
-define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 entry:
   %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)
   store <4 x double> %0, <4 x double> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll
index cf7c0e4..4056bc3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll
@@ -6,7 +6,7 @@
 ; R600: RNDNE
 
 ; SI: v_rndne_f32_e32
-define void @rint_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @rint_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.rint.f32(float %in) #0
   store float %0, float addrspace(1)* %out
@@ -19,7 +19,7 @@
 
 ; SI: v_rndne_f32_e32
 ; SI: v_rndne_f32_e32
-define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -36,7 +36,7 @@
 ; SI: v_rndne_f32_e32
 ; SI: v_rndne_f32_e32
 ; SI: v_rndne_f32_e32
-define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0
   store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 3ea4551..c58b9b4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -2,7 +2,7 @@
 
 ; FUNC-LABEL: {{^}}round_f64:
 ; SI: s_endpgm
-define void @round_f64(double addrspace(1)* %out, double %x) #0 {
+define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
   %result = call double @llvm.round.f64(double %x) #1
   store double %result, double addrspace(1)* %out
   ret void
@@ -26,7 +26,7 @@
 
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
@@ -38,7 +38,7 @@
 
 ; FUNC-LABEL: {{^}}round_v2f64:
 ; SI: s_endpgm
-define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
+define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
   %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@
 
 ; FUNC-LABEL: {{^}}round_v4f64:
 ; SI: s_endpgm
-define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
+define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
   %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
@@ -54,7 +54,7 @@
 
 ; FUNC-LABEL: {{^}}round_v8f64:
 ; SI: s_endpgm
-define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
+define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
   %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
   store <8 x double> %result, <8 x double> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index 7e8f8ff..3cf86c38 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -20,7 +20,7 @@
 ; R600-DAG: SETGE
 ; R600-DAG: CNDE
 ; R600-DAG: ADD
-define void @round_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @round_f32(float addrspace(1)* %out, float %x) #0 {
   %result = call float @llvm.round.f32(float %x) #1
   store float %result, float addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@
 ; FUNC-LABEL: {{^}}round_v2f32:
 ; SI: s_endpgm
 ; R600: CF_END
-define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
+define amdgpu_kernel void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
   %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@
 ; FUNC-LABEL: {{^}}round_v4f32:
 ; SI: s_endpgm
 ; R600: CF_END
-define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
+define amdgpu_kernel void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
   %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
@@ -52,7 +52,7 @@
 ; FUNC-LABEL: {{^}}round_v8f32:
 ; SI: s_endpgm
 ; R600: CF_END
-define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
+define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
   %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
   store <8 x float> %result, <8 x float> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index d17df13..e89e545 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -13,7 +13,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sin_f16(
+define amdgpu_kernel void @sin_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -52,7 +52,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @sin_v2f16(
+define amdgpu_kernel void @sin_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
index 0475439..2a17303 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -12,7 +12,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @sin_f32(float addrspace(1)* %out, float %x) #1 {
    %sin = call float @llvm.sin.f32(float %x)
    store float %sin, float addrspace(1)* %out
    ret void
@@ -24,7 +24,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
   %y = fmul float 3.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -38,7 +38,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
   %y = fmul float 3.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -51,7 +51,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
   %y = fmul float 2.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -65,7 +65,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
   %y = fmul float 2.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -78,7 +78,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
    %y = fmul float 2.0, %x
    %sin = call float @llvm.sin.f32(float %y)
    store float %sin, float addrspace(1)* %out
@@ -91,7 +91,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
    %y = fmul float 2.0, %x
    %sin = call float @llvm.sin.f32(float %y)
    store float %sin, float addrspace(1)* %out
@@ -110,7 +110,7 @@
 ; SI: v_sin_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
+define amdgpu_kernel void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
    %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
    store <4 x float> %sin, <4 x float> addrspace(1)* %out
    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 74373ff..64bd5db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -12,7 +12,7 @@
 ; VI:  v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sqrt_f16(
+define amdgpu_kernel void @sqrt_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @sqrt_v2f16(
+define amdgpu_kernel void @sqrt_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index ee6eb50..bcfe868 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -12,7 +12,7 @@
 ; VI:  v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @trunc_f16(
+define amdgpu_kernel void @trunc_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @trunc_v2f16(
+define amdgpu_kernel void @trunc_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 1b42a9e..0050d1a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -6,7 +6,7 @@
 ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_store_dwordx2
-define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
   %ld = load double, double addrspace(2)* %in
   store double %ld, double addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 104af10..c8abe5c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -9,56 +9,56 @@
 
 ; EG: VTX_READ_8
 ; EG: AND_INT
-define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %load = load i1, i1 addrspace(2)* %in
   store i1 %load, i1 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v2i1:
-define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   store <2 x i1> %load, <2 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v3i1:
-define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   store <3 x i1> %load, <3 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v4i1:
-define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   store <4 x i1> %load, <4 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v8i1:
-define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   store <8 x i1> %load, <8 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v16i1:
-define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   store <16 x i1> %load, <16 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v32i1:
-define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   store <32 x i1> %load, <32 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v64i1:
-define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   store <64 x i1> %load, <64 x i1> addrspace(1)* %out
   ret void
@@ -67,7 +67,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
-define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = zext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -81,7 +81,7 @@
 
 ; EG: VTX_READ_8
 ; EG: BFE_INT
-define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = sext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -89,7 +89,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
-define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = zext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -97,7 +97,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
-define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = sext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -105,7 +105,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
-define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -113,7 +113,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
-define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = sext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -121,7 +121,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
-define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = zext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
-define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = sext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
-define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -145,7 +145,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
-define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = sext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -153,7 +153,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
-define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -161,7 +161,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
-define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = sext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -169,7 +169,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
-define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -177,7 +177,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
-define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = sext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -185,7 +185,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
-define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = zext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -193,7 +193,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
-define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = sext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -201,7 +201,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
-define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = zext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -209,7 +209,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
-define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = sext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -221,7 +221,7 @@
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
 ; GCN: buffer_store_dwordx2
-define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = zext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -233,7 +233,7 @@
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: buffer_store_dwordx2
-define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = sext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -241,7 +241,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
-define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = zext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
-define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = sext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -257,7 +257,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
-define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -265,7 +265,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
-define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -273,7 +273,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
-define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = zext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -281,7 +281,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
-define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = sext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -289,7 +289,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
-define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -297,7 +297,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
-define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -305,7 +305,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
-define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -313,7 +313,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
-define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -321,7 +321,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
-define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -329,7 +329,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
-define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -337,7 +337,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
-define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = zext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -345,7 +345,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
-define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = sext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -353,7 +353,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
-define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = zext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -361,7 +361,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
-define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = sext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index f7be129..5dd2efd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -8,7 +8,7 @@
 ; GCN-HSA: flat_load_ushort
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %ld = load i16, i16 addrspace(2)* %in
   store i16 %ld, i16 addrspace(1)* %out
@@ -19,7 +19,7 @@
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in
   store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
@@ -31,7 +31,7 @@
 
 ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
-define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
   store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
@@ -42,7 +42,7 @@
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in
   store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
@@ -53,7 +53,7 @@
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in
   store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
@@ -65,7 +65,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
   store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
@@ -80,7 +80,7 @@
 ; GCN-HSA: flat_store_dword
 
 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -97,7 +97,7 @@
 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 16
-define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -109,7 +109,7 @@
 ; GCN-HSA: flat_load_ushort
 
 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -123,7 +123,7 @@
 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 16
-define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -140,7 +140,7 @@
 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
 ; EG: 16
 ; EG: 16
-define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -160,7 +160,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -183,7 +183,7 @@
 ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -204,7 +204,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -229,7 +229,7 @@
 ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -254,7 +254,7 @@
 ; EG-DAG: 16
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -288,7 +288,7 @@
 ; EG-DAG: 65535
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -322,7 +322,7 @@
 ; EG-DAG: 16
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -337,7 +337,7 @@
 ; v16i16 is naturally 32 byte aligned
 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
-define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -352,7 +352,7 @@
 ; v16i16 is naturally 32 byte aligned
 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
-define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -369,7 +369,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -385,7 +385,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -404,7 +404,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -421,7 +421,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -438,7 +438,7 @@
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -464,7 +464,7 @@
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EG: 31
-define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -475,7 +475,7 @@
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -488,7 +488,7 @@
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EG: 31
-define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -498,7 +498,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -508,7 +508,7 @@
 ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -518,7 +518,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -528,7 +528,7 @@
 ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -538,7 +538,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -548,7 +548,7 @@
 ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -559,7 +559,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -570,7 +570,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -583,7 +583,7 @@
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -596,7 +596,7 @@
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -606,7 +606,7 @@
 ; These trigger undefined register machine verifier errors
 
 ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
-; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -614,7 +614,7 @@
 ; }
 
 ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
-; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index d1ff1c7..7370d45 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -7,7 +7,7 @@
 ; GCN: s_load_dword s{{[0-9]+}}
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(2)* %in
   store i32 %ld, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64
-define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -40,7 +40,7 @@
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -52,7 +52,7 @@
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
@@ -66,7 +66,7 @@
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -81,7 +81,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 ; EG: CF_END
 ; EG: VTX_READ_32
-define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
   %ld = load i32, i32 addrspace(2)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -98,7 +98,7 @@
 ; EG: VTX_READ_32
 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
 ; EG: 31
-define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
   %ld = load i32, i32 addrspace(2)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -108,7 +108,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
 ; GCN: s_load_dword
 ; GCN: store_dwordx2
-define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -119,7 +119,7 @@
 ; GCN: s_load_dword s[[LO:[0-9]+]]
 ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
 ; GCN: store_dwordx2
-define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -129,7 +129,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: store_dwordx4
-define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -143,7 +143,7 @@
 ; GCN-DAG: s_ashr_i32
 
 ; GCN: store_dwordx4
-define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -155,7 +155,7 @@
 
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -172,7 +172,7 @@
 
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -191,7 +191,7 @@
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -219,7 +219,7 @@
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -240,7 +240,7 @@
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -267,7 +267,7 @@
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
-define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -319,7 +319,7 @@
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 
-define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -370,7 +370,7 @@
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 0d071a1..14e50ea 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -7,7 +7,7 @@
 ; FUNC-LABEL: {{^}}constant_load_i64:
 ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; EG: VTX_READ_64
-define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
   %ld = load i64, i64 addrspace(2)* %in
   store i64 %ld, i64 addrspace(1)* %out
   ret void
@@ -17,7 +17,7 @@
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in
   store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; EG-DAG: VTX_READ_128
 ; EG-DAG: VTX_READ_128
-define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in
   store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
@@ -41,7 +41,7 @@
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in
   store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
@@ -55,7 +55,7 @@
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in
   store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
@@ -74,7 +74,7 @@
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in
   store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 9fdc4eb..6e56b9f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -10,7 +10,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; TODO: NOT AND
-define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
 entry:
   %ld = load i8, i8 addrspace(2)* %in
   store i8 %ld, i8 addrspace(1)* %out
@@ -22,7 +22,7 @@
 ; GCN-HSA: flat_load_ushort v
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
   store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
@@ -33,7 +33,7 @@
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
   store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
@@ -44,7 +44,7 @@
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in
   store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
@@ -55,7 +55,7 @@
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in
   store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
@@ -66,7 +66,7 @@
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
   store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
@@ -78,7 +78,7 @@
 ; GCN-HSA: flat_load_ubyte
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = zext i8 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -92,7 +92,7 @@
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %ld = load i8, i8 addrspace(2)* %in
   %ext = sext i8 %ld to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -102,7 +102,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = zext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -114,7 +114,7 @@
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = sext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@
 ; TODO: This should use DST, but for some reason there are redundant MOVs
 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG: 8
-define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -150,7 +150,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = sext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -170,7 +170,7 @@
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
   %ext = zext <3 x i8> %ld to <3 x i32>
@@ -193,7 +193,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
   %ext = sext <3 x i8> %ld to <3 x i32>
@@ -214,7 +214,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = zext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -236,7 +236,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = sext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -264,7 +264,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -294,7 +294,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -335,7 +335,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -378,7 +378,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -450,7 +450,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = zext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -526,7 +526,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = sext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -539,7 +539,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
   %ext = zext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -552,7 +552,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
   %ext = sext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -570,7 +570,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = zext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -589,7 +589,7 @@
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7?
 ; EG: 31
-define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = sext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -600,7 +600,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = zext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -613,7 +613,7 @@
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7?
 ; EG: 31
-define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -623,7 +623,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -633,7 +633,7 @@
 ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -643,7 +643,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -653,7 +653,7 @@
 ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -663,7 +663,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -673,7 +673,7 @@
 ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -683,7 +683,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -693,7 +693,7 @@
 ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -704,7 +704,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = zext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -715,7 +715,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = sext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -723,7 +723,7 @@
 }
 
 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64:
-; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -731,7 +731,7 @@
 ; }
 
 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64:
-; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -744,7 +744,7 @@
 
 ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
-define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = zext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -759,7 +759,7 @@
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = sext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -767,7 +767,7 @@
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16:
-define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = zext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -778,7 +778,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = sext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -788,7 +788,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = zext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -800,7 +800,7 @@
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = sext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -810,7 +810,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = zext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -824,7 +824,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = sext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -834,7 +834,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = zext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -853,7 +853,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 
-define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = sext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -863,7 +863,7 @@
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = zext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -889,7 +889,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = sext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -900,7 +900,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = zext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -943,7 +943,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = sext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -951,7 +951,7 @@
 }
 
 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16:
-; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
@@ -959,7 +959,7 @@
 ; }
 
 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16:
-; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
index 805c0a7..bd6fea5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -10,7 +10,7 @@
 ; GCN-HSA: flat_load_dword
 
 ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load float, float addrspace(1)* %in
   store float %tmp0, float addrspace(1)* %out
@@ -22,7 +22,7 @@
 ; GCN-HSA: flat_load_dwordx2
 
 ; R600: VTX_READ_64
-define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
   store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -34,7 +34,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; R600: VTX_READ_128
-define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
   store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
@@ -46,7 +46,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; R600: VTX_READ_128
-define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
   store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
@@ -61,7 +61,7 @@
 
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
   store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
@@ -83,7 +83,7 @@
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
   store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
index dc1a943..5b772e1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -8,7 +8,7 @@
 
 ; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
 ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %ld = load double, double addrspace(1)* %in
   store double %ld, double addrspace(1)* %out
   ret void
@@ -17,7 +17,7 @@
 ; FUNC-LABEL: {{^}}global_load_v2f64:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x double>, <2 x double> addrspace(1)* %in
   store <2 x double> %ld, <2 x double> addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x double>, <3 x double> addrspace(1)* %in
   store <3 x double> %ld, <3 x double> addrspace(1)* %out
@@ -42,7 +42,7 @@
 
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x double>, <4 x double> addrspace(1)* %in
   store <4 x double> %ld, <4 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x double>, <8 x double> addrspace(1)* %in
   store <8 x double> %ld, <8 x double> addrspace(1)* %out
@@ -84,7 +84,7 @@
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x double>, <16 x double> addrspace(1)* %in
   store <16 x double> %ld, <16 x double> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
index e2e90ca..cb3536a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -9,56 +9,56 @@
 
 ; EG: VTX_READ_8
 ; EG: AND_INT
-define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %load = load i1, i1 addrspace(1)* %in
   store i1 %load, i1 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v2i1:
-define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   store <2 x i1> %load, <2 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v3i1:
-define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   store <3 x i1> %load, <3 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v4i1:
-define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   store <4 x i1> %load, <4 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v8i1:
-define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   store <8 x i1> %load, <8 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v16i1:
-define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   store <16 x i1> %load, <16 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v32i1:
-define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   store <32 x i1> %load, <32 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v64i1:
-define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   store <64 x i1> %load, <64 x i1> addrspace(1)* %out
   ret void
@@ -67,7 +67,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
-define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -81,7 +81,7 @@
 
 ; EG: VTX_READ_8
 ; EG: BFE_INT
-define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -89,7 +89,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32:
-define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = zext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -97,7 +97,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32:
-define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = sext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -105,7 +105,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32:
-define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -113,7 +113,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32:
-define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = sext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -121,7 +121,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32:
-define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = zext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32:
-define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = sext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32:
-define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -145,7 +145,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32:
-define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = sext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -153,7 +153,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32:
-define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -161,7 +161,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32:
-define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = sext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -169,7 +169,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32:
-define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -177,7 +177,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32:
-define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = sext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -185,7 +185,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32:
-define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = zext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -193,7 +193,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32:
-define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = sext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -201,7 +201,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32:
-define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = zext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -209,7 +209,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32:
-define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = sext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -221,7 +221,7 @@
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}}
 ; GCN: buffer_store_dwordx2
-define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -233,7 +233,7 @@
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: buffer_store_dwordx2
-define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -241,7 +241,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64:
-define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = zext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64:
-define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = sext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -257,7 +257,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64:
-define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -265,7 +265,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64:
-define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -273,7 +273,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64:
-define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = zext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -281,7 +281,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64:
-define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = sext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -289,7 +289,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64:
-define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -297,7 +297,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64:
-define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -305,7 +305,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64:
-define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -313,7 +313,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64:
-define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -321,7 +321,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64:
-define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -329,7 +329,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64:
-define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -337,7 +337,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64:
-define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = zext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -345,7 +345,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64:
-define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = sext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -353,7 +353,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64:
-define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = zext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -361,7 +361,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64:
-define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = sext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 88d6b7b..dcdd1a9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -11,7 +11,7 @@
 ; GCN-HSA: flat_load_ushort
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %ld = load i16, i16 addrspace(1)* %in
   store i16 %ld, i16 addrspace(1)* %out
@@ -23,7 +23,7 @@
 ; GCN-HSA: flat_load_dword v
 
 ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
   store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
@@ -36,7 +36,7 @@
 
 ; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
-define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
   store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
@@ -48,7 +48,7 @@
 ; GCN-HSA: flat_load_dwordx2
 
 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
   store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
@@ -60,7 +60,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
   store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
@@ -76,7 +76,7 @@
 
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
   store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
@@ -91,7 +91,7 @@
 ; GCN-HSA: flat_store_dword
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -108,7 +108,7 @@
 ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EGCM: 16
-define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -120,7 +120,7 @@
 ; GCN-HSA: flat_load_ushort
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -134,7 +134,7 @@
 ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EGCM: 16
-define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -148,7 +148,7 @@
 ; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
 ; EGCM: 16
-define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -168,7 +168,7 @@
 ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -190,7 +190,7 @@
 ; EGCM: 16
 ; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
 ; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal
-define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -214,7 +214,7 @@
 ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -237,7 +237,7 @@
 ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal
 ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal
 ; EGCM-DAG: 16
-define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -262,7 +262,7 @@
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -296,7 +296,7 @@
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -330,7 +330,7 @@
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -346,7 +346,7 @@
 
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -357,7 +357,7 @@
 
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -379,7 +379,7 @@
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -401,7 +401,7 @@
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -435,7 +435,7 @@
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
-define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -452,7 +452,7 @@
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
-define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -469,7 +469,7 @@
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EGCM: MOV {{.*}}, 0.0
-define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -495,7 +495,7 @@
 ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EGCM: 31
-define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -506,7 +506,7 @@
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EGCM: MOV {{.*}}, 0.0
-define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -519,7 +519,7 @@
 ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EGCM: 31
-define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -527,7 +527,7 @@
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64:
-define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -537,7 +537,7 @@
 ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64:
 
 ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -547,7 +547,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64:
 
 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -557,7 +557,7 @@
 ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64:
 
 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -567,7 +567,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64:
 
 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -577,7 +577,7 @@
 ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64:
 
 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -588,7 +588,7 @@
 
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -599,7 +599,7 @@
 
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -612,7 +612,7 @@
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -625,7 +625,7 @@
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -633,7 +633,7 @@
 }
 
 ; ; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64:
-; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -641,7 +641,7 @@
 ; }
 
 ; ; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64:
-; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e333534..5df32c1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -9,7 +9,7 @@
 ; GCN-HSA: flat_load_dword
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(1)* %in
   store i32 %ld, i32 addrspace(1)* %out
@@ -21,7 +21,7 @@
 ; GCN-HSA: flat_load_dwordx2
 
 ; EG: VTX_READ_64
-define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -33,7 +33,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -45,7 +45,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -60,7 +60,7 @@
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
@@ -82,7 +82,7 @@
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -98,7 +98,7 @@
 ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -117,7 +117,7 @@
 ; EG: VTX_READ_32
 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
 ; EG: 31
-define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -130,7 +130,7 @@
 
 ; GCN-HSA: flat_load_dword
 ; GCN-HSA: flat_store_dwordx2
-define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -143,7 +143,7 @@
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -156,7 +156,7 @@
 
 ; GCN-HSA: flat_load_dwordx2
 ; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -172,7 +172,7 @@
 
 ; GCN-NOHSA-DAG: buffer_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -187,7 +187,7 @@
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -208,7 +208,7 @@
 
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -231,7 +231,7 @@
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -263,7 +263,7 @@
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -309,7 +309,7 @@
 ; GCN-DAG: v_ashrrev_i32
 ; GCN-NOHSA-DAG: buffer_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -344,7 +344,7 @@
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -444,7 +444,7 @@
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 
-define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -511,7 +511,7 @@
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
index dd4ce2c..de16b6c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -13,7 +13,7 @@
 ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
 
 ; EG: VTX_READ_64
-define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %ld = load i64, i64 addrspace(1)* %in
   store i64 %ld, i64 addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in
   store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
@@ -40,7 +40,7 @@
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in
   store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
@@ -56,7 +56,7 @@
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in
   store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
@@ -78,7 +78,7 @@
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in
   store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
@@ -112,7 +112,7 @@
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in
   store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index c880700..71adf09 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -11,7 +11,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; TODO: NOT AND
-define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 entry:
   %ld = load i8, i8 addrspace(1)* %in
   store i8 %ld, i8 addrspace(1)* %out
@@ -23,7 +23,7 @@
 ; GCN-HSA: flat_load_ushort v
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
   store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; GCN-HSA: flat_load_dword v
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
@@ -47,7 +47,7 @@
 ; GCN-HSA: flat_load_dword v
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
   store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
@@ -59,7 +59,7 @@
 ; GCN-HSA: flat_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in
   store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
@@ -72,7 +72,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in
   store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
@@ -84,7 +84,7 @@
 ; GCN-HSA: flat_load_ubyte
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -98,7 +98,7 @@
 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %ld = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %ld to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -108,7 +108,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -120,7 +120,7 @@
 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -135,7 +135,7 @@
 ; TODO: These should use DST, but for some there are redundant MOVs
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
 ; EG-DAG: 8
-define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -152,7 +152,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -174,7 +174,7 @@
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   %ext = zext <3 x i8> %ld to <3 x i32>
@@ -207,7 +207,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   %ext = sext <3 x i8> %ld to <3 x i32>
@@ -227,7 +227,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -248,7 +248,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -273,7 +273,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -300,7 +300,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -341,7 +341,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -384,7 +384,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -456,7 +456,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = zext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -532,7 +532,7 @@
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = sext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -545,7 +545,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
   %ext = zext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -558,7 +558,7 @@
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
   %ext = sext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -576,7 +576,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -595,7 +595,7 @@
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7?
 ; EG: 31
-define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -606,7 +606,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -619,7 +619,7 @@
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7?
 ; EG: 31
-define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -629,7 +629,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -639,7 +639,7 @@
 ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -649,7 +649,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -659,7 +659,7 @@
 ; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -669,7 +669,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -679,7 +679,7 @@
 ; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -689,7 +689,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -699,7 +699,7 @@
 ; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -710,7 +710,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = zext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -721,7 +721,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = sext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -729,7 +729,7 @@
 }
 
 ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64:
-; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -737,7 +737,7 @@
 ; }
 
 ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64:
-; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -752,7 +752,7 @@
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -768,7 +768,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -778,7 +778,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -789,7 +789,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -799,7 +799,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -811,7 +811,7 @@
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -821,7 +821,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -835,7 +835,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -845,7 +845,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -863,7 +863,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -873,7 +873,7 @@
 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -899,7 +899,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -910,7 +910,7 @@
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = zext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -953,7 +953,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = sext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -961,7 +961,7 @@
 }
 
 ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16:
-; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
@@ -969,7 +969,7 @@
 ; }
 
 ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16:
-; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
index 77b5e3c..09d7145 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -7,7 +7,7 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load float, float addrspace(3)* %in
   store float %tmp0, float addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in
   store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -38,7 +38,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in
   store <3 x float> %tmp0, <3 x float> addrspace(3)* %out
@@ -52,7 +52,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in
   store <4 x float> %tmp0, <4 x float> addrspace(3)* %out
@@ -71,7 +71,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in
   store <8 x float> %tmp0, <8 x float> addrspace(3)* %out
@@ -100,7 +100,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in
   store <16 x float> %tmp0, <16 x float> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
index 27d39b7..9ad6c08 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -9,7 +9,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
   %ld = load double, double addrspace(3)* %in
   store double %ld, double addrspace(3)* %out
   ret void
@@ -22,7 +22,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x double>, <2 x double> addrspace(3)* %in
   store <2 x double> %ld, <2 x double> addrspace(3)* %out
@@ -39,7 +39,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x double>, <3 x double> addrspace(3)* %in
   store <3 x double> %ld, <3 x double> addrspace(3)* %out
@@ -59,7 +59,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x double>, <4 x double> addrspace(3)* %in
   store <4 x double> %ld, <4 x double> addrspace(3)* %out
@@ -88,7 +88,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x double>, <8 x double> addrspace(3)* %in
   store <8 x double> %ld, <8 x double> addrspace(3)* %out
@@ -144,7 +144,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x double>, <16 x double> addrspace(3)* %in
   store <16 x double> %ld, <16 x double> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
index 2eed991..e8f134b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -10,56 +10,56 @@
 ; EG: LDS_UBYTE_READ_RET
 ; EG: AND_INT
 ; EG: LDS_BYTE_WRITE
-define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %load = load i1, i1 addrspace(3)* %in
   store i1 %load, i1 addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v2i1:
-define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   store <2 x i1> %load, <2 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v3i1:
-define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   store <3 x i1> %load, <3 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v4i1:
-define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   store <4 x i1> %load, <4 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v8i1:
-define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   store <8 x i1> %load, <8 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v16i1:
-define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   store <16 x i1> %load, <16 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v32i1:
-define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   store <32 x i1> %load, <32 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v64i1:
-define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   store <64 x i1> %load, <64 x i1> addrspace(3)* %out
   ret void
@@ -68,7 +68,7 @@
 ; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
 ; GCN: ds_read_u8
 ; GCN: ds_write_b32
-define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = zext i1 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -82,7 +82,7 @@
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = sext i1 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -90,7 +90,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
-define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = zext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -98,7 +98,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
-define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = sext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -106,7 +106,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
-define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -114,7 +114,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
-define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = sext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -122,7 +122,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
-define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = zext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
@@ -130,7 +130,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
-define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = sext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
@@ -138,7 +138,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
-define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -146,7 +146,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
-define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = sext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -154,7 +154,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
-define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -162,7 +162,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
-define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = sext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -170,7 +170,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
-define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -178,7 +178,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
-define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = sext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -186,7 +186,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
-define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = zext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -194,7 +194,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
-define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = sext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -202,7 +202,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
-define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = zext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -210,7 +210,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
-define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = sext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -221,7 +221,7 @@
 ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN: ds_write_b64
-define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = zext i1 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -233,7 +233,7 @@
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: ds_write_b64
-define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = sext i1 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -241,7 +241,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
-define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = zext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -249,7 +249,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
-define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = sext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -257,7 +257,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
-define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -265,7 +265,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
-define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -273,7 +273,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
-define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = zext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
@@ -281,7 +281,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
-define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = sext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
@@ -289,7 +289,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
-define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -297,7 +297,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
-define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -305,7 +305,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
-define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -313,7 +313,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
-define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -321,7 +321,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
-define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -329,7 +329,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
-define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -337,7 +337,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
-define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = zext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -345,7 +345,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
-define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = sext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -353,7 +353,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
-define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = zext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -361,7 +361,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
-define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = sext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index d4e86de..bbbb34e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -10,7 +10,7 @@
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
 entry:
   %ld = load i16, i16 addrspace(3)* %in
   store i16 %ld, i16 addrspace(3)* %out
@@ -25,7 +25,7 @@
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in
   store <2 x i16> %ld, <2 x i16> addrspace(3)* %out
@@ -39,7 +39,7 @@
 
 ; EG-DAG: LDS_USHORT_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
   store <3 x i16> %ld, <3 x i16> addrspace(3)* %out
@@ -51,7 +51,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in
   store <4 x i16> %ld, <4 x i16> addrspace(3)* %out
@@ -65,7 +65,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in
   store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
@@ -86,7 +86,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in
   store <16 x i16> %ld, <16 x i16> addrspace(3)* %out
@@ -102,7 +102,7 @@
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -121,7 +121,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
 ; EG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -136,7 +136,7 @@
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -153,7 +153,7 @@
 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
 ; EG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -166,7 +166,7 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -181,7 +181,7 @@
 ; EG: LDS_READ_RET
 ; EG: BFE_INT
 ; EG: BFE_INT
-define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -194,7 +194,7 @@
 ; GCN-DAG: ds_write_b64
 
 ; EG: LDS_READ_RET
-define void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -211,7 +211,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -226,7 +226,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -244,7 +244,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -258,7 +258,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -280,7 +280,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -304,7 +304,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -340,7 +340,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -369,7 +369,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -406,7 +406,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -471,7 +471,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -512,7 +512,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -531,7 +531,7 @@
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG-DAG: LDS_WRITE
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -558,7 +558,7 @@
 ; EG-DAG: LDS_WRITE
 ; EG-DAG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -573,7 +573,7 @@
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG-DAG: LDS_WRITE
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -590,7 +590,7 @@
 ; EG-DAG: LDS_WRITE
 ; EG-DAG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -600,7 +600,7 @@
 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -612,7 +612,7 @@
 ; EG: LDS_READ_RET
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
-define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -623,7 +623,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -638,7 +638,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -651,7 +651,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -672,7 +672,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -689,7 +689,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -722,7 +722,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -747,7 +747,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -804,7 +804,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -812,7 +812,7 @@
 }
 
 ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
-; define void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -820,7 +820,7 @@
 ; }
 
 ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
-; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
index 280f965..8605541 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -9,7 +9,7 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(3)* %in
   store i32 %ld, i32 addrspace(3)* %out
@@ -18,7 +18,7 @@
 
 ; FUNC-LABEL: {{^}}local_load_v2i32:
 ; GCN: ds_read_b64
-define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
@@ -28,7 +28,7 @@
 ; FUNC-LABEL: {{^}}local_load_v3i32:
 ; GCN-DAG: ds_read_b64
 ; GCN-DAG: ds_read_b32
-define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
@@ -38,7 +38,7 @@
 ; FUNC-LABEL: {{^}}local_load_v4i32:
 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 
-define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
@@ -48,7 +48,7 @@
 ; FUNC-LABEL: {{^}}local_load_v8i32:
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
@@ -64,7 +64,7 @@
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
@@ -72,7 +72,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
-define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
   %ld = load i32, i32 addrspace(3)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -80,7 +80,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
-define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
   %ld = load i32, i32 addrspace(3)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -88,7 +88,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
-define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -96,7 +96,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
-define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -104,7 +104,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
-define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -112,7 +112,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
-define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -120,7 +120,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
-define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -128,7 +128,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
-define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -136,7 +136,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
-define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -144,7 +144,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
-define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -152,7 +152,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
-define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -160,7 +160,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64
-define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -168,7 +168,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
-define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -176,7 +176,7 @@
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
-define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
index 180807d..0c719a9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -9,7 +9,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
   %ld = load i64, i64 addrspace(3)* %in
   store i64 %ld, i64 addrspace(3)* %out
   ret void
@@ -22,7 +22,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
   store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
@@ -39,7 +39,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in
   store <3 x i64> %ld, <3 x i64> addrspace(3)* %out
@@ -59,7 +59,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in
   store <4 x i64> %ld, <4 x i64> addrspace(3)* %out
@@ -88,7 +88,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in
   store <8 x i64> %ld, <8 x i64> addrspace(3)* %out
@@ -144,7 +144,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in
   store <16 x i64> %ld, <16 x i64> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
index 9ffc742..731996e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -9,7 +9,7 @@
 ; GCN: ds_read_u8
 
 ; EG: LDS_UBYTE_READ_RET
-define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
 entry:
   %ld = load i8, i8 addrspace(3)* %in
   store i8 %ld, i8 addrspace(3)* %out
@@ -22,7 +22,7 @@
 ; GCN: ds_read_u16
 
 ; EG: LDS_USHORT_READ_RET
-define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
   store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
@@ -33,7 +33,7 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
   store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
@@ -44,7 +44,7 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
   store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
@@ -56,7 +56,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
   store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
@@ -71,7 +71,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
   store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
@@ -84,7 +84,7 @@
 ; GCN: ds_read_u8
 
 ; EG: LDS_UBYTE_READ_RET
-define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -98,7 +98,7 @@
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %ld = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %ld to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -108,7 +108,7 @@
 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
 
 ; EG: LDS_UBYTE_READ_RET
-define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = zext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -119,7 +119,7 @@
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = sext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -130,7 +130,7 @@
 ; GCN: ds_read_u16
 
 ; EG: LDS_USHORT_READ_RET
-define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -156,7 +156,7 @@
 ; EG: LDS_USHORT_READ_RET
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = sext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -172,7 +172,7 @@
 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
   %ext = zext <3 x i8> %ld to <3 x i32>
@@ -197,7 +197,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
   %ext = sext <3 x i8> %ld to <3 x i32>
@@ -214,7 +214,7 @@
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = zext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -231,7 +231,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = sext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -248,7 +248,7 @@
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -267,7 +267,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -292,7 +292,7 @@
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -321,7 +321,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -338,7 +338,7 @@
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = zext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -355,7 +355,7 @@
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = sext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -380,7 +380,7 @@
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
   %ext = zext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -405,7 +405,7 @@
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
   %ext = sext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -420,7 +420,7 @@
 ; EG: LDS_UBYTE_READ_RET
 ; EG: MOV {{.*}}, literal
 ; EG: 0.0
-define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -437,7 +437,7 @@
 ; EG: ASHR
 ; TODO: why not 7?
 ; EG: 31
-define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -450,7 +450,7 @@
 ; EG: MOV {{.*}}, literal
 ; TODO: merge?
 ; EG: 0.0
-define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = zext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -463,7 +463,7 @@
 ; EG: ASHR
 ; TODO: why not 7?
 ; EG: 31
-define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -473,7 +473,7 @@
 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
 
 ; EG: LDS_USHORT_READ_RET
-define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -485,7 +485,7 @@
 ; EG: LDS_USHORT_READ_RET
 ; EG: BFE_INT
 ; EG: BFE_INT
-define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -495,7 +495,7 @@
 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -505,7 +505,7 @@
 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
 
 ; EG: LDS_READ_RET
-define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -516,7 +516,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -536,7 +536,7 @@
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -549,7 +549,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -562,7 +562,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -579,7 +579,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = zext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -596,7 +596,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = sext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -604,7 +604,7 @@
 }
 
 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
-; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -612,7 +612,7 @@
 ; }
 
 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
-; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -625,7 +625,7 @@
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_SHORT_WRITE
-define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %a to i16
   store i16 %ext, i16 addrspace(3)* %out
@@ -639,7 +639,7 @@
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
 ; EG: LDS_SHORT_WRITE
-define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %a to i16
   store i16 %ext, i16 addrspace(3)* %out
@@ -650,7 +650,7 @@
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_SHORT_WRITE
-define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = zext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
@@ -662,7 +662,7 @@
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
 ; EG: LDS_SHORT_WRITE
-define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = sext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
@@ -673,7 +673,7 @@
 
 ; EG: LDS_USHORT_READ_RET
 ; EG: LDS_WRITE
-define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = zext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
@@ -686,7 +686,7 @@
 ; EG: BFE_INT
 ; EG: BFE_INT
 ; EG: LDS_WRITE
-define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = sext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
@@ -698,7 +698,7 @@
 ; EG: LDS_READ_RET
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = zext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
@@ -715,7 +715,7 @@
 ; EG-DAG: BFE_INT
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = sext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
@@ -730,7 +730,7 @@
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = zext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
@@ -754,7 +754,7 @@
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = sext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
@@ -775,7 +775,7 @@
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = zext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
@@ -813,7 +813,7 @@
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = sext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
@@ -846,7 +846,7 @@
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = zext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
@@ -908,7 +908,7 @@
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = sext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
@@ -916,7 +916,7 @@
 }
 
 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
-; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
@@ -924,7 +924,7 @@
 ; }
 
 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
-; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
index bc5e494..d6162c3 100644
--- a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -8,7 +8,7 @@
 ; SI: {{flat|buffer}}_load_ubyte
 ; SI: {{flat|buffer}}_load_ushort
 ; SI: {{flat|buffer}}_store_dword
-define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
   %1 = load i24, i24 addrspace(1)* %in
   %2 = zext i24 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
@@ -21,7 +21,7 @@
 
 ; CI-HSA: flat_load_dword [[VAL:v[0-9]+]]
 ; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
   %1 = load i25, i25 addrspace(1)* %in
   %2 = zext i25 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll
index a7cee43..bf4a932 100644
--- a/llvm/test/CodeGen/AMDGPU/local-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-64.ll
@@ -5,7 +5,7 @@
 ; BOTH-LABEL: {{^}}local_i32_load
 ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
 ; BOTH: buffer_store_dword [[REG]],
-define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
   %val = load i32, i32 addrspace(3)* %gep, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -15,7 +15,7 @@
 ; BOTH-LABEL: {{^}}local_i32_load_0_offset
 ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
 ; BOTH: buffer_store_dword [[REG]],
-define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %val = load i32, i32 addrspace(3)* %in, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
@@ -25,7 +25,7 @@
 ; BOTH-NOT: ADD
 ; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
 ; BOTH: buffer_store_byte [[REG]],
-define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
   %val = load i8, i8 addrspace(3)* %gep, align 4
   store i8 %val, i8 addrspace(1)* %out, align 4
@@ -40,7 +40,7 @@
 ; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
 ; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
 ; BOTH: buffer_store_byte [[REG]],
-define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
   %val = load i8, i8 addrspace(3)* %gep, align 4
   store i8 %val, i8 addrspace(1)* %out, align 4
@@ -51,7 +51,7 @@
 ; BOTH-NOT: ADD
 ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
   %val = load i64, i64 addrspace(3)* %gep, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -61,7 +61,7 @@
 ; BOTH-LABEL: {{^}}local_i64_load_0_offset
 ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %val = load i64, i64 addrspace(3)* %in, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -71,7 +71,7 @@
 ; BOTH-NOT: ADD
 ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %gep = getelementptr double, double addrspace(3)* %in, i32 7
   %val = load double, double addrspace(3)* %gep, align 8
   store double %val, double addrspace(1)* %out, align 8
@@ -81,7 +81,7 @@
 ; BOTH-LABEL: {{^}}local_f64_load_0_offset
 ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %val = load double, double addrspace(3)* %in, align 8
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -90,7 +90,7 @@
 ; BOTH-LABEL: {{^}}local_i64_store:
 ; BOTH-NOT: ADD
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
   store i64 5678, i64 addrspace(3)* %gep, align 8
   ret void
@@ -99,7 +99,7 @@
 ; BOTH-LABEL: {{^}}local_i64_store_0_offset:
 ; BOTH-NOT: ADD
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
   store i64 1234, i64 addrspace(3)* %out, align 8
   ret void
 }
@@ -107,7 +107,7 @@
 ; BOTH-LABEL: {{^}}local_f64_store:
 ; BOTH-NOT: ADD
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define void @local_f64_store(double addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
   %gep = getelementptr double, double addrspace(3)* %out, i32 7
   store double 16.0, double addrspace(3)* %gep, align 8
   ret void
@@ -115,7 +115,7 @@
 
 ; BOTH-LABEL: {{^}}local_f64_store_0_offset
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
   store double 20.0, double addrspace(3)* %out, align 8
   ret void
 }
@@ -124,7 +124,7 @@
 ; BOTH-NOT: ADD
 ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
 ; BOTH: s_endpgm
-define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
   store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
   ret void
@@ -134,7 +134,7 @@
 ; BOTH-NOT: ADD
 ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
 ; BOTH: s_endpgm
-define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
   ret void
 }
@@ -144,7 +144,7 @@
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
 ; BOTH: s_endpgm
-define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
   store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
   ret void
@@ -155,7 +155,7 @@
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
 ; BOTH: s_endpgm
-define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics.ll b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
index 6714a28..de029d9 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
@@ -11,7 +11,7 @@
 ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@
 ; EG: LDS_WRXCHG_RET *
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -37,7 +37,7 @@
 ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -47,7 +47,7 @@
 ; EG: LDS_ADD_RET *
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -59,7 +59,7 @@
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -73,7 +73,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -84,7 +84,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -96,7 +96,7 @@
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -109,7 +109,7 @@
 ; EG: LDS_SUB_RET *
 ; GCN: ds_sub_rtn_u32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -119,7 +119,7 @@
 ; EG: LDS_SUB_RET *
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -131,7 +131,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -142,7 +142,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@
 ; EG: LDS_AND_RET *
 ; GCN: ds_and_rtn_b32
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -163,7 +163,7 @@
 ; EG: LDS_AND_RET *
 ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -174,7 +174,7 @@
 ; EG: LDS_OR_RET *
 ; GCN: ds_or_rtn_b32
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -184,7 +184,7 @@
 ; EG: LDS_OR_RET *
 ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -195,7 +195,7 @@
 ; EG: LDS_XOR_RET *
 ; GCN: ds_xor_rtn_b32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -205,7 +205,7 @@
 ; EG: LDS_XOR_RET *
 ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -214,7 +214,7 @@
 
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:
-; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
 ;   store i32 %result, i32 addrspace(1)* %out, align 4
 ;   ret void
@@ -224,7 +224,7 @@
 ; EG: LDS_MIN_INT_RET *
 ; GCN: ds_min_rtn_i32
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -234,7 +234,7 @@
 ; EG: LDS_MIN_INT_RET *
 ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -245,7 +245,7 @@
 ; EG: LDS_MAX_INT_RET *
 ; GCN: ds_max_rtn_i32
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -255,7 +255,7 @@
 ; EG: LDS_MAX_INT_RET *
 ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -266,7 +266,7 @@
 ; EG: LDS_MIN_UINT_RET *
 ; GCN: ds_min_rtn_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -276,7 +276,7 @@
 ; EG: LDS_MIN_UINT_RET *
 ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -287,7 +287,7 @@
 ; EG: LDS_MAX_UINT_RET *
 ; GCN: ds_max_rtn_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -297,7 +297,7 @@
 ; EG: LDS_MAX_UINT_RET *
 ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -310,7 +310,7 @@
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -318,7 +318,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -330,7 +330,7 @@
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_add_u32 [[VPTR]], [[DATA]]
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -338,7 +338,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
 ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -348,7 +348,7 @@
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -360,7 +360,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
@@ -369,7 +369,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   ret void
@@ -379,7 +379,7 @@
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -390,7 +390,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
 ; GCN: ds_sub_u32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -398,7 +398,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
 ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -408,7 +408,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
@@ -417,7 +417,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
   ret void
@@ -426,7 +426,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
 ; GCN: ds_and_b32
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -434,7 +434,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
 ; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -443,7 +443,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
 ; GCN: ds_or_b32
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -451,7 +451,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
 ; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -460,7 +460,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
 ; GCN: ds_xor_b32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -468,7 +468,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
 ; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -476,7 +476,7 @@
 
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
 ;   ret void
 ; }
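
Editor's note on the FIXME above: the missing nand would have to be expanded into a compare-exchange retry loop computing ~(old & val), which is the usual AtomicExpand-style lowering for unsupported read-modify-write operations. A minimal hand-written sketch follows (hypothetical function name, not the literal output of any pass); the same shape would apply to the commented-out i64 variants in local-atomics64.ll further down.

define amdgpu_kernel void @lds_atomic_nand_expand_i32(i32 addrspace(3)* %ptr) nounwind {
entry:
  %init = load i32, i32 addrspace(3)* %ptr
  br label %loop

loop:                                      ; cmpxchg retry loop
  %old = phi i32 [ %init, %entry ], [ %loaded, %loop ]
  %and = and i32 %old, 4
  %nand = xor i32 %and, -1                 ; nand semantics: ~(old & 4)
  %pair = cmpxchg i32 addrspace(3)* %ptr, i32 %old, i32 %nand seq_cst seq_cst
  %loaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %exit, label %loop

exit:
  ret void
}
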
@@ -484,7 +484,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
 ; GCN: ds_min_i32
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -492,7 +492,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
 ; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -501,7 +501,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
 ; GCN: ds_max_i32
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -509,7 +509,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
 ; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -518,7 +518,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
 ; GCN: ds_min_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -526,7 +526,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
 ; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -535,7 +535,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
 ; GCN: ds_max_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -543,7 +543,7 @@
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
 ; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
index c889178..6572a7b 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -4,7 +4,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
 ; GCN: ds_wrxchg_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -13,7 +13,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -23,7 +23,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
 ; GCN: ds_add_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -38,7 +38,7 @@
 ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -51,7 +51,7 @@
 ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -60,7 +60,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset:
 ; GCN: ds_add_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -70,7 +70,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64:
 ; GCN: ds_sub_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -79,7 +79,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
 ; GCN: ds_sub_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -92,7 +92,7 @@
 ; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -101,7 +101,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset:
 ; GCN: ds_sub_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -111,7 +111,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64:
 ; GCN: ds_and_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -120,7 +120,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
 ; GCN: ds_and_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -130,7 +130,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
 ; GCN: ds_or_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -139,7 +139,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
 ; GCN: ds_or_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -149,7 +149,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
 ; GCN: ds_xor_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -158,7 +158,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
 ; GCN: ds_xor_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -167,7 +167,7 @@
 
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
@@ -176,7 +176,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64:
 ; GCN: ds_min_rtn_i64
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -185,7 +185,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
 ; GCN: ds_min_rtn_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -195,7 +195,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64:
 ; GCN: ds_max_rtn_i64
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -204,7 +204,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
 ; GCN: ds_max_rtn_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -214,7 +214,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64:
 ; GCN: ds_min_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -223,7 +223,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
 ; GCN: ds_min_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -233,7 +233,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64:
 ; GCN: ds_max_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -242,7 +242,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
 ; GCN: ds_max_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -252,7 +252,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64:
 ; GCN: ds_wrxchg_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -260,7 +260,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -269,7 +269,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_add_noret_i64:
 ; GCN: ds_add_u64
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -282,7 +282,7 @@
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
   ret void
@@ -293,7 +293,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
 ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
@@ -301,7 +301,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset:
 ; GCN: ds_add_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
   ret void
@@ -310,7 +310,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64:
 ; GCN: ds_sub_u64
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -318,7 +318,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
 ; GCN: ds_sub_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -329,7 +329,7 @@
 ; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
 ; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
@@ -337,7 +337,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset:
 ; GCN: ds_sub_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
   ret void
@@ -346,7 +346,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64:
 ; GCN: ds_and_b64
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -354,7 +354,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
 ; GCN: ds_and_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -363,7 +363,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64:
 ; GCN: ds_or_b64
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -371,7 +371,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
 ; GCN: ds_or_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -380,7 +380,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
 ; GCN: ds_xor_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -388,7 +388,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
 ; GCN: ds_xor_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -396,7 +396,7 @@
 
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:
-; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
 ;   ret void
 ; }
@@ -404,7 +404,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64:
 ; GCN: ds_min_i64
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -412,7 +412,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
 ; GCN: ds_min_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -421,7 +421,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64:
 ; GCN: ds_max_i64
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -429,7 +429,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
 ; GCN: ds_max_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -438,7 +438,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64:
 ; GCN: ds_min_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -446,7 +446,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
 ; GCN: ds_min_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -455,7 +455,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64:
 ; GCN: ds_max_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -463,7 +463,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
 ; GCN: ds_max_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index a57e4f5..4ce9208 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -17,7 +17,7 @@
 ; GCN: s_barrier
 
 ; GCN: ds_read_b32 {{v[0-9]+}},
-define void @local_memory(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
 entry:
   %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
@@ -61,7 +61,7 @@
 
 ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
-define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x()
   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll
index 1a11332..6124237 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll
@@ -14,7 +14,7 @@
 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
 
 ; R600: LDS_READ_RET
-define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
 entry:
   %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
   %tmp1 = load i32, i32 addrspace(3)* %tmp0
@@ -30,7 +30,7 @@
 ; R600: LDS_READ_RET
 ; GCN-DAG: ds_read_b32
 ; GCN-DAG: ds_read2_b32
-define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
   %scalar = load i32, i32 addrspace(3)* %in
   %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
   %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll
index 9841b88..c8f4e4c 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll
@@ -15,7 +15,7 @@
 ; EG-NEXT: ALU clause
 
 ; EG: LDS_READ_RET
-define void @local_memory(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
 entry:
   %y.i = call i32 @llvm.r600.read.tidig.x() #1
   %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
@@ -57,7 +57,7 @@
 ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 
-define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
   %x.i = call i32 @llvm.r600.read.tidig.x() #1
   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
diff --git a/llvm/test/CodeGen/AMDGPU/loop-address.ll b/llvm/test/CodeGen/AMDGPU/loop-address.ll
index f60d574..e25d4f4 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-address.ll
@@ -5,7 +5,7 @@
 ;CHECK: LOOP_BREAK @10
 ;CHECK: POP @10
 
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
+define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
   br i1 %cmp5, label %for.body, label %for.end
diff --git a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
index 5fd9806..23ddd64 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
@@ -9,7 +9,7 @@
 ; FUNC: @no_memcpy
 ; R600-NOT: {{^}}llvm.memcpy
 ; SI-NOT: {{^}}llvm.memcpy
-define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
+define amdgpu_kernel void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
 entry:
   %dest = alloca i8, i32 32
   br label %for.body
@@ -33,7 +33,7 @@
 ; R600-NOT: {{^}}memset_pattern16:
 ; SI-NOT: {{^}}llvm.memset
 ; SI-NOT: {{^}}memset_pattern16:
-define void @no_memset(i32 %size) {
+define amdgpu_kernel void @no_memset(i32 %size) {
 entry:
   %dest = alloca i8, i32 32
   br label %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index 97212f5..4924721 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -43,7 +43,7 @@
 ; GCN: ; BB#4: ; %bb9
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_endpgm
-define void @break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
@@ -87,7 +87,7 @@
 ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
-define void @undef_phi_cond_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
@@ -140,7 +140,7 @@
 ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
-define void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
@@ -190,7 +190,7 @@
 ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
-define void @true_phi_cond_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
@@ -240,7 +240,7 @@
 ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
-define void @false_phi_cond_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
@@ -295,7 +295,7 @@
 ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %1)
 ; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef
 ; OPT-NEXT: ret void
-define void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index d5cd10e..74564f3 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -9,7 +9,7 @@
 ; Test the upper bound for sizes to leave as an intrinsic call (sizes above this are expanded)
 ; OPT-LABEL: @max_size_small_static_memcpy_caller0(
 ; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
-define void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
   ret void
 }
@@ -21,14 +21,14 @@
 ; OPT-NEXT: load i8
 ; OPT: getelementptr
 ; OPT-NEXT: store i8
-define void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
   ret void
 }
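
Editor's note: the OPT checks above expect the 1025-byte memcpy, being past the 1024-byte threshold, to be rewritten into an element-wise copy loop rather than left as a call. A rough sketch of the loop shape being checked for (illustrative only; the pass's actual output may differ in value names and block structure):

define amdgpu_kernel void @memcpy_lowered_sketch(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
entry:
  br label %loop

loop:
  ; copy one byte per iteration until all 1025 bytes are done
  %idx = phi i64 [ 0, %entry ], [ %idx.next, %loop ]
  %src.gep = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %idx
  %byte = load i8, i8 addrspace(1)* %src.gep
  %dst.gep = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %idx
  store i8 %byte, i8 addrspace(1)* %dst.gep
  %idx.next = add i64 %idx, 1
  %done = icmp eq i64 %idx.next, 1025
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
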
 
 ; OPT-LABEL: @max_size_small_static_memmove_caller0(
 ; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
-define void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
   ret void
 }
@@ -39,14 +39,14 @@
 ; OPT-NEXT: load i8
 ; OPT: getelementptr
 ; OPT-NEXT: store i8
-define void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
   ret void
 }
 
 ; OPT-LABEL: @max_size_small_static_memset_caller0(
 ; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
-define void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
   call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
   ret void
 }
@@ -55,7 +55,7 @@
 ; OPT-NOT: call
 ; OPT: getelementptr
 ; OPT: store i8
-define void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
   call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false)
   ret void
 }
@@ -63,7 +63,7 @@
 ; OPT-LABEL: @variable_memcpy_caller0(
 ; OPT-NOT: call
 ; OPT: phi
-define void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
   ret void
 }
@@ -71,7 +71,7 @@
 ; OPT-LABEL: @variable_memcpy_caller1(
 ; OPT-NOT: call
 ; OPT: phi
-define void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
   ret void
 }
@@ -82,7 +82,7 @@
 ; OPT-NOT: call
 ; OPT: phi
 ; OPT-NOT: call
-define void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
+define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false)
   ret void
@@ -94,7 +94,7 @@
 ; OPT: load i8, i8 addrspace(3)*
 ; OPT: getelementptr inbounds i8, i8 addrspace(1)*
 ; OPT: store i8
-define void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
+define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false)
   ret void
 }
@@ -107,7 +107,7 @@
 ; OPT: store i8
 
 ; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
-define void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
+define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
index e1fad13..4e0ecc0 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range:
 ; CHECK-NOT: v0
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
-define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
   %and = and i32 %id, 1023
@@ -16,7 +16,7 @@
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
 ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
-define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
   %and = and i32 %id, 511
@@ -28,7 +28,7 @@
 ; CHECK-NOT: v0
 ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
-define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
   %and = and i32 %id, 255
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index ee07b0e..7a05d65 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -12,7 +12,7 @@
 ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
   %result = lshr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -55,7 +55,7 @@
 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -70,7 +70,7 @@
 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -84,7 +84,7 @@
 ; GCN-LABEL: {{^}}lshr_imm_v_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
-define void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -98,7 +98,7 @@
 ; GCN-LABEL: {{^}}lshr_v_imm_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
-define void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -115,7 +115,7 @@
 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -133,7 +133,7 @@
 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index 9caba32..b855fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -31,7 +31,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; SI-STD: buffer_store_dword [[C]]
-define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -70,7 +70,7 @@
 ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -107,7 +107,7 @@
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; SI-STD: buffer_store_dword [[C]]
-define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -137,7 +137,7 @@
 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -174,7 +174,7 @@
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -209,7 +209,7 @@
 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -245,7 +245,7 @@
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -281,7 +281,7 @@
 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -319,7 +319,7 @@
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -362,7 +362,7 @@
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -404,7 +404,7 @@
 ; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -447,7 +447,7 @@
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -497,7 +497,7 @@
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -548,7 +548,7 @@
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 9183ae0..1e78c4e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -11,7 +11,7 @@
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
 ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
 ; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
-define void @get_global_id_0(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
   %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
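
Editor's note on the checks above: the test computes, roughly,

  global_id.x = workgroup_id.x * workgroup_size.x + workitem_id.x

and the point is that this multiply-add can be selected as v_mad_u32_u24 once both multiply operands are known to fit in 24 bits; the s_and_b32 with 0xffff in the checks supplies that guarantee by extracting the 16-bit workgroup size from the dispatch packet.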
diff --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
index f149ea0..af0159a 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
@@ -11,7 +11,7 @@
 ; CM: MULADD_INT24
 ; SI-NOT: and
 ; SI: v_mad_i32_i24
-define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = ashr i32 %0, 8
diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index 9fde950..2c4f7d3 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -11,7 +11,7 @@
 ; SI: v_mad_u32_u24
 ; VI: v_mad_u32_u24
 
-define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = lshr i32 %0, 8
@@ -32,7 +32,7 @@
 ; FIXME: Should be using scalar instructions here.
 ; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
-define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 entry:
   %0 = mul i16 %a, %b
   %1 = add i16 %0, %c
@@ -49,7 +49,7 @@
 ; EG: 8
 ; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
-define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
 entry:
   %0 = mul i8 %a, %b
   %1 = add i8 %0, %c
@@ -68,7 +68,7 @@
 ; FUNC-LABEL: {{^}}i24_i32_i32_mad:
 ; EG: CNDE_INT
 ; SI: v_cndmask
-define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   %0 = ashr i32 %a, 8
   %1 = icmp ne i32 %c, 0
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 6722aa79..eb4066a 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -10,7 +10,7 @@
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -37,7 +37,7 @@
 ; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
 ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
 ; GCN: s_endpgm
-define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@
 ; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
-define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
+define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -84,7 +84,7 @@
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
-define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -106,7 +106,7 @@
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN-NOT: v_madak_f32
 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
+define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -125,7 +125,7 @@
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN-NOT: v_madak_f32
 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -141,7 +141,7 @@
 ; GCN-LABEL: {{^}}s_s_madak_f32:
 ; GCN-NOT: v_madak_f32
 ; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %mul = fmul float %a, %b
   %madak = fadd float %mul, 10.0
   store float %madak, float addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
 ; GCN: s_endpgm
-define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -175,7 +175,7 @@
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
 ; GCN: s_endpgm
-define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -201,7 +201,7 @@
 ; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
+define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
 bb:
   %tmp = icmp eq i32 %arg1, 0
   br i1 %tmp, label %bb3, label %bb4
diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll
index 27fbf58..6e70e95 100644
--- a/llvm/test/CodeGen/AMDGPU/madmk.ll
+++ b/llvm/test/CodeGen/AMDGPU/madmk.ll
@@ -12,7 +12,7 @@
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
-define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -35,7 +35,7 @@
 ; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
 ; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
 ; GCN: s_endpgm
-define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
-define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -83,7 +83,7 @@
 ; GCN-NOT: v_madmk_f32
 ; GCN: v_mac_f32_e32
 ; GCN: s_endpgm
-define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 
@@ -97,7 +97,7 @@
 ; GCN-NOT: v_madmk_f32
 ; GCN: v_mad_f32
 ; GCN: s_endpgm
-define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
+define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -113,7 +113,7 @@
 ; GCN-NOT: v_madmk_f32
 ; GCN: v_mac_f32_e32
 ; GCN: s_endpgm
-define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
+define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -130,7 +130,7 @@
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]]
-define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -151,7 +151,7 @@
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
-define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -172,7 +172,7 @@
 ; GCN: buffer_load_dword [[A:v[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
-define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -189,7 +189,7 @@
 ; SI: s_xor_b64
 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
 ; SI: s_or_b64
-define void @kill_madmk_verifier_error() nounwind {
+define amdgpu_kernel void @kill_madmk_verifier_error() nounwind {
 bb:
   br label %bb2
 
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index d442c87..8fcf85b 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -4,7 +4,7 @@
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_imax_sge_i16:
 ; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -23,7 +23,7 @@
 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
@@ -45,7 +45,7 @@
 
 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid
@@ -67,7 +67,7 @@
 
 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid
@@ -83,7 +83,7 @@
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_imax_sgt_i16:
 ; VIPLUS: v_max_i16_e32
-define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -99,7 +99,7 @@
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_umax_uge_i16:
 ; VIPLUS: v_max_u16_e32
-define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -115,7 +115,7 @@
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_umax_ugt_i16:
 ; VIPLUS: v_max_u16_e32
-define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -133,7 +133,7 @@
 ; VI: v_max_u16_e32
 
 ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index 5fa307b..ffcdac0 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -6,7 +6,7 @@
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp sge i32 %a, %b
@@ -26,7 +26,7 @@
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4
   %cmp = icmp sge <4 x i32> %a, %b
@@ -39,7 +39,7 @@
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -50,7 +50,7 @@
 ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
   %cmp = icmp sge i32 %a, 9
   %val = select i1 %cmp, i32 %a, i32 9
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -63,7 +63,7 @@
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
   %a = load i8, i8 addrspace(1)* %aptr, align 1
   %b = load i8, i8 addrspace(1)* %bptr, align 1
   %cmp = icmp sge i8 %a, %b
@@ -76,7 +76,7 @@
 ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
   %cmp = icmp sgt i32 %a, 9
   %val = select i1 %cmp, i32 %a, i32 9
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
   %cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
   store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
@@ -100,7 +100,7 @@
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp sgt i32 %a, %b
@@ -113,7 +113,7 @@
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sgt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -124,7 +124,7 @@
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp uge i32 %a, %b
@@ -137,7 +137,7 @@
 ; SI: s_max_u32
 
 ; EG: MAX_UINT
-define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp uge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -155,7 +155,7 @@
 ; EG: MAX_UINT
 ; EG: MAX_UINT
 ; EG-NOT: MAX_UINT
-define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
   %cmp = icmp uge <3 x i32> %a, %b
   %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
   store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4
@@ -168,7 +168,7 @@
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
   %a = load i8, i8 addrspace(1)* %aptr, align 1
   %b = load i8, i8 addrspace(1)* %bptr, align 1
   %cmp = icmp uge i8 %a, %b
@@ -181,7 +181,7 @@
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp ugt i32 %a, %b
@@ -194,7 +194,7 @@
 ; SI: s_max_u32
 
 ; EG: MAX_UINT
-define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp ugt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -207,7 +207,7 @@
 
 ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
 ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
-define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
   %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
   store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
@@ -223,7 +223,7 @@
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_UINT
-define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ugt i32 %a.ext, %b.ext
@@ -243,7 +243,7 @@
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_INT
-define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp sgt i32 %a.ext, %b.ext
@@ -262,7 +262,7 @@
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
   %cmp = icmp sge i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
   store i16 %val, i16 addrspace(1)* %out
@@ -275,7 +275,7 @@
 
 ; EG: MAX_UINT
 ; EG: MAX_UINT
-define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp ugt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -287,7 +287,7 @@
 
 ; EG: MAX_UINT
 ; EG: MAX_UINT
-define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp uge i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -299,7 +299,7 @@
 
 ; EG-DAG: MAX_UINT
 ; EG-DAG: MAX_INT
-define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp sgt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -311,7 +311,7 @@
 
 ; EG-DAG: MAX_UINT
 ; EG-DAG: MAX_INT
-define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp sge i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/max3.ll b/llvm/test/CodeGen/AMDGPU/max3.ll
index a12dba2..4bb4fd4 100644
--- a/llvm/test/CodeGen/AMDGPU/max3.ll
+++ b/llvm/test/CodeGen/AMDGPU/max3.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: @v_test_imax3_sgt_i32
 ; SI: v_max3_i32
-define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -23,7 +23,7 @@
 
 ; FUNC-LABEL: @v_test_umax3_ugt_i32
 ; SI: v_max3_u32
-define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
index 9751267..1cbd0c3 100644
--- a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -9,7 +9,7 @@
 
 
 ; ERROR: error: <unknown>:0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp
-define void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 {
+define amdgpu_kernel void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 {
 entry:
   %cmp = tail call i32 @memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i64 2)
   store volatile i32 %cmp, i32 addrspace(1)* undef
@@ -17,35 +17,35 @@
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_memchr void (i8 addrspace(1)*, i32, i64): unsupported call to function memchr
-define void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 {
+define amdgpu_kernel void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 {
   %res = call i8 addrspace(1)* @memchr(i8 addrspace(1)* %src, i32 %char, i64 %len)
   store volatile i8 addrspace(1)* %res, i8 addrspace(1)* addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strcpy void (i8*, i8*): unsupported call to function strcpy
-define void @test_strcpy(i8* %dst, i8* %src) #0 {
+define amdgpu_kernel void @test_strcpy(i8* %dst, i8* %src) #0 {
   %res = call i8* @strcpy(i8* %dst, i8* %src)
   store volatile i8* %res, i8* addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strcmp void (i8*, i8*): unsupported call to function strcmp
-define void @test_strcmp(i8* %src0, i8* %src1) #0 {
+define amdgpu_kernel void @test_strcmp(i8* %src0, i8* %src1) #0 {
   %res = call i32 @strcmp(i8* %src0, i8* %src1)
   store volatile i32 %res, i32 addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strlen void (i8*): unsupported call to function strlen
-define void @test_strlen(i8* %src) #0 {
+define amdgpu_kernel void @test_strlen(i8* %src) #0 {
   %res = call i32 @strlen(i8* %src)
   store volatile i32 %res, i32 addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strnlen void (i8*, i32): unsupported call to function strnlen
-define void @test_strnlen(i8* %src, i32 %size) #0 {
+define amdgpu_kernel void @test_strnlen(i8* %src, i32 %size) #0 {
   %res = call i32 @strnlen(i8* %src, i32 %size)
   store volatile i32 %res, i32 addrspace(1)* undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
index fd2d8ee..dfd5b97 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
@@ -13,7 +13,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -25,7 +25,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -35,7 +35,7 @@
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
 ; GCN: buffer_store_dword v
-define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -45,7 +45,7 @@
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
 ; GCN: buffer_store_dword v
-define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 0, i16 addrspace(1)* %out.gep.1
@@ -57,7 +57,7 @@
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -69,7 +69,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -79,7 +79,7 @@
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
 ; GCN: buffer_store_dwordx2
-define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
   store float 1.0, float addrspace(1)* %out.gep.1.bc
@@ -91,7 +91,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
   store i32 123, i32 addrspace(1)* %out.gep.1.bc
@@ -105,7 +105,7 @@
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -119,7 +119,7 @@
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -134,7 +134,7 @@
 ; First store is out of order.
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -149,7 +149,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
 ; GCN-AA: buffer_store_dwordx4 v
 ; GCN: s_endpgm
-define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -169,7 +169,7 @@
 ; SI-DAG: buffer_store_dword
 ; SI-NOT: buffer_store_dword
 ; GCN: s_endpgm
-define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 
@@ -181,7 +181,7 @@
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 
   store i64 123, i64 addrspace(1)* %out.gep.1
@@ -192,7 +192,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
@@ -207,7 +207,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx2 [[LOAD]]
-define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -222,7 +222,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 
@@ -241,7 +241,7 @@
 ; GCN: buffer_load_dword v
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
-define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -256,7 +256,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -283,7 +283,7 @@
 ; SI-DAG: buffer_store_dword v
 ; SI-DAG: buffer_store_dwordx2 v
 ; GCN: s_endpgm
-define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -302,7 +302,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -325,7 +325,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
@@ -351,7 +351,7 @@
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: s_barrier
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -388,7 +388,7 @@
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
-define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -416,7 +416,7 @@
 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 ; GCN: buffer_store_dword [[LOAD]]
 ; GCN: s_endpgm
-define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -446,7 +446,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -470,7 +470,7 @@
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
 ; GCN: s_endpgm
-define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -492,7 +492,7 @@
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: s_endpgm
-define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 
   store i8 123, i8 addrspace(3)* %out.gep.1
@@ -504,7 +504,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
-define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1
@@ -522,7 +522,7 @@
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
 
 ; GCN: s_endpgm
-define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
@@ -540,7 +540,7 @@
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 ; GCN: buffer_store_dword v[[HI]]
-define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
   store i32 9, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 12, i32 addrspace(1)* %idx1, align 4
@@ -556,7 +556,7 @@
 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx2
-define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
   store i32 13, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 15, i32 addrspace(1)* %idx1, align 4
@@ -575,7 +575,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx2
 ; GCN: buffer_store_dword v
-define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -596,7 +596,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -630,7 +630,7 @@
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
   ret void
@@ -646,7 +646,7 @@
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
   ret void
@@ -662,7 +662,7 @@
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
@@ -679,7 +679,7 @@
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index c7a89e0..1aaa827 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -7,7 +7,7 @@
 ; GCN: v_min_i32_e32
 
 ; EG: MIN_INT
-define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
@@ -24,7 +24,7 @@
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -35,7 +35,7 @@
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %cmp = icmp sle <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
   store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -52,7 +52,7 @@
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
   %cmp = icmp sle <4 x i32> %a, %b
   %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
   store <4 x i32> %val, <4 x i32> addrspace(1)* %out
@@ -65,7 +65,7 @@
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: s_min_i32
-define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
   %cmp = icmp sle i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
   store i8 %val, i8 addrspace(1)* %out
@@ -106,7 +106,7 @@
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
   %cmp = icmp sle <4 x i8> %a, %b
   %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out
@@ -124,7 +124,7 @@
 
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
   %cmp = icmp sle <2 x i16> %a, %b
   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
   store <2 x i16> %val, <2 x i16> addrspace(1)* %out
@@ -150,7 +150,7 @@
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
   %cmp = icmp sle <4 x i16> %a, %b
   %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %val, <4 x i16> addrspace(1)* %out
@@ -161,7 +161,7 @@
 ; GCN: v_min_i32_e32
 
 ; EG: MIN_INT
-define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -180,7 +180,7 @@
 ; GFX89: v_min_i16_e32
 
 ; EG: MIN_INT
-define void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -198,7 +198,7 @@
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp slt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -211,7 +211,7 @@
 
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %cmp = icmp slt <2 x i32> %a, %b
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
   store <2 x i32> %val, <2 x i32> addrspace(1)* %out
@@ -222,7 +222,7 @@
 ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
 
 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
   %cmp = icmp slt i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -233,7 +233,7 @@
 ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
 
 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
   %cmp = icmp sle i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -244,7 +244,7 @@
 ; GCN: v_min_u32_e32
 
 ; EG: MIN_UINT
-define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
@@ -267,7 +267,7 @@
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid
@@ -301,7 +301,7 @@
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid
@@ -319,7 +319,7 @@
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp ule i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -330,7 +330,7 @@
 ; GCN: v_min_u32_e32
 
 ; EG: MIN_UINT
-define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
@@ -353,7 +353,7 @@
 ; GFX89: v_min_u16_e32
 
 ; EG: MIN_UINT
-define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid
@@ -371,7 +371,7 @@
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -386,7 +386,7 @@
 ; GCN: s_endpgm
 
 ; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp ult i32 %a, %b
@@ -404,7 +404,7 @@
 ; GCN: s_endpgm
 
 ; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
   %a = load i16, i16 addrspace(1)* %aptr, align 2
   %b = load i16, i16 addrspace(1)* %bptr, align 2
   %cmp = icmp ult i16 %a, %b
@@ -419,7 +419,7 @@
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %cmp = icmp ult <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
   store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -444,7 +444,7 @@
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
   %cmp = icmp ult <8 x i32> %a, %b
   %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
   store <8 x i32> %val, <8 x i32> addrspace(1)* %out
@@ -478,7 +478,7 @@
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
   %cmp = icmp ult <8 x i16> %a, %b
   %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
   store <8 x i16> %val, <8 x i16> addrspace(1)* %out
@@ -494,7 +494,7 @@
 ; GCN: buffer_store_dword [[VMIN]]
 
 ; EG: MIN_UINT
-define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ult i32 %a.ext, %b.ext
@@ -514,7 +514,7 @@
 ; GCN: buffer_store_dword [[VMIN]]
 
 ; EG: MIN_INT
-define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
+define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp slt i32 %a.ext, %b.ext
@@ -529,7 +529,7 @@
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
   %cmp = icmp sle i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
   store i16 %val, i16 addrspace(1)* %out
@@ -542,7 +542,7 @@
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp ult i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -554,7 +554,7 @@
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp ule i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -566,7 +566,7 @@
 
 ; EG-DAG: MIN_UINT
 ; EG-DAG: MIN_INT
-define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp slt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -578,7 +578,7 @@
 
 ; EG-DAG: MIN_UINT
 ; EG-DAG: MIN_INT
-define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp sle i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -596,7 +596,7 @@
 
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
@@ -621,7 +621,7 @@
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/min3.ll b/llvm/test/CodeGen/AMDGPU/min3.ll
index 728479a..59d5d2c 100644
--- a/llvm/test/CodeGen/AMDGPU/min3.ll
+++ b/llvm/test/CodeGen/AMDGPU/min3.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: @v_test_imin3_slt_i32
 ; SI: v_min3_i32
-define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -23,7 +23,7 @@
 
 ; FUNC-LABEL: @v_test_umin3_ult_i32
 ; SI: v_min3_u32
-define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -43,7 +43,7 @@
 ; FUNC-LABEL: @v_test_umin_umin_umin
 ; SI: v_min_i32
 ; SI: v_min3_i32
-define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tid2 = mul i32 %tid, 2
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
@@ -77,7 +77,7 @@
 
 ; FUNC-LABEL: @v_test_umin3_2_uses
 ; SI-NOT: v_min3
-define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tid2 = mul i32 %tid, 2
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll
index 8e1b003..83c2a91 100644
--- a/llvm/test/CodeGen/AMDGPU/missing-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll
@@ -15,7 +15,7 @@
 ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
   %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index 85dfbe6..e1fb00a 100644
--- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -19,7 +19,7 @@
 ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
 
-define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
 bb:
   %tmp = icmp sgt i32 %arg3, 0
   br i1 %tmp, label %bb4, label %bb17
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index 1a0a390..417b4ba 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -11,7 +11,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i32:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}}
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
   %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
@@ -31,7 +31,7 @@
 
 ; GCN-LABEL: {{^}}atomic_max_i32_noret:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}}
-define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
   %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll
index a574365..9e1d2e0 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll
@@ -9,7 +9,7 @@
 ; MUBUF load with an immediate byte offset that fits into 12 bits
 ; CHECK-LABEL: {{^}}mubuf_load0:
 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
-define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
   %1 = load i32, i32 addrspace(1)* %0
@@ -20,7 +20,7 @@
 ; MUBUF load with the largest possible immediate offset
 ; CHECK-LABEL: {{^}}mubuf_load1:
 ; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
-define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
   %1 = load i8, i8 addrspace(1)* %0
@@ -32,7 +32,7 @@
 ; CHECK-LABEL: {{^}}mubuf_load2:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
-define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
   %1 = load i32, i32 addrspace(1)* %0
@@ -44,7 +44,7 @@
 ; CHECK-LABEL: {{^}}mubuf_load3:
 ; CHECK-NOT: ADD
 ; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
-define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
+define amdgpu_kernel void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset
   %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
@@ -91,7 +91,7 @@
 ; MUBUF store with an immediate byte offset that fits into 12 bits
 ; CHECK-LABEL: {{^}}mubuf_store0:
 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
-define void @mubuf_store0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store0(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
   store i32 0, i32 addrspace(1)* %0
@@ -102,7 +102,7 @@
 ; CHECK-LABEL: {{^}}mubuf_store1:
 ; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
 
-define void @mubuf_store1(i8 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store1(i8 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095
   store i8 0, i8 addrspace(1)* %0
@@ -113,7 +113,7 @@
 ; CHECK-LABEL: {{^}}mubuf_store2:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
-define void @mubuf_store2(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store2(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
   store i32 0, i32 addrspace(1)* %0
@@ -124,7 +124,7 @@
 ; CHECK-LABEL: {{^}}mubuf_store3:
 ; CHECK-NOT: ADD
 ; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
-define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
+define amdgpu_kernel void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset
   %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
@@ -134,14 +134,14 @@
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
-define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
   store i32 99, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
-define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
@@ -150,7 +150,7 @@
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
@@ -159,7 +159,7 @@
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
   ret void
@@ -167,7 +167,7 @@
 
 ; CHECK-LABEL: {{^}}store_vgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   store i32 99, i32 addrspace(1)* %out.gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 5215838..a72a6ef 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -11,7 +11,7 @@
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -31,7 +31,7 @@
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -45,7 +45,7 @@
 ; SI: s_load_dword
 ; SI: s_mul_i32
 ; SI: buffer_store_dword
-define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %mul = mul i64 %b, %a
   %trunc = trunc i64 %mul to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -57,7 +57,7 @@
 ; SI: s_load_dword
 ; SI: v_mul_lo_i32
 ; SI: buffer_store_dword
-define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %b, %a
@@ -73,7 +73,7 @@
 ; EG-DAG: MULHI_INT
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_i32
-define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
   %1 = mul i64 %0, 80
@@ -87,7 +87,7 @@
 ; SI-DAG: v_mul_lo_i32
 ; SI-DAG: v_mul_hi_i32
 ; SI: s_endpgm
-define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 80
@@ -99,7 +99,7 @@
 ; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
 ; SI: s_endpgm
-define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 9
@@ -114,7 +114,7 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
 ; SI: s_endpgm
-define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %mul = mul i32 %a, %b
   store i32 %mul, i32 addrspace(1)* %out, align 4
   ret void
@@ -122,7 +122,7 @@
 
 ; FUNC-LABEL: {{^}}v_mul_i32:
 ; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -139,7 +139,7 @@
 ; crash with a 'failed to select' error.
 
 ; FUNC-LABEL: {{^}}s_mul_i64:
-define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %mul = mul i64 %a, %b
   store i64 %mul, i64 addrspace(1)* %out, align 8
   ret void
@@ -147,7 +147,7 @@
 
 ; FUNC-LABEL: {{^}}v_mul_i64:
 ; SI: v_mul_lo_i32
-define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %a, %b
@@ -157,7 +157,7 @@
 
 ; FUNC-LABEL: {{^}}mul32_in_branch:
 ; SI: s_mul_i32
-define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -180,7 +180,7 @@
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_u32
 ; SI: s_endpgm
-define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
@@ -224,7 +224,7 @@
 ; SI: s_mul_i32
 
 ; SI: buffer_store_dwordx4
-define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
+define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
   %mul = mul i128 %a, %b
   store i128 %mul, i128 addrspace(1)* %out
   ret void
@@ -253,7 +253,7 @@
 ; SI-DAG: v_mul_lo_i32
 
 ; SI: {{buffer|flat}}_store_dwordx4
-define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
   %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 6f7dfe2..3137569 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -13,7 +13,7 @@
 ; Make sure we are not masking the inputs
 ; CM-NOT: AND
 ; CM: MUL_INT24
-define void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %a.shl = shl i32 %a, 8
   %a.24 = ashr i32 %a.shl, 8
@@ -39,7 +39,7 @@
 ; CM: MULHI_INT24
 ; CM: MULHI_INT24
 ; CM: MULHI_INT24
-define void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %a.shl = shl i32 %a, 8
   %a.24 = ashr i32 %a.shl, 8
@@ -70,7 +70,7 @@
 ; GCN-DAG: v_mul_i32_i24_e32
 
 ; GCN: buffer_store_dwordx2
-define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %shl.i = shl i32 %a, 8
   %shr.i = ashr i32 %shl.i, 8
   %conv.i = sext i32 %shr.i to i64
@@ -87,7 +87,7 @@
 ; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
 ; GCN-DAG: v_mul_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
 ; GCN: buffer_store_dwordx2
-define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %shl.i = shl i32 %a, 8
   %shr.i = ashr i32 %shl.i, 8
   %conv.i = sext i32 %shr.i to i64
@@ -112,7 +112,7 @@
 ; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
 
 ; GCN: buffer_store_dwordx2
-define void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
+define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
 entry:
   %a.shl = shl i33 %a, 9
   %a.24 = ashr i33 %a.shl, 9
@@ -133,7 +133,7 @@
 ; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]],
 ; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
 ; SI-NEXT: buffer_store_dword v[[HI]]
-define void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
 entry:
   %tmp0 = shl i33 %a, 9
   %a_24 = ashr i33 %tmp0, 9
@@ -151,7 +151,7 @@
 ; GCN: v_mul_i32_i24_e32 v[[VAL_LO:[0-9]+]]
 ; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], v[[VAL_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
-define void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
+define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
 bb:
   %cmp = icmp eq i32 %arg0, 0
   br i1 %cmp, label %bb11, label %bb7
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 004d36f..59fdc8b 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -6,7 +6,7 @@
 
 ; FUNC-LABEL: {{^}}test_umul24_i32:
 ; GCN: v_mul_u32_u24
-define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = lshr i32 %0, 8
@@ -22,7 +22,7 @@
 ; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
 ; VI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}}
 ; VI: s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]]
-define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
   %ext = sext i16 %mul to i32
@@ -34,7 +34,7 @@
 ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
-define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
   %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@@ -54,7 +54,7 @@
 ; VI: s_mul_i32
 ; VI: s_and_b32
 ; VI: v_mov_b32_e32
-define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
   %ext = zext i16 %mul to i32
@@ -66,7 +66,7 @@
 ; SI: v_mul_u32_u24_e32
 ; SI: v_and_b32_e32
 ; VI: v_mul_lo_u16
-define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
   %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@@ -83,7 +83,7 @@
 ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
-define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
+define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
 entry:
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -101,7 +101,7 @@
 ; GCN-NOT: and
 ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %a.24 = and i32 %a, 16777215
   %b.24 = and i32 %b, 16777215
@@ -118,7 +118,7 @@
 ; GCN-NOT: and
 ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %a.24 = and i64 %a, 16777215
   %b.24 = and i64 %b, 16777215
@@ -136,7 +136,7 @@
 ; GCN-DAG: v_mul_u32_u24_e32
 ; GCN-DAG: v_mul_hi_u32_u24_e32
 ; GCN: buffer_store_dwordx2
-define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %tmp0 = shl i64 %a, 40
   %a_24 = lshr i64 %tmp0, 40
@@ -152,7 +152,7 @@
 ; GCN-NOT: s_and_b32
 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
 ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
-define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
 entry:
   %tmp0 = shl i64 %a, 40
   %a.24 = lshr i64 %tmp0, 40
@@ -166,7 +166,7 @@
 ; GCN: s_and_b32
 ; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]]
 ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]]
-define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %a.16 = and i32 %a, 65535
   %b.16 = and i32 %b, 65535
@@ -186,7 +186,7 @@
 ; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
 ; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}}
-define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
 entry:
   %tmp0 = shl i33 %a, 9
   %a_24 = lshr i33 %tmp0, 9
@@ -206,7 +206,7 @@
 ; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
 ; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
 ; GCN-NEXT: buffer_store_dword v[[HI]]
-define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
 entry:
   %tmp0 = shl i33 %a, 9
   %a_24 = lshr i33 %tmp0, 9
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll
index da1c111..0a646b7 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}test_umul24_i32:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = lshr i32 %0, 8
@@ -19,7 +19,7 @@
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG: 16
-define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
   %ext = sext i16 %mul to i32
@@ -31,7 +31,7 @@
 ; FUNC-LABEL: {{^}}test_umul24_i8:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
+define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
 entry:
   %mul = mul i8 %a, %b
   %ext = sext i8 %mul to i32
@@ -41,7 +41,7 @@
 
 ; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
 ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %a.24 = and i32 %a, 16777215
   %b.24 = and i32 %b, 16777215
@@ -56,7 +56,7 @@
 
 ; FUNC-LABEL: {{^}}test_umulhi24:
 ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %a.24 = and i64 %a, 16777215
   %b.24 = and i64 %b, 16777215
@@ -71,7 +71,7 @@
 ; FUNC-LABEL: {{^}}test_umul24_i64:
 ; EG: MUL_UINT24
 ; EG: MULHI
-define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %tmp0 = shl i64 %a, 40
   %a_24 = lshr i64 %tmp0, 40
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 95c7ce8..15de689 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -64,7 +64,7 @@
   br i1 %tmp51, label %LOOP, label %LOOP.outer
 }
 
-; OPT-LABEL: define void @multi_if_break_loop(
+; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
 ; OPT: llvm.amdgcn.break
 ; OPT: llvm.amdgcn.loop
 ; OPT: llvm.amdgcn.if.break
@@ -79,7 +79,7 @@
 ; Uses a copy instead of an or
 ; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
 ; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]]
-define void @multi_if_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
diff --git a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
index 9dd99ef..97dc67f 100644
--- a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@@ -9,7 +9,7 @@
 @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
 
 ; CHECK-DAG: Name: load_extern_const_init
-define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
@@ -19,7 +19,7 @@
 @undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
 
 ; CHECK-DAG: Name: undef_const_addrspace
-define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index fd66b0b..8a7bf6d 100644
--- a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -9,7 +9,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
 ; SI: s_load_dword s
 ; SI: buffer_store_short v
-define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i16
   store i16 %trunc, i16 addrspace(1)* %out
   ret void
@@ -21,7 +21,7 @@
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_short v
-define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
@@ -34,7 +34,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i8
   store i8 %trunc, i8 addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -56,7 +56,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i1
   store i1 %trunc, i1 addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
@@ -78,7 +78,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i32
   store i32 %trunc, i32 addrspace(1)* %out
   ret void
@@ -87,7 +87,7 @@
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -100,7 +100,7 @@
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -110,7 +110,7 @@
 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -125,7 +125,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
   %trunc = trunc i16 %arg to i8
   store i8 %trunc, i8 addrspace(1)* %out
   ret void
@@ -134,7 +134,7 @@
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
 ; SI: buffer_load_ubyte v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -147,7 +147,7 @@
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i8
   store i8 %trunc, i8 addrspace(1)* %out
@@ -157,7 +157,7 @@
 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -171,7 +171,7 @@
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i8
   store i8 %trunc, i8 addrspace(1)* %out
   ret void
@@ -180,7 +180,7 @@
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -194,7 +194,7 @@
 ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
-define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %val = load i32, i32 addrspace(2)* %in
   %mask = and i32 %val, 65535
@@ -205,7 +205,7 @@
 ; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %bc = bitcast <2 x i32> %ld to i64
   %hi = lshr i64 %bc, 32
diff --git a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
index 0242f6d..c974471 100644
--- a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
@@ -6,7 +6,7 @@
 
 ; EG: CF_END
 ; SI: s_endpgm
-define void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
 entry:
   store i32 0, i32 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
index 4e5ea4b..3836a2b 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: {{^}}fold_sgpr:
 ; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
-define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
+define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
 entry:
   %tmp0 = icmp ne i32 %fold, 0
   br i1 %tmp0, label %if, label %endif
@@ -20,7 +20,7 @@
 
 ; CHECK-LABEL: {{^}}fold_imm:
 ; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
-define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
+define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
 entry:
   %fold = add i32 3, 2
   %tmp0 = icmp ne i32 %cmp, 0
@@ -46,7 +46,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
 
-define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
 entry:
   %tmp0 = add i64 %val, 1
   store i64 %tmp0, i64 addrspace(1)* %out
@@ -61,7 +61,7 @@
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
 
-define void @vector_inline(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp0, 1
@@ -80,7 +80,7 @@
 ; CHECK-LABEL: {{^}}imm_one_use:
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}
 
-define void @imm_one_use(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = xor i32 %tmp0, 100
@@ -94,7 +94,7 @@
 ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
 ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
 
-define void @vector_imm(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp0, 1
@@ -114,7 +114,7 @@
 ; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
 ; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]
 ; CHECK: buffer_store_dword v[[LO]]
-define void @no_fold_tied_subregister() {
+define amdgpu_kernel void @no_fold_tied_subregister() {
   %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
   %tmp2 = extractelement <2 x float> %tmp1, i32 0
   %tmp3 = extractelement <2 x float> %tmp1, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
index 127f3da..fc6f070 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -11,7 +11,7 @@
 ; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
 ; GCN: buffer_store_dword [[RESULT]],
-define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) {
   %result = fadd float %a, %b
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
index 4584802..2de6b59 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
@@ -3,7 +3,7 @@
 --- |
   target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
-  define void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 {
   main_body:
     %id = call i32 @llvm.amdgcn.workitem.id.x()
     %cc = icmp eq i32 %id, 0
@@ -23,7 +23,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -34,7 +34,7 @@
     ret void
   }
 
-  define void @optimize_if_or_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -46,7 +46,7 @@
   }
 
 
-  define void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 {
   main_body:
     %id = call i32 @llvm.amdgcn.workitem.id.x()
     %cc = icmp eq i32 %id, 0
@@ -67,7 +67,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -78,7 +78,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -89,7 +89,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -100,7 +100,7 @@
     ret void
   }
 
-  define void @optimize_if_unknown_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -111,7 +111,7 @@
     ret void
   }
 
-  define void @optimize_if_andn2_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -122,7 +122,7 @@
     ret void
   }
 
-  define void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index eca6909..eb08284 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -9,7 +9,7 @@
 
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -28,7 +28,7 @@
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -39,7 +39,7 @@
 
 ; FUNC-LABEL: {{^}}scalar_or_i32:
 ; SI: s_or_b32
-define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %or = or i32 %a, %b
   store i32 %or, i32 addrspace(1)* %out
   ret void
@@ -47,7 +47,7 @@
 
 ; FUNC-LABEL: {{^}}vector_or_i32:
 ; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
+define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
   %loada = load i32, i32 addrspace(1)* %a
   %or = or i32 %loada, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -56,7 +56,7 @@
 
 ; FUNC-LABEL: {{^}}scalar_or_literal_i32:
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
-define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
   %or = or i32 %a, 99999
   store i32 %or, i32 addrspace(1)* %out, align 4
   ret void
@@ -68,7 +68,7 @@
 ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = or i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -82,7 +82,7 @@
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
 
@@ -101,7 +101,7 @@
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: or_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = or i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -111,7 +111,7 @@
 ; SI-NOT: or_b32
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63
 ; SI-NOT: or_b32
-define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   %foo = add i64 %b, 63
@@ -125,7 +125,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
 ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = or i64 %a, -8
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -133,7 +133,7 @@
 
 ; FUNC-LABEL: {{^}}vector_or_literal_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
   %loada = load i32, i32 addrspace(1)* %a, align 4
   %or = or i32 %loada, 65535
   store i32 %or, i32 addrspace(1)* %out, align 4
@@ -142,7 +142,7 @@
 
 ; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
-define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
   %loada = load i32, i32 addrspace(1)* %a, align 4
   %or = or i32 %loada, 4
   store i32 %or, i32 addrspace(1)* %out, align 4
@@ -154,7 +154,7 @@
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
 
 ; SI: s_or_b64
-define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, %b
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -163,7 +163,7 @@
 ; FUNC-LABEL: {{^}}vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %loadb = load i64, i64 addrspace(1)* %b, align 8
   %or = or i64 %loada, %loadb
@@ -174,7 +174,7 @@
 ; FUNC-LABEL: {{^}}scalar_vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
-define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
+define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
   %loada = load i64, i64 addrspace(1)* %a
   %or = or i64 %loada, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -186,7 +186,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 22470723082367
   store i64 %or, i64 addrspace(1)* %out
@@ -200,7 +200,7 @@
 ; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}}
 ; SI: s_endpgm
-define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 8
   store i64 %or, i64 addrspace(1)* %out
@@ -213,7 +213,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}}
 ; SI: s_endpgm
-define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, -8
   store i64 %or, i64 addrspace(1)* %out
@@ -226,7 +226,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, -200
   store i64 %or, i64 addrspace(1)* %out
@@ -239,7 +239,7 @@
 ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = or i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -250,7 +250,7 @@
 ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
 
 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
-define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float, float addrspace(1)* %in0
   %b = load float, float addrspace(1)* %in1
   %acmp = fcmp oge float %a, 0.000000e+00
@@ -263,7 +263,7 @@
 
 ; FUNC-LABEL: {{^}}s_or_i1:
 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
-define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %cmp0 = icmp eq i32 %a, %b
   %cmp1 = icmp eq i32 %c, %d
   %or = or i1 %cmp0, %cmp1
diff --git a/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll b/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll
index 32ad9ab..57777e7 100644
--- a/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll
@@ -6,7 +6,7 @@
 
 @huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
 
-define void @use_huge_lds() {
+define amdgpu_kernel void @use_huge_lds() {
 entry:
   %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
   store i32 0, i32 addrspace(3)* %v0
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index 10cc54d..b862156 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -9,7 +9,7 @@
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
   %val0 = load volatile i32, i32 addrspace(2)* %in0
   %val1 = load volatile i32, i32 addrspace(2)* %in1
   %lo.i = trunc i32 %val0 to i16
@@ -28,7 +28,7 @@
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 {
   %val1 = load i32, i32 addrspace(2)* %in1
   %hi.i = trunc i32 %val1 to i16
   %hi = bitcast i16 %hi.i to half
@@ -44,7 +44,7 @@
 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234
 ; GFX9: ; use [[PACKED]]
-define void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 {
+define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 {
   %val0 = load i32, i32 addrspace(2)* %in0
   %lo.i = trunc i32 %val0 to i16
   %lo = bitcast i16 %lo.i to half
@@ -64,7 +64,7 @@
 ; GFX9-FLUSH: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -91,7 +91,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
 
 ; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]]
-define void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -118,7 +118,7 @@
 ; GFX9-FLUSH-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}}
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -140,7 +140,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
 
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -164,7 +164,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
 
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -187,7 +187,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
 
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -209,7 +209,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]]
 
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index bedf799..9ffd167 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -9,7 +9,7 @@
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
   %val0 = load volatile i32, i32 addrspace(2)* %in0
   %val1 = load volatile i32, i32 addrspace(2)* %in1
   %lo = trunc i32 %val0 to i16
@@ -26,7 +26,7 @@
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]]
 ; GFX9: ; use [[PACKED]]
-define void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 {
   %val1 = load i32, i32 addrspace(2)* %in1
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0
@@ -41,7 +41,7 @@
 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
 ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8
 ; GFX9: ; use [[PACKED]]
-define void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 {
+define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 {
   %val0 = load i32, i32 addrspace(2)* %in0
   %lo = trunc i32 %val0 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
@@ -60,7 +60,7 @@
 ; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -85,7 +85,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
 
 ; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]]
-define void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -111,7 +111,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
 
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -130,7 +130,7 @@
 
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -151,7 +151,7 @@
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]]
 
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -169,7 +169,7 @@
 ; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 7
 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL0]]
 ; GFX9: ; use [[PACKED]]
-define void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll
index 49a7c0d..1764d64 100644
--- a/llvm/test/CodeGen/AMDGPU/packetizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll
@@ -7,7 +7,7 @@
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
 ; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
 
-define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
 entry:
   %shl = sub i32 32, %e
   %x = add i32 %x_arg, 1
diff --git a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index ea943a5..a90f200 100644
--- a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -11,7 +11,7 @@
 ; to do its transformation; however, now that we are using local memory for
 ; allocas, the transformation isn't happening.
 
-define void @_Z9chk1D_512v() #0 {
+define amdgpu_kernel void @_Z9chk1D_512v() #0 {
 entry:
   %a0 = alloca i32, align 4
   %b0 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll
index 1da1e91b8..91116b0 100644
--- a/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll
+++ b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll
@@ -12,7 +12,7 @@
 ; CHECK: OR_INT
 ; CHECK-NEXT: OR_INT
 ; CHECK-NEXT: OR_INT
-define void @_Z9chk1D_512v() #0 {
+define amdgpu_kernel void @_Z9chk1D_512v() #0 {
 entry:
   %a0 = alloca i32, align 4
   %b0 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
index 3e0d36978..4bcfe5f 100644
--- a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
+++ b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
@@ -10,7 +10,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %val = load i64, i64 addrspace(1)* %in.gep
diff --git a/llvm/test/CodeGen/AMDGPU/predicates.ll b/llvm/test/CodeGen/AMDGPU/predicates.ll
index c1af815..566b48e 100644
--- a/llvm/test/CodeGen/AMDGPU/predicates.ll
+++ b/llvm/test/CodeGen/AMDGPU/predicates.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: {{^}}simple_if:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %cmp0 = icmp sgt i32 %in, 0
   br i1 %cmp0, label %IF, label %ENDIF
@@ -25,7 +25,7 @@
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF, label %ELSE
@@ -51,7 +51,7 @@
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF0, label %ENDIF
@@ -79,7 +79,7 @@
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF0, label %ENDIF
diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
index 2894730..af26835 100644
--- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -18,7 +18,7 @@
 
 ; OPTNONE-NOT: s_mov_b32
 ; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
-define void @store_to_undef() #0 {
+define amdgpu_kernel void @store_to_undef() #0 {
   store volatile i32 0, i32* undef
   ret void
 }
@@ -28,7 +28,7 @@
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
 ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
 ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @store_to_inttoptr() #0 {
+define amdgpu_kernel void @store_to_inttoptr() #0 {
   store volatile i32 0, i32* inttoptr (i32 123 to i32*)
   ret void
 }
@@ -38,7 +38,7 @@
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
 ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @load_from_undef() #0 {
+define amdgpu_kernel void @load_from_undef() #0 {
   %ld = load volatile i32, i32* undef
   ret void
 }
@@ -48,7 +48,7 @@
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
 ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @load_from_inttoptr() #0 {
+define amdgpu_kernel void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
index 9e75cc2..f805430 100644
--- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
@@ -36,7 +36,7 @@
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
-define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -106,7 +106,7 @@
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
-define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -143,7 +143,7 @@
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -179,7 +179,7 @@
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -228,7 +228,7 @@
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
-define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index eea10c8..9fa3051 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -4,7 +4,7 @@
 ; This works because the promote allocas pass replaces these with LDS atomics.
 
 ; Private atomics have no real use, but the compiler at least shouldn't crash on them.
-define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -17,7 +17,7 @@
   ret void
 }
 
-define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll b/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll
index 8ba0b70..9b5f655 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll
@@ -7,7 +7,7 @@
 
 declare i32 @foo(i32*) nounwind
 
-define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
index 3e17969..d07a0a0 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -16,7 +16,7 @@
 ; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
 ; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
 
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -47,7 +47,7 @@
 ; R600-NOT: MOVA_INT
 %struct.point = type { i32, i32 }
 
-define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -75,7 +75,7 @@
 ; FUNC-LABEL: {{^}}direct_loop:
 ; R600-NOT: MOVA_INT
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -110,7 +110,7 @@
 ; FUNC-LABEL: {{^}}short_array:
 
 ; R600: MOVA_INT
-define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -127,7 +127,7 @@
 ; FUNC-LABEL: {{^}}char_array:
 
 ; R600: MOVA_INT
-define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -148,7 +148,7 @@
 ; R600-NOT: MOV T0.X
 ; Additional check in case the move ends up in the last slot
 ; R600-NOT: MOV * T0.X
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
@@ -169,7 +169,7 @@
 ; R600_CHECK: MOV
 ; R600_CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -193,7 +193,7 @@
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -207,7 +207,7 @@
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -220,7 +220,7 @@
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -235,7 +235,7 @@
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -248,7 +248,7 @@
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -261,7 +261,7 @@
   ret void
 }
 
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -282,7 +282,7 @@
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
index 3bd0aec..41a68b1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
@@ -5,7 +5,7 @@
 
 ; CHECK-LABEL: @array_alloca(
 ; CHECK: %stack = alloca i32, i32 5, align 4
-define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca i32, i32 5, align 4
   %ld0 = load i32, i32 addrspace(1)* %in, align 4
@@ -27,7 +27,7 @@
 
 ; CHECK-LABEL: @array_alloca_dynamic(
 ; CHECK: %stack = alloca i32, i32 %size, align 4
-define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
+define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
 entry:
   %stack = alloca i32, i32 %size, align 4
   %ld0 = load i32, i32 addrspace(1)* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 82030f3..a5eb92d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -7,14 +7,14 @@
 declare void @foo.varargs(...) #0
 
 ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
-define void @crash_call_constexpr_cast() #0 {
+define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32
   call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0
   ret void
 }
 
 ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
-define void @crash_call_constexpr_cast_varargs() #0 {
+define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32
   call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index eb0d0cc..38db51d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -5,12 +5,12 @@
 @global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
-; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
 ; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
 
-define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [10 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
index 4c3c15d..f83eb56 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}use_invariant_promotable_lds:
 ; GCN: buffer_load_dword
 ; GCN: ds_write_b32
-define void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
 bb:
   %tmp = alloca i32, align 4
   %tmp1 = bitcast i32* %tmp to i8*
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
index eeda19f..9d6f78f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
@@ -7,7 +7,7 @@
 ; OPT-NOT: alloca i32
 ; OPT-NOT: llvm.lifetime
 ; OPT: store i32 %tmp3, i32 addrspace(3)*
-define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
 bb:
   %tmp = alloca i32, align 4
   %tmp1 = bitcast i32* %tmp to i8*
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
index ce50f1b..7a4a451 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -14,7 +14,7 @@
 ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
-define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -28,7 +28,7 @@
 ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
-define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -41,7 +41,7 @@
 ; CHECK-LABEL: @promote_with_memset(
 ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
-define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -53,7 +53,7 @@
 ; CHECK-LABEL: @promote_with_objectsize(
 ; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false)
-define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index 8ba849e..9f22f20 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -5,7 +5,7 @@
 ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
 ; NOOPTS-NOT: ds_write
 ; OPTS: ds_write
-define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -21,7 +21,7 @@
 ; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
 ; ALL: workgroup_group_segment_byte_size = 0{{$}}
 ; ALL-NOT: ds_write
-define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
index 468a789..bf3bc49 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -30,7 +30,7 @@
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
 ; GCN: workgroup_group_segment_byte_size = 2340
-define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
@@ -62,7 +62,7 @@
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_1:
 ; GCN: workgroup_group_segment_byte_size = 2352
-define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
@@ -100,7 +100,7 @@
 
 ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
 ; GCN: workgroup_group_segment_byte_size = 1060
-define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
index 3bcbb4f..03ce116 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}stored_lds_pointer_value:
 ; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
   %tmp = alloca float
   store float 0.0, float* %tmp
   store float* %tmp, float* addrspace(1)* %ptr
@@ -14,7 +14,7 @@
 
 ; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
 ; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   store float 0.0, float* %tmp0
@@ -29,7 +29,7 @@
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 {
 bb:
   %tmp = alloca float, i32 16
   store float 0.0, float* %tmp
@@ -46,7 +46,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
 entry:
   %tmp0 = alloca [4 x i32]
   %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
@@ -64,7 +64,7 @@
 
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-NOT: ds_
-define void @stored_fi_to_self() #0 {
+define amdgpu_kernel void @stored_fi_to_self() #0 {
   %tmp = alloca i32*
   store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
   %bitcast = bitcast i32** %tmp to i32*
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index 2e7527d..ebef612 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -8,7 +8,7 @@
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
 ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
 ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
-define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -22,7 +22,7 @@
 ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
 ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null
-define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %cmp = icmp eq i32* %ptr0, null
@@ -35,7 +35,7 @@
 ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
 ; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0
-define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %cmp = icmp eq i32* null, %ptr0
@@ -49,7 +49,7 @@
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
 ; CHECK: %ptr1 = call i32* @get_unknown_pointer()
 ; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
-define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = call i32* @get_unknown_pointer()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
index 0462a35..d196897 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -13,7 +13,7 @@
 ; CHECK: endif:
 ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
 ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
-define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %else
@@ -34,7 +34,7 @@
 
 ; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
 ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
-define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %endif
@@ -51,7 +51,7 @@
 
 ; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
 ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ]
-define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %endif
@@ -73,7 +73,7 @@
 ; CHECK: br label %exit
 ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
 ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
-define void @one_phi_value(i32 %a) #0 {
+define amdgpu_kernel void @one_phi_value(i32 %a) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
@@ -97,7 +97,7 @@
 ; CHECK: endif:
 ; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
 ; CHECK: store i32 0, i32* %phi.ptr, align 4
-define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %else
@@ -134,7 +134,7 @@
 ; CHECK-LABEL: @ptr_induction_var_same_alloca(
 ; CHECK: %alloca = alloca [64 x i32], align 4
 ; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
-define void @ptr_induction_var_same_alloca() #0 {
+define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
@@ -172,7 +172,7 @@
 ; CHECK: %alloca = alloca [64 x i32], align 4
 ; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
 ; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
-define void @ptr_induction_var_alloca_unknown() #0 {
+define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
index 34d274d..55c2229 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
 ; CHECK: %alloca = alloca i32
 ; CHECK: select i1 undef, i32* undef, i32* %alloca
-define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
   %alloca = alloca i32, align 4
   %select = select i1 undef, i32* undef, i32* %alloca
   store i32 0, i32* %select, align 4
@@ -16,7 +16,7 @@
 ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
-define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -33,7 +33,7 @@
 ; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
 ; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
 ; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
-define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
   %alloca0 = alloca i32, i32 16, align 4
   %alloca1 = alloca i32, i32 16, align 4
   %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
@@ -50,7 +50,7 @@
 ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
-define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
@@ -67,7 +67,7 @@
 ; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
 ; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
-define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -78,7 +78,7 @@
   ret void
 }
 
-define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
@@ -102,7 +102,7 @@
 ; CHECK-LABEL: @select_null_rhs(
 ; CHECK-NOT: alloca
 ; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
-define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
 bb:
   %tmp = alloca double, align 8
   store double 0.000000e+00, double* %tmp, align 8
@@ -117,7 +117,7 @@
 ; CHECK-LABEL: @select_null_lhs(
 ; CHECK-NOT: alloca
 ; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
-define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
 bb:
   %tmp = alloca double, align 8
   store double 0.000000e+00, double* %tmp, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
index e331731..88c0e91 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
@@ -8,7 +8,7 @@
 ; CHECK-LABEL: @try_promote_unhandled_intrinsic(
 ; CHECK: alloca
 ; CHECK: call void @llvm.stackrestore(i8* %tmp1)
-define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
 bb:
   %tmp = alloca i32, align 4
   %tmp1 = bitcast i32* %tmp to i8*
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
index 626ff1e..9c43a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: @volatile_load(
 ; CHECK: alloca [5 x i32]
 ; CHECK: load volatile i32, i32*
-define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -16,7 +16,7 @@
 ; CHECK-LABEL: @volatile_store(
 ; CHECK: alloca [5 x i32]
 ; CHECK: store volatile i32 %tmp, i32*
-define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -30,7 +30,7 @@
 ; CHECK: alloca double
 ; CHECK: load double
 ; CHECK: load volatile double
-define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
 bb:
   %tmp = alloca double, align 8
   store double 0.000000e+00, double* %tmp, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
index 866a4a9..b7ed34b 100644
--- a/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
@@ -2,7 +2,7 @@
 ; Don't crash
 
 ; CHECK: MAX_UINT
-define void @test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i64 addrspace(1)* %out) {
 bb:
   store i64 2, i64 addrspace(1)* %out
   %tmp = load i64, i64 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll b/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll
index c5b8934..2604ed4 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll
@@ -6,7 +6,7 @@
 
 %struct.foo = type {i32, i32, i32}
 
-define void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) {
+define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) {
 entry:
   %ptr = getelementptr inbounds %struct.foo, %struct.foo* %in, i32 1, i32 2
   %x = load i32, i32 *%ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
index 49441ee..acf7a66 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
@@ -8,7 +8,7 @@
 ; EG: VTX_READ_128 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
   %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@@ -21,7 +21,7 @@
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %load = load float, float addrspace(1)* %in, align 4
   %bc = bitcast float %load to <2 x i16>
   store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
@@ -33,7 +33,7 @@
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
   %bc = bitcast <2 x i16> %load to float
   store float %bc, float addrspace(1)* %out, align 4
@@ -45,7 +45,7 @@
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %bc = bitcast <4 x i8> %load to i32
   store i32 %bc, i32 addrspace(1)* %out, align 4
@@ -57,7 +57,7 @@
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -69,7 +69,7 @@
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
   %load = load <2 x i16>, <2 x i16>  addrspace(1)* %in, align 4
   %bc = bitcast <2 x i16> %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -85,7 +85,7 @@
 ; EG: VTX_READ_16
 ; EG-DAG: BFE_UINT
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
   %load = load <4 x i16>, <4 x i16>  addrspace(1)* %in, align 2
   %bc = bitcast <4 x i16> %load to <8 x i8>
   %element = extractelement <8 x i8> %bc, i32 5
@@ -98,7 +98,7 @@
 ; EG: VTX_READ_64 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %bc = bitcast <2 x i32> %val to double
   store double %bc, double addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll b/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll
index 7047c63..1ddc41f 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll
@@ -6,7 +6,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_offset:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -16,7 +16,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -27,7 +27,7 @@
 ; FIXME: looks like the offset is wrong
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
 
@@ -38,7 +38,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -49,7 +49,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -58,7 +58,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -68,7 +68,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32_offset:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -78,7 +78,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -89,7 +89,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -98,7 +98,7 @@
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -108,7 +108,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -118,7 +118,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -129,7 +129,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -138,7 +138,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -148,7 +148,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32_offset:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -158,7 +158,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -169,7 +169,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -178,7 +178,7 @@
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -188,7 +188,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -198,7 +198,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -209,7 +209,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -218,7 +218,7 @@
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -228,7 +228,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32_offset:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -238,7 +238,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -249,7 +249,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -258,7 +258,7 @@
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -268,7 +268,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -278,7 +278,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -289,7 +289,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -298,7 +298,7 @@
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -308,7 +308,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32_offset:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -318,7 +318,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -329,7 +329,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -338,7 +338,7 @@
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -348,7 +348,7 @@
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -358,7 +358,7 @@
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -369,7 +369,7 @@
 ; FUNC-LABEL: {{^}}atomic_xchg_i32:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -378,7 +378,7 @@
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -388,7 +388,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -398,7 +398,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -409,7 +409,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
@@ -418,7 +418,7 @@
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -428,7 +428,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -438,7 +438,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -449,7 +449,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -458,7 +458,7 @@
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -468,7 +468,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   store atomic i32 %in, i32 addrspace(1)* %gep  seq_cst, align 4
@@ -478,7 +478,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
   ret void
@@ -487,7 +487,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -498,7 +498,7 @@
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
@@ -507,7 +507,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_inc_add
 ; EG: MEM_RAT ATOMIC_INC_UINT
-define void @atomic_inc_add(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_inc_add(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 1 seq_cst
@@ -516,7 +516,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_dec_add
 ; EG: MEM_RAT ATOMIC_DEC_UINT
-define void @atomic_dec_add(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_dec_add(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 -1 seq_cst
@@ -525,7 +525,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_inc_sub
 ; EG: MEM_RAT ATOMIC_INC_UINT
-define void @atomic_inc_sub(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_inc_sub(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 -1 seq_cst
@@ -534,7 +534,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_dec_sub
 ; EG: MEM_RAT ATOMIC_DEC_UINT
-define void @atomic_dec_sub(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_dec_sub(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll b/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll
index f406c16..53ee214 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll
@@ -10,7 +10,7 @@
 ; Additional check in case the move ends up in the last slot
 ; R600-NOT: MOV * TO.X
 
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
index a34a48e..9eee9a6 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
@@ -2,7 +2,7 @@
 
 ; FUNC-LABEL: {{^}}tgid_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T1.X
-define void @tgid_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; FUNC-LABEL: {{^}}tgid_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y
-define void @tgid_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; FUNC-LABEL: {{^}}tgid_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z
-define void @tgid_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; FUNC-LABEL: {{^}}tidig_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X
-define void @tidig_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@
 
 ; FUNC-LABEL: {{^}}tidig_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y
-define void @tidig_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -47,7 +47,7 @@
 
 ; FUNC-LABEL: {{^}}tidig_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z
-define void @tidig_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@
 ; FUNC-LABEL: {{^}}test_implicit:
 ; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56
 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56
-define void @test_implicit(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
   %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4
@@ -69,7 +69,7 @@
 ; FUNC-LABEL: {{^}}test_implicit_dyn:
 ; 36 prepended implicit bytes + 8(out pointer + in) = 44
 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44
-define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
   %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 0aa11f2..fbdaeb8 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -9,7 +9,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -33,7 +33,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv fast float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -45,7 +45,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv arcp float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -57,7 +57,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
+define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
   %rcp = fdiv float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -69,7 +69,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %rcp = fdiv float 1.0, %src.fabs
   store float %rcp, float addrspace(1)* %out, align 4
@@ -82,7 +82,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv float -1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -92,7 +92,7 @@
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]|
 ; GCN: buffer_store_dword [[RCP]]
-define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %src.fabs.fneg = fsub float -0.0, %src.fabs
   %rcp = fdiv float 1.0, %src.fabs.fneg
@@ -106,7 +106,7 @@
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]|
 ; GCN: buffer_store_dword [[RCP]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %src.fabs.fneg = fsub float -0.0, %src.fabs
   %rcp = fdiv float 1.0, %src.fabs.fneg
@@ -120,7 +120,7 @@
 ; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32:
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[MUL]]
-define void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 {
   %x = load float, float addrspace(1)* undef
   %rcp = fdiv arcp float %x, 2.0
   store float %rcp, float addrspace(1)* %out, align 4
@@ -130,7 +130,7 @@
 ; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32:
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[MUL]]
-define void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 {
   %x = load float, float addrspace(1)* undef
   %rcp = fdiv arcp float %x, 10.0
   store float %rcp, float addrspace(1)* %out, align 4
@@ -140,7 +140,7 @@
 ; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32:
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[MUL]]
-define void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 {
   %x = load float, float addrspace(1)* undef
   %rcp = fdiv arcp float %x, -10.0
   store float %rcp, float addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
index a5581d7..34cbe39 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.read_register.i32(metadata) #0
 
-define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
   store volatile i32 0, i32 addrspace(3)* undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
   store i32 %m0, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
index 2617ad7..6417d28 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.read_register.i32(metadata) #0
 
-define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
   store volatile i32 0, i32 addrspace(3)* undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
   store i32 %m0, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
index dcde8a1..8e248fd 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
@@ -4,7 +4,7 @@
 
 declare i64 @llvm.read_register.i64(metadata) #0
 
-define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
   %exec = call i64 @llvm.read_register.i64(metadata !0)
   store i64 %exec, i64 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll
index 601a0ad..8fe9e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/read_register.ll
+++ b/llvm/test/CodeGen/AMDGPU/read_register.ll
@@ -9,7 +9,7 @@
 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]]
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_m0(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 {
   store volatile i32 0, i32 addrspace(3)* undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
   store i32 %m0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@
 ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo
 ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_read_exec(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 {
   %exec = call i64 @llvm.read_register.i64(metadata !1)
   store i64 %exec, i64 addrspace(1)* %out
   ret void
@@ -30,7 +30,7 @@
 ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo
 ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
   %flat_scratch = call i64 @llvm.read_register.i64(metadata !2)
   store i64 %flat_scratch, i64 addrspace(1)* %out
   ret void
@@ -39,7 +39,7 @@
 ; CHECK-LABEL: {{^}}test_read_flat_scratch_lo:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
   %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3)
   store i32 %flat_scratch_lo, i32 addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@
 ; CHECK-LABEL: {{^}}test_read_flat_scratch_hi:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
   %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4)
   store i32 %flat_scratch_hi, i32 addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@
 ; CHECK-LABEL: {{^}}test_read_exec_lo:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
   %exec_lo = call i32 @llvm.read_register.i32(metadata !5)
   store i32 %exec_lo, i32 addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@
 ; CHECK-LABEL: {{^}}test_read_exec_hi:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
   %exec_hi = call i32 @llvm.read_register.i32(metadata !6)
   store i32 %exec_hi, i32 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index 7965b06..5c698c8 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -13,7 +13,7 @@
 ; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}}
 ; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: store_dwordx2
-define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.readcyclecounter()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
 
diff --git a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index dd67dc4..ecb513c 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -6,7 +6,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]]
 ; GCN: buffer_store_dwordx2
-define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %a = load i64, i64 addrspace(1)* %in, align 4
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -16,7 +16,7 @@
 ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %a = load i64, i64 addrspace(1)* %in, align 4
   %vec = bitcast i64 %a to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 0
@@ -27,7 +27,7 @@
 ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
 ; GCN: buffer_store_dword [[VAL]]
-define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %a = load i64, i64 addrspace(1)* %in, align 4
   %vec = bitcast i64 %a to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
index 281e49f..601aca4 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
 ; GCN: s_load_dwordx2
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
   %x.bc = bitcast <2 x i32> %x to <4 x i16>
   store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 ; GCN: s_load_dwordx4
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
   %x.bc = bitcast <4 x i32> %x to <8 x i16>
   store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
   ret void
@@ -22,7 +22,7 @@
 ; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
 ; GCN: s_load_dwordx2
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
   %x.bc = bitcast <2 x i32> %x to <4 x i16>
   store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
   ret void
@@ -32,7 +32,7 @@
 ; GCN: s_load_dwordx4
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
   %x.bc = bitcast <4 x i32> %x to <2 x i64>
   store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
   ret void
@@ -44,7 +44,7 @@
 ; GCN: buffer_load_ushort
 ; GCN: buffer_load_ushort
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
   %x.bc = bitcast <4 x i16> %x to <2 x i32>
   store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
index 9096448..9f8667d 100644
--- a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
@@ -6,7 +6,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
-define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
+define amdgpu_kernel void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
 bb:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %cmp0 = icmp eq i32 %id.x, 0
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
index de52cc0b..ecf94b5 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
@@ -8,7 +8,7 @@
 # CHECK: DBG_VALUE{{.*}}debug-use %13.sub2
 
 --- |
-  define void @test(i32 addrspace(1)* %out) { ret void }
+  define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void }
   
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4)
   !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6)
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index bff3a9f..26a76cf 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -9,7 +9,7 @@
 ; SI: ; Kernel info:
 ; SI: ; NumSgprs: {{[0-9]+}}
 ; SI: ; NumVgprs: {{[0-9]+}}
-define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
+define amdgpu_kernel void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
   %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
@@ -24,7 +24,7 @@
 
 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
+define amdgpu_kernel void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
   store i32 %x, i32 addrspace(1)* %out, align 4
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
index 47bdfba..5d4955a 100644
--- a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
@@ -3,7 +3,7 @@
 ; definition on every path (there should at least be IMPLICIT_DEF instructions).
 target triple = "amdgcn--"
 
-define void @func() {
+define amdgpu_kernel void @func() {
 B0:
   br i1 undef, label %B1, label %B2
 
diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir
index b928bc7..fc2e442 100644
--- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir
+++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck %s
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
 ...
 ---
 # In the test below we have two independent def+use pairs of subregister1 which
diff --git a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
index 412202f..ff40692 100644
--- a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -7,7 +7,7 @@
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16
   %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16
   store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
@@ -19,7 +19,7 @@
 ; SI: ds_read2_b64
 ; SI: ds_write2_b64
 ; SI: s_endpgm
-define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
   %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16
   store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
@@ -39,7 +39,7 @@
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
   %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32
   store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
@@ -54,7 +54,7 @@
 ; SI-NOT: ds_read
 ; SI: ds_write_b64
 ; SI: s_endpgm
-define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8
   %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8
   %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
index b60c470d..2664907 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -7,7 +7,7 @@
 ; BOTH-DAG: s_lshr_b64
 ; BOTH: s_or_b64
 ; BOTH: s_endpgm
-define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %0 = shl i64 %x, %y
   %1 = sub i64 64, %y
@@ -26,7 +26,7 @@
 ; BOTH: v_or_b32
 ; BOTH: v_or_b32
 ; BOTH: s_endpgm
-define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64, i64 addrspace(1)* %xptr, align 8
   %y = load i64, i64 addrspace(1)* %yptr, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 7d2b553..c4bc8cd 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -10,7 +10,7 @@
 ; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
 ; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]]
 ; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]]
-define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %0 = shl i32 %x, %y
   %1 = sub i32 32, %y
@@ -26,7 +26,7 @@
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_alignbit_b32
 ; SI: s_endpgm
-define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %0 = shl <2 x i32> %x, %y
   %1 = sub <2 x i32> <i32 32, i32 32>, %y
@@ -46,7 +46,7 @@
 ; SI-DAG: s_sub_i32
 ; SI-DAG: v_alignbit_b32
 ; SI: s_endpgm
-define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %0 = shl <4 x i32> %x, %y
   %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
index 58a1efe..9eda479 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -6,7 +6,7 @@
 ; BOTH-DAG: s_lshr_b64
 ; BOTH-DAG: s_lshl_b64
 ; BOTH: s_or_b64
-define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %tmp0 = sub i64 64, %y
   %tmp1 = shl i64 %x, %tmp0
@@ -24,7 +24,7 @@
 ; VI-DAG: v_lshlrev_b64
 ; BOTH: v_or_b32
 ; BOTH: v_or_b32
-define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64, i64 addrspace(1)* %xptr, align 8
   %y = load i64, i64 addrspace(1)* %yptr, align 8
@@ -37,7 +37,7 @@
 }
 
 ; BOTH-LABEL: {{^}}s_rotr_v2i64:
-define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
 entry:
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
   %tmp1 = shl <2 x i64> %x, %tmp0
@@ -48,7 +48,7 @@
 }
 
 ; BOTH-LABEL: {{^}}v_rotr_v2i64:
-define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
 entry:
   %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8
   %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 55d1800..b4e2c2b 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -6,7 +6,7 @@
 ; R600: BIT_ALIGN_INT
 
 ; SI: v_alignbit_b32
-define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %tmp0 = sub i32 32, %y
   %tmp1 = shl i32 %x, %tmp0
@@ -22,7 +22,7 @@
 
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
-define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
   %tmp1 = shl <2 x i32> %x, %tmp0
@@ -42,7 +42,7 @@
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
-define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
   %tmp1 = shl <4 x i32> %x, %tmp0
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll
index 699440c..9462683e 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.ll
@@ -8,7 +8,7 @@
 ; SI-LABEL: {{^}}rsq_f32:
 ; SI: v_rsq_f32_e32
 ; SI: s_endpgm
-define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
   %div = fdiv float 1.0, %sqrt
@@ -20,7 +20,7 @@
 ; SI-UNSAFE: v_rsq_f64_e32
 ; SI-SAFE: v_sqrt_f64_e32
 ; SI: s_endpgm
-define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double, double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
   %div = fdiv double 1.0, %sqrt
@@ -31,7 +31,7 @@
 ; SI-LABEL: {{^}}rsq_f32_sgpr:
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
 ; SI: s_endpgm
-define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
+define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
   %div = fdiv float 1.0, %sqrt
   store float %div, float addrspace(1)* %out, align 4
@@ -55,7 +55,7 @@
 ; SI-SAFE-NOT: v_rsq_f32
 
 ; SI: s_endpgm
-define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -81,7 +81,7 @@
 ; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}}
 ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
 ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
-define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val)
   %div = fdiv float -1.0, %sqrt
@@ -96,7 +96,7 @@
 ; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}
 ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
 ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
-define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double, double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val)
   %div = fdiv double -1.0, %sqrt
@@ -112,7 +112,7 @@
 ; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}}
 ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
 ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
-define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %val.fneg = fsub float -0.0, %val
   %sqrt = call float @llvm.sqrt.f32(float %val.fneg)
@@ -128,7 +128,7 @@
 ; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}}
 ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
 ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
-define void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double, double addrspace(1)* %in, align 4
   %val.fneg = fsub double -0.0, %val
   %sqrt = call double @llvm.sqrt.f64(double %val.fneg)
diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
index acceb32..deef24c 100644
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -7,7 +7,7 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[VRESULT]]
 ; SI: s_endpgm
-define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, 65
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -19,7 +19,7 @@
 ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
 ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
 ; SI: s_endpgm
-define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
   %add0 = add i32 %a, 65
   %add1 = add i32 %b, 65
   store i32 %add0, i32 addrspace(1)* %out0
@@ -30,7 +30,7 @@
 ; SI-LABEL: {{^}}s_addk_i32_k1:
 ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}}
 ; SI: s_endpgm
-define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, 32767 ; (1 << 15) - 1
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -39,7 +39,7 @@
 ; SI-LABEL: {{^}}s_addk_i32_k2:
 ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17
 ; SI: s_endpgm
-define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, -17
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@
 ; SI-LABEL: {{^}}s_addk_i32_k3:
 ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}}
 ; SI: s_endpgm
-define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, -65
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -58,7 +58,7 @@
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
 ; SI: s_endpgm
-define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
+define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
   %add = add <2 x i32> %b, <i32 65, i32 66>
   store <2 x i32> %add, <2 x i32> addrspace(1)* %out
   ret void
@@ -70,7 +70,7 @@
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
 ; SI: s_endpgm
-define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
+define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
   %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68>
   store <4 x i32> %add, <4 x i32> addrspace(1)* %out
   ret void
@@ -86,7 +86,7 @@
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48
 ; SI: s_endpgm
-define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
+define amdgpu_kernel void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
   %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72>
   store <8 x i32> %add, <8 x i32> addrspace(1)* %out
   ret void
@@ -95,7 +95,7 @@
 ; SI-LABEL: {{^}}no_s_addk_i32_k0:
 ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}
 ; SI: s_endpgm
-define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, 32768 ; 1 << 15
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -105,7 +105,7 @@
 
 ; SI-LABEL: {{^}}commute_s_addk_i32:
 ; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %add = add i32 %size, %b
   call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
index 0164c45..a131aaa 100644
--- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -7,7 +7,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -21,7 +21,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -35,7 +35,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -49,7 +49,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -63,7 +63,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -78,7 +78,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
   store i64 %or, i64 addrspace(1)* %out
@@ -92,7 +92,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -107,7 +107,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 70368744185856 ; (1 << 13) | ((1 << 14) << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -122,7 +122,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
   store i64 %or, i64 addrspace(1)* %out
@@ -137,7 +137,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
   store i64 %or, i64 addrspace(1)* %out
@@ -152,7 +152,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
   store i64 %or, i64 addrspace(1)* %out
@@ -167,7 +167,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
   store i64 %or, i64 addrspace(1)* %out
@@ -182,7 +182,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
   store i64 %or, i64 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
index e83b368..f6ed540 100644
--- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -7,7 +7,7 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[VRESULT]]
 ; SI: s_endpgm
-define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, 65
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@
 ; SI-LABEL: {{^}}s_mulk_i32_k1:
 ; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}}
 ; SI: s_endpgm
-define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, 32767 ; (1 << 15) - 1
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -25,7 +25,7 @@
 ; SI-LABEL: {{^}}s_mulk_i32_k2:
 ; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}}
 ; SI: s_endpgm
-define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, -17
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@
 ; SI-LABEL: {{^}}no_s_mulk_i32_k0:
 ; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
 ; SI: s_endpgm
-define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, 32769 ; (1 << 15) + 1
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -44,7 +44,7 @@
 
 ; SI-LABEL: {{^}}commute_s_mulk_i32:
 ; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %mul = mul i32 %size, %b
   call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %mul)
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 5344834..f7a1c65 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -18,7 +18,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_constant_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20
-define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
   %icmp0 = icmp ugt i32 %a, 90
   %t0 = select i1 %icmp0, i32 %a, i32 90
 
@@ -34,7 +34,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   %sub1 = sub i32 %b, %a
@@ -51,7 +51,7 @@
 ; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -68,7 +68,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -84,7 +84,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
   store volatile i32 %t0, i32 *undef
@@ -101,7 +101,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -119,7 +119,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   store volatile i32 %sub0, i32 *undef
@@ -136,7 +136,7 @@
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   %sub1 = sub i32 %b, %a
@@ -154,7 +154,7 @@
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
   %icmp0 = icmp ugt <4 x i32> %a, %b
   %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b
 
@@ -173,7 +173,7 @@
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
   %icmp0 = icmp ugt <4 x i32> %a, %b
   %sub0 = sub <4 x i32> %a, %b
   %sub1 = sub <4 x i32> %b, %a
@@ -187,7 +187,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_i16_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 
   %icmp0 = icmp ugt i16 %a, %b
   %t0 = select i1 %icmp0, i16 %a, i16 %b
@@ -204,7 +204,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
   %icmp0 = icmp ugt i16 %a, %b
   %sub0 = sub i16 %a, %b
   %sub1 = sub i16 %b, %a
@@ -218,7 +218,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_i8_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %t0 = select i1 %icmp0, i8 %a, i8 %b
 
@@ -234,7 +234,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %sub0 = sub i8 %a, %b
   %sub1 = sub i8 %b, %a
@@ -251,7 +251,7 @@
 ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -269,7 +269,7 @@
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %d
   %sub1 = sub i32 %b, %a
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index f8ced79..586a455 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -6,7 +6,7 @@
 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 
 ; FUNC-LABEL: {{^}}saddo_i64_zext:
-define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
   %carry = extractvalue { i64, i1 } %sadd, 1
@@ -17,7 +17,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_saddo_i32:
-define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
   %carry = extractvalue { i32, i1 } %sadd, 1
@@ -27,7 +27,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v_saddo_i32:
-define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -39,7 +39,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_saddo_i64:
-define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
   %carry = extractvalue { i64, i1 } %sadd, 1
@@ -51,7 +51,7 @@
 ; FUNC-LABEL: {{^}}v_saddo_i64:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64, i64 addrspace(1)* %aptr, align 4
   %b = load i64, i64 addrspace(1)* %bptr, align 4
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index 14369a8..6e1dd16 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -24,7 +24,7 @@
 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 
-define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
@@ -65,7 +65,7 @@
 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
 ; GCN-NOHSA: buffer_store_dword [[V_OUT]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
-define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
 entry:
   %tmp = icmp ne i32 %a, 0
   br i1 %tmp, label %if, label %else
@@ -93,7 +93,7 @@
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -113,7 +113,7 @@
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
-define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp
@@ -133,7 +133,7 @@
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp
@@ -155,7 +155,7 @@
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp
@@ -189,7 +189,7 @@
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp
@@ -230,7 +230,7 @@
 ; GCN-HSA: flat_load_dwordx4
 
 ; GCN: s_endpgm
-define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp
@@ -247,7 +247,7 @@
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
 ; GCN-NOHSA: buffer_store_dword [[ADD]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
-define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
+define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -261,7 +261,7 @@
 ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -275,7 +275,7 @@
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -290,7 +290,7 @@
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -313,7 +313,7 @@
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -350,7 +350,7 @@
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -385,7 +385,7 @@
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -441,7 +441,7 @@
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
 ; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
-define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 bb3:                                              ; preds = %bb2
   %tmp0 = bitcast i32 %cond to float
   %tmp1 = fadd float %tmp0, 2.500000e-01
@@ -459,7 +459,7 @@
 
 ; GCN-LABEL: {{^}}phi_visit_order:
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
-define void @phi_visit_order() {
+define amdgpu_kernel void @phi_visit_order() {
 bb:
   br label %bb1
 
@@ -484,7 +484,7 @@
 ; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]:
 ; GCN: s_xor_b32 [[B]], [[B]], [[A]]
 ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
-define void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
+define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
 entry:
   br label %loop
 
diff --git a/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll b/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll
index c41d345..4ea503b 100644
--- a/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll
@@ -5,7 +5,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_0(i32 %in0, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_0(i32 %in0, i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0
   store i32 %0, i32 addrspace(1)* %out
@@ -17,7 +17,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0
   store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0
   store i32 %0, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir b/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
index af71086..f4ef1c8 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
+++ b/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
@@ -1,23 +1,23 @@
 # RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s
 
 --- |
-  define void @basic_insert_dcache_wb() {
+  define amdgpu_kernel void @basic_insert_dcache_wb() {
     ret void
   }
 
-  define void @explicit_flush_after() {
+  define amdgpu_kernel void @explicit_flush_after() {
     ret void
   }
 
-  define void @explicit_flush_before() {
+  define amdgpu_kernel void @explicit_flush_before() {
     ret void
   }
 
-  define void @no_scalar_store() {
+  define amdgpu_kernel void @no_scalar_store() {
     ret void
   }
 
-  define void @multi_block_store() {
+  define amdgpu_kernel void @multi_block_store() {
   bb0:
     br i1 undef, label %bb1, label %bb2
 
@@ -28,7 +28,7 @@
     ret void
   }
 
-  define void @one_block_store() {
+  define amdgpu_kernel void @one_block_store() {
   bb0:
     br i1 undef, label %bb1, label %bb2
 
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 35886f8..62d0d93 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -9,7 +9,7 @@
 ; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
 ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
-define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -21,7 +21,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
 ; GCN: buffer_store_dwordx2
-define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tmp1 = load float, float addrspace(1)* %in, align 4
   %bc = bitcast float %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -33,7 +33,7 @@
 ; to produce one, but for some reason never made it to selection.
 
 
-; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
 ;   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
 ;   %bc = bitcast i32 %tmp1 to <4 x i8>
 
@@ -42,7 +42,7 @@
 ;   ret void
 ; }
 
-; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
 ;   %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
 ;   %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
 ;   %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
@@ -51,7 +51,7 @@
 ;   ret void
 ; }
 
-; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
 ;   %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
 ;   %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
 ;   %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
@@ -59,7 +59,7 @@
 ;   ret void
 ; }
 
-; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
 ;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
 ;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
 ;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
@@ -67,7 +67,7 @@
 ;   ret void
 ; }
 
-define void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
+define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
   %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
   %bc = bitcast <4 x i8> %newvec0 to <2 x half>
   store <2 x half> %bc, <2 x half> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
index 5cf2c6c..177957c 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
 ; REQUIRES: asserts
 
-define void @main() #0 {
+define amdgpu_kernel void @main() #0 {
 main_body:
   %tmp = load <4 x float>, <4 x float> addrspace(9)* null
   %tmp5 = extractelement <4 x float> %tmp, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 32c456b..44d4608 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -10,7 +10,7 @@
 ; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; SI: buffer_store_dword [[REG0]]
 ; SI: buffer_store_dword [[REG1]]
-define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
   %load0 = load i32, i32 addrspace(1)* %ptr, align 4
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2
   %load1 = load i32, i32 addrspace(1)* %gep, align 4
@@ -24,7 +24,7 @@
 ; FUNC-LABEL: {{^}}same_base_ptr_crash:
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
-define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+define amdgpu_kernel void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
 entry:
   %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
   %tmp0 = load i32, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
index aa67b2e..964298a 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
-define void @main() {
+define amdgpu_kernel void @main() {
 main_body:
   %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %1 = extractelement <4 x float> %0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if.ll b/llvm/test/CodeGen/AMDGPU/schedule-if.ll
index 6637b38..feac5d9 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-if.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-if.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
-define void @main() {
+define amdgpu_kernel void @main() {
 main_body:
   %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %1 = extractelement <4 x float> %0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index ccfde7b..5c47c16 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -12,7 +12,7 @@
 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
-define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
+define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
   store i32 %x, i32 addrspace(1)* %out0, align 4
   store i32 %y, i32 addrspace(1)* %out1, align 4
   ret void
@@ -26,7 +26,7 @@
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2
 ; GCN: s_endpgm
-define void @same_base_ptr_crash(i64 addrspace(1)* %out,
+define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out,
     i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7,
     i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15,
     i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23,
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
index ab81a69..4520fe8 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -5,7 +5,7 @@
 ; We expect two-digit VGPR usage here, not three-digit.
 ; CHECK: NumVgprs: {{[0-9][0-9]$}}
 
-define void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) {
+define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) {
 bb:
   %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
   %tmp2 = load float, float addrspace(3)* %tmp, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 2e659fe..0d19c1e 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -10,7 +10,7 @@
 ; VI: NumSgprs: {{[1-5][0-9]$}}
 ; VI: NumVgprs: {{[1-3][0-9]$}}
 
-define void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
+define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
 bb:
   %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
   %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
index b467290..6b1e859 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -13,7 +13,7 @@
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
 
-define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
 entry:
   %scratch0 = alloca [8192 x i32]
   %scratch1 = alloca [8192 x i32]
@@ -53,7 +53,7 @@
 ; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]]
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
 
-define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
 entry:
   %scratch0 = alloca [8192 x i32]
   %scratch1 = alloca [8192 x i32]
@@ -88,7 +88,7 @@
 
 ; GCN-LABEL: {{^}}neg_vaddr_offset:
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
-define void @neg_vaddr_offset(i32 %offset) {
+define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) {
 entry:
   %array = alloca [8192 x i32]
   %ptr_offset = add i32 %offset, 4
@@ -99,7 +99,7 @@
 
 ; GCN-LABEL: {{^}}pos_vaddr_offset:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20
-define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
 entry:
   %array = alloca [8192 x i32]
   %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index bafd6a5..f9ac425 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -13,7 +13,7 @@
 
 ; FUNC-LABEL: {{^}}sdiv_i32:
 ; EG: CF_END
-define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in
   %den = load i32, i32 addrspace(1) * %den_ptr
@@ -23,7 +23,7 @@
 }
 
 ; FUNC-LABEL: {{^}}sdiv_i32_4:
-define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = sdiv i32 %num, 4
   store i32 %result, i32 addrspace(1)* %out
@@ -43,14 +43,14 @@
 ; SI: v_add_i32
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = sdiv i32 %num, 3435
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
@@ -59,14 +59,14 @@
   ret void
 }
 
-define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %result = sdiv <2 x i32> %num, <i32 4, i32 4>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
@@ -75,7 +75,7 @@
   ret void
 }
 
-define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -86,7 +86,7 @@
 ; SI: v_rcp_f32
 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8
 ; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -100,7 +100,7 @@
 ; SI: v_rcp_f32
 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23
 ; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
   %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
   %num = load i23, i23 addrspace(1) * %in
   %den = load i23, i23 addrspace(1) * %den_ptr
@@ -114,7 +114,7 @@
 ; SI: v_rcp_f32
 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24
 ; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
   %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
   %num = load i24, i24 addrspace(1) * %in
   %den = load i24, i24 addrspace(1) * %den_ptr
@@ -126,7 +126,7 @@
 
 ; FUNC-LABEL: {{^}}v_sdiv_i25:
 ; SI-NOT: v_rcp_f32
-define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
   %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
   %num = load i25, i25 addrspace(1) * %in
   %den = load i25, i25 addrspace(1) * %den_ptr
@@ -137,19 +137,19 @@
 }
 
 ; Tests for 64-bit divide bypass.
-; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %result = sdiv i64 %a, %b
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
 
-; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %result = srem i64 %a, %b
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
 
-; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %resultdiv = sdiv i64 %a, %b
 ;   %resultrem = srem i64 %a, %b
 ;   %result = add i64 %resultdiv, %resultrem
@@ -163,7 +163,7 @@
 ; SI: v_mul_hi_i32
 ; SI: v_mul_hi_i32
 
-define void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
   %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
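; Aside (illustrative note, not from this commit): two cheap sdiv expansions
; are exercised above. Constant divisors (3435, 53668) become a magic-number
; multiply-high plus shifts (the v_mul_hi_i32 checks), and operands known to
; fit in 24 signed bits go through f32: an f32 significand holds 24 bits
; exactly, so trunc(n * rcp(d)) plus a fixup is exact, and
; v_bfe_i32 ..., 0, 24 re-sign-extends the result. One more bit (i25) no
; longer fits, hence the SI-NOT: v_rcp_f32 check. A hypothetical kernel in
; the same style as the tests above:
define amdgpu_kernel void @sdiv24_sketch(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
  %d_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
  %n = load i24, i24 addrspace(1)* %in
  %d = load i24, i24 addrspace(1)* %d_ptr
  %q = sdiv i24 %n, %d                 ; expected to select the v_rcp_f32 path
  %q32 = sext i24 %q to i32            ; expected as v_bfe_i32 ..., 0, 24
  store i32 %q32, i32 addrspace(1)* %out
  ret void
}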
diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll
index 349a782..257e6be 100644
--- a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -12,7 +12,7 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -31,7 +31,7 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -50,7 +50,7 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -69,7 +69,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -88,7 +88,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -107,7 +107,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -130,7 +130,7 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -149,7 +149,7 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -168,7 +168,7 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -187,7 +187,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -206,7 +206,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -225,7 +225,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -244,7 +244,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -263,7 +263,7 @@
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -283,7 +283,7 @@
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -303,7 +303,7 @@
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -323,7 +323,7 @@
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
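; Aside (illustrative note, not from this commit): the mixed-width tests
; above probe the sign-bit analysis that guards the 24-bit path: both
; operands must have at most 24 significant bits, which the tests
; manufacture with a shl/ashr pair, e.g.
;   %t   = shl i32 %n, 8
;   %n24 = ashr i32 %t, 8    ; 24 significant bits: 24-bit path allowed
;   %t2  = shl i32 %n, 7
;   %n25 = ashr i32 %t2, 7   ; 25 significant bits: full expansion required
; so @no_sdiv25_i24_i25_i32 and friends expect EG-NOT: RECIP_IEEE, while the
; narrower i24/i11 and i17/i12 pairings still fold.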
diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem64.ll b/llvm/test/CodeGen/AMDGPU/sdivrem64.ll
index 28fdb69..5ad0d8e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdivrem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdivrem64.ll
@@ -70,7 +70,7 @@
 ; SI-NOT: v_lshr_b64
 ; VI-NOT: v_lshrrev_b64
 ; GCN: s_endpgm
-define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = sdiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -144,7 +144,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -159,7 +159,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = sdiv i64 %1, %2
@@ -176,7 +176,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = srem i64 %1, %2
@@ -196,7 +196,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = sdiv i64 %1, %2
@@ -216,7 +216,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = srem i64 %1, %2
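; Aside (illustrative note, not from this commit): the bypass tests rely on
; known sign bits. "ashr i64 %x, 33" leaves at most 64 - 33 = 31 significant
; bits, so the divide fits in 32 bits and the full 64-bit shift sequence
; must disappear (the v_lshr_b64 / v_lshrrev_b64 NOT-checks); "ashr i64 %x, 40"
; leaves 24 bits, which can additionally take the f32-reciprocal path.
; Note that @s_test_srem above computes urem despite its name; that is
; unchanged context from the original file, not something this commit edits.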
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index d9c5b4c..38bbd6d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -8,7 +8,7 @@
 
 ; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
-define void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %a = load i32, i32 addrspace(1)* %in, align 4
   %shr = lshr i32 %a, 16
   %add = add i32 %a, %shr
@@ -23,7 +23,7 @@
 
 ; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
-define void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %a = load i32, i32 addrspace(1)* %in, align 4
   %shr = lshr i32 %a, 16
   %sub = sub i32 %shr, %a
@@ -39,7 +39,7 @@
 
 ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 
-define void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
+define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
   %a = load i32, i32 addrspace(1)* %in1, align 4
   %b = load i32, i32 addrspace(1)* %in2, align 4
   %shra = lshr i32 %a, 16
@@ -55,7 +55,7 @@
 ; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 
-define void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
 entry:
   %a = load i16, i16 addrspace(1)* %ina, align 4
   %b = load i16, i16 addrspace(1)* %inb, align 4
@@ -75,7 +75,7 @@
 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
 
-define void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
 entry:
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
   %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
@@ -97,7 +97,7 @@
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
 
-define void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
 entry:
   %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
   %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
@@ -123,7 +123,7 @@
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}}
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}}
 
-define void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
 entry:
   %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
   %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
@@ -138,7 +138,7 @@
 ; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SDWA-NOT: v_mul_f16_sdwa
 
-define void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
 entry:
   %a = load half, half addrspace(1)* %ina, align 4
   %b = load half, half addrspace(1)* %inb, align 4
@@ -157,7 +157,7 @@
 
 ; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 
-define void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
 entry:
   %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
   %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
@@ -178,7 +178,7 @@
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
 entry:
   %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
   %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
@@ -204,7 +204,7 @@
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
 entry:
   %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
   %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
@@ -219,7 +219,7 @@
 ; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 
-define void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
 entry:
   %a = load i8, i8 addrspace(1)* %ina, align 4
   %b = load i8, i8 addrspace(1)* %inb, align 4
@@ -238,7 +238,7 @@
 
 ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
 
-define void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
 entry:
   %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
   %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
@@ -259,7 +259,7 @@
 ; SDWA: v_mul_u32_u24_sdwa
 ; SDWA: v_mul_u32_u24_sdwa
 
-define void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
 entry:
   %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
   %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
@@ -283,7 +283,7 @@
 ; SDWA: v_mul_u32_u24_sdwa
 ; SDWA: v_mul_u32_u24_sdwa
 
-define void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
+define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
 entry:
   %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
   %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
@@ -304,7 +304,7 @@
 ; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
 
-define void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
 entry:
   %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
   %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
@@ -318,7 +318,7 @@
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 
-define void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
   %mul = mul <2 x i16> %a, <i16 123, i16 321>
@@ -337,7 +337,7 @@
 
 ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
-define void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
 entry:
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
   %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
@@ -353,7 +353,7 @@
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 ; SDWA-NOT: v_add_i32_sdwa
 
-define void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) {
+define amdgpu_kernel void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) {
 entry:
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
   %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
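; Aside (illustrative note, not from this commit): SDWA (sub-dword
; addressing, VI+) lets a VALU instruction read or write a byte/word slice
; of a 32-bit register directly, so the peephole folds an explicit 16-bit
; shift into a sel operand instead of emitting it. Roughly, for @add_shr_i32,
;   v_lshrrev_b32_e32 v1, 16, v0        ; NOSDWA: shift, then
;   v_add_i32_e32 v0, vcc, v0, v1       ; add
; becomes the single
;   v_add_i32_sdwa v0, vcc, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; The scalar i16/f16 cases (@mul_i16, @mul_half) must keep the plain VOP
; encodings, hence the SDWA-NOT checks.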
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
index 559d464..c8839c1 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -11,7 +11,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
+define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -29,7 +29,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
+define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0)
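; Aside (illustrative note, not from this commit): for the legacy ops the
; fneg is not pushed into the instruction; the checks above expect the
; select result to be negated afterwards with one integer sign-bit flip,
;   v_cndmask_b32 [[SELECT]], -2.0, [[RCP/MUL]], vcc
;   v_xor_b32     [[NEG_SELECT]], 0x80000000, [[SELECT]]
; with the constant arm pre-negated to -2.0 so the xor restores +2.0.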
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index 246e2bb..3417eb0 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -8,7 +8,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
-define void @add_select_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -30,7 +30,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]]
-define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -57,7 +57,7 @@
 
 ; GCN: buffer_store_dword [[ADD]]
 ; GCN: buffer_store_dword [[X_ABS]]
-define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -80,7 +80,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]]
-define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -104,7 +104,7 @@
 ; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_var_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -123,7 +123,7 @@
 ; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_fabs_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -140,7 +140,7 @@
 
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
-define void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
@@ -155,7 +155,7 @@
 
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_posk_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float 2.0, float 1.0
@@ -172,7 +172,7 @@
 ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_negk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -192,7 +192,7 @@
 ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -209,7 +209,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
-define void @add_select_fabs_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
 
@@ -228,7 +228,7 @@
 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
-define void @add_select_posk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -246,7 +246,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -268,7 +268,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
-define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -295,7 +295,7 @@
 
 ; GCN: buffer_store_dword [[ADD]]
 ; GCN: buffer_store_dword [[NEG_X]]
-define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -318,7 +318,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
-define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -342,7 +342,7 @@
 ; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fneg_var_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -360,7 +360,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -378,7 +378,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -398,7 +398,7 @@
 ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
 
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -415,7 +415,7 @@
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
@@ -432,7 +432,7 @@
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2048.0, float -4096.0
@@ -446,7 +446,7 @@
 
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
-define void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
@@ -463,7 +463,7 @@
 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_negk_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -480,7 +480,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -498,7 +498,7 @@
 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_posk_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -518,7 +518,7 @@
 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_negfabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -541,7 +541,7 @@
 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -564,7 +564,7 @@
 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_neg_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -586,7 +586,7 @@
 ; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_neg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -607,7 +607,7 @@
 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_neg_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -629,7 +629,7 @@
 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_negfabs_neg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -651,7 +651,7 @@
 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
-define void @mul_select_negfabs_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -672,7 +672,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
-define void @mul_select_posk_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -690,7 +690,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
-define void @mul_select_negfabs_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -709,7 +709,7 @@
 ; GCN: v_cmp_ne_u32_e64 vcc
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
-define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -732,7 +732,7 @@
 ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -749,7 +749,7 @@
 ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %add = fsub float %x, 4.0
@@ -765,7 +765,7 @@
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %mul = fmul float %x, 4.0
@@ -782,7 +782,7 @@
 ; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -799,7 +799,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -818,7 +818,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
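; Aside (illustrative note, not from this commit): fneg and fabs are plain
; sign-bit integer ops on GCN (v_xor_b32 0x80000000 / v_and_b32 0x7fffffff),
; and most VALU instructions accept free source modifiers. The tests above
; check that a modifier shared by both select arms is hoisted past the
; select, e.g.
;   fadd (select c, (fabs x), (fabs y)), z
; lowers to one v_cndmask_b32 of the raw values plus a v_add_f32 that
; applies |...| to the selected operand, while multi-use and mixed
; fneg/fabs cases must still materialize the bit ops explicitly.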
diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll
index 07dcb21..5eaad1f 100644
--- a/llvm/test/CodeGen/AMDGPU/select-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll
@@ -6,7 +6,7 @@
 ; FUNC-LABEL: {{^}}select_i1:
 ; SI: v_cndmask_b32
 ; SI-NOT: v_cndmask_b32
-define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
+define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i1 %a, i1 %b
   store i1 %sel, i1 addrspace(1)* %out, align 4
@@ -19,7 +19,7 @@
 ; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]]
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
-define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
+define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
   %cmp = icmp slt i1 %cond, false
   %sel = select i1 %cmp, i1 %a, i1 %b
   store i1 %sel, i1 addrspace(1)* %out, align 4
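; Aside (illustrative note, not from this commit): an i1 select needs only
; one v_cndmask_b32 (the SI-NOT line guards against a second), and the
; zeroext i1 arguments arrive in memory as individual bytes, loaded with
; buffer_load_ubyte at fixed offsets (offset:46 above).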
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index ad358d3..d56b952 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -11,7 +11,7 @@
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %and = and i1 %icmp0, %icmp1
@@ -27,7 +27,7 @@
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %and = and i1 %fcmp0, %fcmp1
@@ -43,7 +43,7 @@
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %and = and i1 %icmp0, %icmp1
@@ -59,7 +59,7 @@
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %and = and i1 %fcmp0, %fcmp1
@@ -76,7 +76,7 @@
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: s_endpgm
-define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %or = or i1 %icmp0, %icmp1
@@ -92,7 +92,7 @@
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %or = or i1 %fcmp0, %fcmp1
@@ -108,7 +108,7 @@
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %or = or i1 %icmp0, %icmp1
@@ -124,7 +124,7 @@
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %or = or i1 %fcmp0, %fcmp1
@@ -138,7 +138,7 @@
 ; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
-define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
+define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry:
   %cmp0 = fcmp oeq float %c0, 1.0
   br i1 %cmp0, label %if0, label %endif
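; Aside (illustrative note, not from this commit): an and/or of two compares
; feeds the select through a single condition mask, so each result costs one
; v_cndmask_b32 per 32-bit word (two for the i64 cases, stored as a
; dwordx2); the GCN-NOT: [[RESULT]] lines verify the result register is not
; touched again between the cndmask and the store.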
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index 759abe2..8710fc8 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -10,7 +10,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
+define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
   %cmp = icmp eq i8 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
@@ -22,7 +22,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
@@ -36,7 +36,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx2
-define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
   store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
@@ -49,7 +49,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx4
-define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
   store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
@@ -64,7 +64,7 @@
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: buffer_store_dwordx4
-define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -82,7 +82,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
   store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
@@ -102,7 +102,7 @@
 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx2
-define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
   store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
@@ -120,7 +120,7 @@
 ; SI: v_cndmask_b32_e32
 
 ; SI: buffer_store_dwordx4
-define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
   store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
@@ -135,7 +135,7 @@
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: buffer_store_dwordx4
-define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
   %val = load <4 x float>, <4 x float> addrspace(1)* %in
@@ -153,7 +153,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
   store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
@@ -165,7 +165,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
   store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
@@ -181,7 +181,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
   store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
@@ -205,7 +205,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
   store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
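; Aside (illustrative note, not from this commit): a vector select with a
; scalar condition is scalarized to one v_cndmask_b32 per 32-bit element, so
; v4i32/v4f32 cost four, v2f64 four, and v8f64 sixteen; the results are then
; stored back in dwordx2/dwordx4 groups.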
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 050398c..ea578c5 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -17,7 +17,7 @@
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16(
+define amdgpu_kernel void @select_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -48,7 +48,7 @@
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_a(
+define amdgpu_kernel void @select_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c,
@@ -78,7 +78,7 @@
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_b(
+define amdgpu_kernel void @select_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c,
@@ -109,7 +109,7 @@
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_c(
+define amdgpu_kernel void @select_f16_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -139,7 +139,7 @@
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_d(
+define amdgpu_kernel void @select_f16_imm_d(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -168,7 +168,7 @@
 ; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16(
+define amdgpu_kernel void @select_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -202,7 +202,7 @@
 ; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_a(
+define amdgpu_kernel void @select_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b,
     <2 x half> addrspace(1)* %c,
@@ -235,7 +235,7 @@
 ; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_b(
+define amdgpu_kernel void @select_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %c,
@@ -272,7 +272,7 @@
 ; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_c(
+define amdgpu_kernel void @select_v2f16_imm_c(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -304,7 +304,7 @@
 ; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_d(
+define amdgpu_kernel void @select_v2f16_imm_d(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
diff --git a/llvm/test/CodeGen/AMDGPU/select.ll b/llvm/test/CodeGen/AMDGPU/select.ll
index 45f3cd5..e53c159 100644
--- a/llvm/test/CodeGen/AMDGPU/select.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.ll
@@ -14,7 +14,7 @@
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
+define amdgpu_kernel void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
                      <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
                      <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
                      i32 %cond) {
diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index a68fdecb..3b4c925 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -7,7 +7,7 @@
 ; CHECK-NOT: s_lshr_b64
 ; CHECK: v_cndmask
 ; CHECK: v_cndmask
-define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
+define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
 entry:
   %0 = icmp ugt i32 %cond, 5
   %1 = select i1 %0, i64 0, i64 %in
@@ -18,7 +18,7 @@
 ; CHECK-LABEL: {{^}}select_trunc_i64:
 ; CHECK: v_cndmask_b32
 ; CHECK-NOT: v_cndmask_b32
-define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 0, i64 %in
   %trunc = trunc i64 %sel to i32
@@ -29,7 +29,7 @@
 ; CHECK-LABEL: {{^}}select_trunc_i64_2:
 ; CHECK: v_cndmask_b32
 ; CHECK-NOT: v_cndmask_b32
-define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
@@ -40,7 +40,7 @@
 ; CHECK-LABEL: {{^}}v_select_trunc_i64_2:
 ; CHECK: v_cndmask_b32
 ; CHECK-NOT: v_cndmask_b32
-define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -54,7 +54,7 @@
 ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
 ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
 ; CHECK: s_endpgm
-define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
index 94d0ace..1861685 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
@@ -3,7 +3,7 @@
 ;CHECK-NOT: SETE
 ;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
 ;CHECK: 1073741824
-define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
   %1 = load float, float addrspace(1)* %in
   %2 = fcmp oeq float %1, 0.0
   %3 = select i1 %2, float 1.0, float 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
index 58a4ee7..1504165 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
@@ -3,7 +3,7 @@
 ;CHECK-NOT: SETE_INT
 ;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
 ;CHECK-NEXT: 2
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %1 = load i32, i32 addrspace(1)* %in
   %2 = icmp eq i32 %1, 0
   %3 = select i1 %2, i32 1, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
index e870ee8..7af5478 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
@@ -6,7 +6,7 @@
 ; CHECK-NEXT: -1
 ; Test a selectcc with i32 LHS/RHS and float True/False
 
-define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32, i32 addrspace(1)* %in
   %1 = icmp sge i32 %0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
index 0f46d4c..8fef3f8 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -7,7 +7,7 @@
 ; EG-NOT: CND
 ; EG: SET{{[NEQGTL]+}}_DX10
 
-define void @test_a(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.000000e+00
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -35,7 +35,7 @@
 ; EG: SET{{[GTEQN]+}}_DX10
 ; EG-NEXT: PRED_
 ; EG-NEXT: ALU clause starting
-define void @test_b(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -59,7 +59,7 @@
 ; Test a CND*_INT instruction with float true/false values
 ; EG-LABEL: {{^}}test_c:
 ; EG: CND{{[GTE]+}}_INT
-define void @test_c(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test_c(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   %1 = select i1 %0, float 2.0, float 3.0
@@ -72,7 +72,7 @@
 ; SI-NEXT: v_cndmask_b32_e64
 ; SI-NOT: cmp
 ; SI-NOT: cndmask
-define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = select i1 %icmp0, i32 -1, i32 0
   store i32 %ext, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll
index 446d4ab3..7eca229 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll
@@ -11,7 +11,7 @@
 ; SI: v_cmp_eq_u64
 ; SI: v_cndmask
 ; SI: v_cndmask
-define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
+define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs
   %1 = select i1 %0, i64 %true, i64 %false
diff --git a/llvm/test/CodeGen/AMDGPU/set-dx10.ll b/llvm/test/CodeGen/AMDGPU/set-dx10.ll
index 57365a6..6867c63 100644
--- a/llvm/test/CodeGen/AMDGPU/set-dx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-dx10.ll
@@ -8,7 +8,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp une float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -22,7 +22,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp une float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -34,7 +34,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -48,7 +48,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -60,7 +60,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ogt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -74,7 +74,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ogt float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -86,7 +86,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oge float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -100,7 +100,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oge float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -112,7 +112,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -126,7 +126,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -138,7 +138,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -152,7 +152,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
index 11ea793..853afa8 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
@@ -3,7 +3,7 @@
 ; EG-LABEL: {{^}}and_setcc_setcc_i32:
 ; EG: AND_INT
 ; EG-NEXT: SETE_INT
-define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %cmp1 = icmp eq i32 %a, -1
   %cmp2 = icmp eq i32 %b, -1
   %and = and i1 %cmp1, %cmp2
@@ -20,7 +20,7 @@
 ; EG: SETE_INT
 ; EG: AND_INT
 ; EG: SETE_INT
-define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+define amdgpu_kernel void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
   %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
   %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   %and = and <4 x i1> %cmp1, %cmp2
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
index d96b03f..8d455d8 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -10,7 +10,7 @@
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @multi_use_fneg_src() #0 {
+define amdgpu_kernel void @multi_use_fneg_src() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -33,7 +33,7 @@
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
 ; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
-define void @multi_foldable_use_fneg_src() #0 {
+define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -59,7 +59,7 @@
 ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]]
 ; GCN-NOT: xor
 ; GCN: buffer_store_dword [[MUL]]
-define void @multi_use_fneg() #0 {
+define amdgpu_kernel void @multi_use_fneg() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -82,7 +82,7 @@
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]]
 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
 ; GCN: buffer_store_dword [[MUL1]]
-define void @multi_foldable_use_fneg() #0 {
+define amdgpu_kernel void @multi_foldable_use_fneg() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -101,7 +101,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_oeq_posk_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_oeq_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_oeq_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -114,7 +114,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ogt_posk_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ogt_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ogt_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -127,7 +127,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_oge_posk_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_oge_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_oge_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -140,7 +140,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_olt_posk_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_olt_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_olt_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -153,7 +153,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ole_posk_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ole_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ole_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -166,7 +166,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_one_posk_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_one_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_one_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -179,7 +179,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ueq_posk_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ueq_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ueq_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -192,7 +192,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ugt_posk_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ugt_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ugt_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -205,7 +205,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_uge_posk_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_uge_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_uge_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -218,7 +218,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ult_posk_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ult_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ult_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -231,7 +231,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ule_posk_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ule_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ule_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -244,7 +244,7 @@
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_une_posk_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_une_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_une_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
index 4ab6da0..caddb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -11,7 +11,7 @@
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 0
@@ -28,7 +28,7 @@
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 0
@@ -42,7 +42,7 @@
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, -1
@@ -56,7 +56,7 @@
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, -1
@@ -70,7 +70,7 @@
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 0
@@ -84,7 +84,7 @@
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 0
@@ -98,7 +98,7 @@
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 1
@@ -111,7 +111,7 @@
 ; GCN: v_cmp_eq_u32_e32 vcc,
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
-define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 1
@@ -124,7 +124,7 @@
 ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[TMP]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, -1
@@ -137,7 +137,7 @@
 ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[TMP]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, -1
@@ -159,7 +159,7 @@
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, 255
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -172,7 +172,7 @@
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
   %b = load i8, i8 addrspace(1)* %b.ptr
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
@@ -186,7 +186,7 @@
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -207,7 +207,7 @@
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -218,7 +218,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -229,7 +229,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 2
@@ -241,7 +241,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 2
@@ -256,7 +256,7 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 1
@@ -267,7 +267,7 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 1
@@ -278,7 +278,7 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 2
diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
index 10d04ba..add90e9 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -9,7 +9,7 @@
 
 ; GCN-DAG: v_cmp_eq_u32_e32
 ; GCN-DAG: v_cmp_eq_u32_e64
-define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %result = icmp eq <2 x i32> %a, %b
   %sext = sext <2 x i1> %result to <2 x i32>
   store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
@@ -26,7 +26,7 @@
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cmp_eq_u32_e64
-define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -43,7 +43,7 @@
 ; FUNC-LABEL: {{^}}f32_oeq:
 ; R600: SETE_DX10
 ; GCN: v_cmp_eq_f32
-define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp oeq float %a, %b
   %1 = sext i1 %0 to i32
@@ -54,7 +54,7 @@
 ; FUNC-LABEL: {{^}}f32_ogt:
 ; R600: SETGT_DX10
 ; GCN: v_cmp_gt_f32
-define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ogt float %a, %b
   %1 = sext i1 %0 to i32
@@ -65,7 +65,7 @@
 ; FUNC-LABEL: {{^}}f32_oge:
 ; R600: SETGE_DX10
 ; GCN: v_cmp_ge_f32
-define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp oge float %a, %b
   %1 = sext i1 %0 to i32
@@ -76,7 +76,7 @@
 ; FUNC-LABEL: {{^}}f32_olt:
 ; R600: SETGT_DX10
 ; GCN: v_cmp_lt_f32
-define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp olt float %a, %b
   %1 = sext i1 %0 to i32
@@ -87,7 +87,7 @@
 ; FUNC-LABEL: {{^}}f32_ole:
 ; R600: SETGE_DX10
 ; GCN: v_cmp_le_f32
-define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ole float %a, %b
   %1 = sext i1 %0 to i32
@@ -105,7 +105,7 @@
 
 ; GCN: v_cmp_lg_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp one float %a, %b
   %1 = sext i1 %0 to i32
@@ -119,7 +119,7 @@
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
 ; GCN: v_cmp_o_f32
-define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ord float %a, %b
   %1 = sext i1 %0 to i32
@@ -137,7 +137,7 @@
 
 ; GCN: v_cmp_nlg_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ueq float %a, %b
   %1 = sext i1 %0 to i32
@@ -150,7 +150,7 @@
 ; R600: SETE_DX10
 ; GCN: v_cmp_nle_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ugt float %a, %b
   %1 = sext i1 %0 to i32
@@ -164,7 +164,7 @@
 
 ; GCN: v_cmp_nlt_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp uge float %a, %b
   %1 = sext i1 %0 to i32
@@ -178,7 +178,7 @@
 
 ; GCN: v_cmp_nge_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ult float %a, %b
   %1 = sext i1 %0 to i32
@@ -192,7 +192,7 @@
 
 ; GCN: v_cmp_ngt_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ule float %a, %b
   %1 = sext i1 %0 to i32
@@ -203,7 +203,7 @@
 ; FUNC-LABEL: {{^}}f32_une:
 ; R600: SETNE_DX10
 ; GCN: v_cmp_neq_f32
-define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp une float %a, %b
   %1 = sext i1 %0 to i32
@@ -217,7 +217,7 @@
 ; R600: OR_INT
 ; R600: SETNE_INT
 ; GCN: v_cmp_u_f32
-define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp uno float %a, %b
   %1 = sext i1 %0 to i32
@@ -232,7 +232,7 @@
 ; FUNC-LABEL: {{^}}i32_eq:
 ; R600: SETE_INT
 ; GCN: v_cmp_eq_u32
-define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp eq i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -243,7 +243,7 @@
 ; FUNC-LABEL: {{^}}i32_ne:
 ; R600: SETNE_INT
 ; GCN: v_cmp_ne_u32
-define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ne i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -254,7 +254,7 @@
 ; FUNC-LABEL: {{^}}i32_ugt:
 ; R600: SETGT_UINT
 ; GCN: v_cmp_gt_u32
-define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ugt i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -265,7 +265,7 @@
 ; FUNC-LABEL: {{^}}i32_uge:
 ; R600: SETGE_UINT
 ; GCN: v_cmp_ge_u32
-define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp uge i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -276,7 +276,7 @@
 ; FUNC-LABEL: {{^}}i32_ult:
 ; R600: SETGT_UINT
 ; GCN: v_cmp_lt_u32
-define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ult i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -287,7 +287,7 @@
 ; FUNC-LABEL: {{^}}i32_ule:
 ; R600: SETGE_UINT
 ; GCN: v_cmp_le_u32
-define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ule i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -298,7 +298,7 @@
 ; FUNC-LABEL: {{^}}i32_sgt:
 ; R600: SETGT_INT
 ; GCN: v_cmp_gt_i32
-define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sgt i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -309,7 +309,7 @@
 ; FUNC-LABEL: {{^}}i32_sge:
 ; R600: SETGE_INT
 ; GCN: v_cmp_ge_i32
-define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sge i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -320,7 +320,7 @@
 ; FUNC-LABEL: {{^}}i32_slt:
 ; R600: SETGT_INT
 ; GCN: v_cmp_lt_i32
-define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp slt i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -331,7 +331,7 @@
 ; FUNC-LABEL: {{^}}i32_sle:
 ; R600: SETGE_INT
 ; GCN: v_cmp_le_i32
-define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sle i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -348,7 +348,7 @@
 ; GCN-DAG: v_cmp_eq_u32
 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
 ; GCN: s_endpgm
-define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid
   %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid
@@ -369,7 +369,7 @@
 ; GCN-DAG: v_cmp_eq_u32
 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
 ; GCN: s_endpgm
-define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid
   %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid
@@ -386,7 +386,7 @@
 ; FUNC-LABEL: setcc-i1
 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1
 ; GCN: s_cmp_eq_u32 [[AND]], 0
-define void @setcc-i1(i32 %in) #0 {
+define amdgpu_kernel void @setcc-i1(i32 %in) #0 {
   %and = and i32 %in, 1
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %endif, label %if
@@ -400,7 +400,7 @@
 ; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
 ; GCN: s_and_b64 s[2:3], [[A]], [[B]]
-define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
+define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
 bb0:
   %tmp5 = fcmp oge float %cond, 0.000000e+00
   %tmp7 = fcmp ole float %cond, 1.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
index 1f86277..1f1bdb0 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -9,7 +9,7 @@
 
 ; GCN-LABEL: {{^}}f64_oeq:
 ; GCN: v_cmp_eq_f64
-define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp oeq double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -19,7 +19,7 @@
 
 ; GCN-LABEL: {{^}}f64_ogt:
 ; GCN: v_cmp_gt_f64
-define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ogt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -29,7 +29,7 @@
 
 ; GCN-LABEL: {{^}}f64_oge:
 ; GCN: v_cmp_ge_f64
-define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp oge double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -39,7 +39,7 @@
 
 ; GCN-LABEL: {{^}}f64_olt:
 ; GCN: v_cmp_lt_f64
-define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp olt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -49,7 +49,7 @@
 
 ; GCN-LABEL: {{^}}f64_ole:
 ; GCN: v_cmp_le_f64
-define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ole double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -60,7 +60,7 @@
 ; GCN-LABEL: {{^}}f64_one:
 ; GCN: v_cmp_lg_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp one double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -70,7 +70,7 @@
 
 ; GCN-LABEL: {{^}}f64_ord:
 ; GCN: v_cmp_o_f64
-define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ord double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -81,7 +81,7 @@
 ; GCN-LABEL: {{^}}f64_ueq:
 ; GCN: v_cmp_nlg_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ueq double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -93,7 +93,7 @@
 
 ; GCN: v_cmp_nle_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ugt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -104,7 +104,7 @@
 ; GCN-LABEL: {{^}}f64_uge:
 ; GCN: v_cmp_nlt_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp uge double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -115,7 +115,7 @@
 ; GCN-LABEL: {{^}}f64_ult:
 ; GCN: v_cmp_nge_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ult double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -126,7 +126,7 @@
 ; GCN-LABEL: {{^}}f64_ule:
 ; GCN: v_cmp_ngt_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ule double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -136,7 +136,7 @@
 
 ; GCN-LABEL: {{^}}f64_une:
 ; GCN: v_cmp_neq_f64
-define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp une double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -146,7 +146,7 @@
 
 ; GCN-LABEL: {{^}}f64_uno:
 ; GCN: v_cmp_u_f64
-define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp uno double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -160,7 +160,7 @@
 
 ; GCN-LABEL: {{^}}i64_eq:
 ; GCN: v_cmp_eq_u64
-define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp eq i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -170,7 +170,7 @@
 
 ; GCN-LABEL: {{^}}i64_ne:
 ; GCN: v_cmp_ne_u64
-define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ne i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -180,7 +180,7 @@
 
 ; GCN-LABEL: {{^}}i64_ugt:
 ; GCN: v_cmp_gt_u64
-define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ugt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -190,7 +190,7 @@
 
 ; GCN-LABEL: {{^}}i64_uge:
 ; GCN: v_cmp_ge_u64
-define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp uge i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -200,7 +200,7 @@
 
 ; GCN-LABEL: {{^}}i64_ult:
 ; GCN: v_cmp_lt_u64
-define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ult i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -210,7 +210,7 @@
 
 ; GCN-LABEL: {{^}}i64_ule:
 ; GCN: v_cmp_le_u64
-define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ule i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -220,7 +220,7 @@
 
 ; GCN-LABEL: {{^}}i64_sgt:
 ; GCN: v_cmp_gt_i64
-define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sgt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -230,7 +230,7 @@
 
 ; GCN-LABEL: {{^}}i64_sge:
 ; GCN: v_cmp_ge_i64
-define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sge i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -240,7 +240,7 @@
 
 ; GCN-LABEL: {{^}}i64_slt:
 ; GCN: v_cmp_lt_i64
-define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp slt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -250,7 +250,7 @@
 
 ; GCN-LABEL: {{^}}i64_sle:
 ; GCN: v_cmp_le_i64
-define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sle i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
diff --git a/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll
index 7dc6eb8..0b780af 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll
@@ -6,7 +6,7 @@
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: SUB_INT {{[* ]*}}[[RES]]
 ; EG-NOT: BFE
-define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+define amdgpu_kernel void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
   %sext = sext i1 %a to i32
   %res = add i32 %b, %sext
   store i32 %res, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT {{[* ]*}}[[RES]]
 ; EG-NOT: BFE
-define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+define amdgpu_kernel void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
   %sext = sext i1 %a to i32
   %res = sub i32 %b, %sext
   store i32 %res, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
index adba6bb..7ac4e1d 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -11,7 +11,7 @@
 ; EG: LSHR {{\*?}} [[ADDR]]
 
 ; Works with the align 2 removed
-define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
   %c = add <2 x i32> %a, %b
   %x = shl <2 x i32> %c, <i32 6, i32 6>
   %y = ashr <2 x i32> %x, <i32 7, i32 7>
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 015448c..82af20f 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -15,7 +15,7 @@
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: LSHR * [[ADDR]]
 ; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
-define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
   %shl = shl i32 %in, 31
   %sext = ashr i32 %shl, 31
   store i32 %sext, i32 addrspace(1)* %out
@@ -32,7 +32,7 @@
 ; EG: ADD_INT
 ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
 ; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %shl = shl i32 %c, 24
   %ashr = ashr i32 %shl, 24
@@ -50,7 +50,7 @@
 ; EG: ADD_INT
 ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
 ; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %shl = shl i32 %c, 16
   %ashr = ashr i32 %shl, 16
@@ -68,7 +68,7 @@
 ; EG: ADD_INT
 ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
 ; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %c = add <1 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <1 x i32> %c, <i32 24>
   %ashr = ashr <1 x i32> %shl, <i32 24>
@@ -82,7 +82,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 63
   %ashr = ashr i64 %shl, 63
@@ -96,7 +96,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 56
   %ashr = ashr i64 %shl, 56
@@ -111,7 +111,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
-define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 48
   %ashr = ashr i64 %shl, 48
@@ -125,7 +125,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 32
   %ashr = ashr i64 %shl, 32
@@ -140,7 +140,7 @@
 ; XGCN: buffer_store_dword
 ; XEG: BFE_INT
 ; XEG: ASHR
-; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 {
+; define amdgpu_kernel void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 {
 ;   %c = add <1 x i64> %a, %b
 ;   %shl = shl <1 x i64> %c, <i64 56>
 ;   %ashr = ashr <1 x i64> %shl, <i64 56>
@@ -160,7 +160,7 @@
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -187,7 +187,7 @@
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -214,7 +214,7 @@
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -238,7 +238,7 @@
 
 ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
 ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}}
-define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -264,7 +264,7 @@
 ; EG: LSHL
 ; EG: ASHR [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b
   %x = shl i32 %c, 6
   %y = ashr i32 %x, 7
@@ -287,7 +287,7 @@
 ; EG: LSHL
 ; EG: ASHR [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b
   %x = shl <2 x i32> %c, <i32 6, i32 6>
   %y = ashr <2 x i32> %x, <i32 7, i32 7>
@@ -305,7 +305,7 @@
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i32> %c, <i32 31, i32 31>
   %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
@@ -326,7 +326,7 @@
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
   %c = add <4 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
   %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
@@ -343,7 +343,7 @@
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i32> %c, <i32 24, i32 24>
   %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
@@ -364,7 +364,7 @@
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
   %c = add <4 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
   %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
@@ -381,7 +381,7 @@
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i32> %c, <i32 16, i32 16>
   %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
@@ -390,7 +390,7 @@
 }
 
 ; FUNC-LABEL: {{^}}testcase:
-define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
+define amdgpu_kernel void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
   %cmp_slt = icmp slt i8 %a, 0
@@ -402,7 +402,7 @@
 }
 
 ; FUNC-LABEL: {{^}}testcase_3:
-define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
+define amdgpu_kernel void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
   %cmp_slt = icmp slt i8 %a, 0
@@ -418,7 +418,7 @@
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
+define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
   %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
   %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -431,7 +431,7 @@
 ; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
-define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
+define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
   %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
   %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -446,7 +446,7 @@
 ; GCN: v_max_i32
 ; GCN-NOT: bfe
 ; GCN: buffer_store_short
-define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 {
+define amdgpu_kernel void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 {
   %tmp5 = load i8, i8 addrspace(1)* %src, align 1
   %tmp2 = sext i8 %tmp5 to i32
   %tmp2.5 = icmp sgt i32 %tmp2, 0
@@ -462,7 +462,7 @@
 ; FUNC-LABEL: {{^}}bfe_0_width:
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -473,7 +473,7 @@
 ; GCN: v_bfe_i32
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
   %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
@@ -484,7 +484,7 @@
 ; FUNC-LABEL: {{^}}bfe_8_bfe_16:
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
 ; GCN: s_endpgm
-define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
   %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
@@ -497,7 +497,7 @@
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
   %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
@@ -510,7 +510,7 @@
 ; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
   %shl = shl i32 %bfe, 24
@@ -520,7 +520,7 @@
 }
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
-define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
   %shl = shl i32 %bfe, 24
@@ -533,7 +533,7 @@
 ; GCN: buffer_load_sbyte
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
   %load = load i8, i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
@@ -547,7 +547,7 @@
 ; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
 ; GCN-NOT: {{[^@]}}bfe
 ; GCN: s_endpgm
-define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
   %load = load i8, i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
@@ -562,7 +562,7 @@
 ; GCN-NOT: shl
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
 ; GCN: s_endpgm
-define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = ashr i32 %shl, 31
@@ -577,7 +577,7 @@
 ; GCN-NOT: shr
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
 ; GCN: s_endpgm
-define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30
   %shr = ashr i32 %shl, 30
@@ -593,7 +593,7 @@
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
 ; GCN: s_endpgm
-define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30
   %shr = ashr i32 %shl, 30
@@ -617,7 +617,7 @@
 ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -647,7 +647,7 @@
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -673,7 +673,7 @@
 ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
   %ld = load i32, i32 addrspace(2)* %ptr
   %in = trunc i32 %ld to i16
   %shl = shl i16 %in, 15
@@ -692,7 +692,7 @@
 ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
-define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
   %ld = load i32, i32 addrspace(2)* %ptr
   %in = trunc i32 %ld to i16
   %shl = shl i16 %in, 14
@@ -706,7 +706,7 @@
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}}
 
 ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
-define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid
   %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid
@@ -727,7 +727,7 @@
 
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
 ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
-define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
+define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -753,7 +753,7 @@
 ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
-define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
   %shl = shl i16 %in, 14
   %sext = ashr i16 %shl, 14
   store i16 %sext, i16 addrspace(1)* %out
@@ -770,7 +770,7 @@
 ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
   %shl = shl i16 %in, 8
   %sext = ashr i16 %shl, 8
   store i16 %sext, i16 addrspace(1)* %out
@@ -787,7 +787,7 @@
 ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
-define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
   %shl = shl i16 %in, 1
   %sext = ashr i16 %shl, 1
   store i16 %sext, i16 addrspace(1)* %out
@@ -798,7 +798,7 @@
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
 ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]]
 ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]]
-define void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
   %c = add <2 x i16> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i16> %c, <i16 15, i16 15>
   %ashr = ashr <2 x i16> %shl, <i16 15, i16 15>
@@ -813,7 +813,7 @@
 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
-define void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
   %c = add <3 x i16> %a, %b ; add to prevent folding into extload
   %shl = shl <3 x i16> %c, <i16 15, i16 15, i16 15>
   %ashr = ashr <3 x i16> %shl, <i16 15, i16 15, i16 15>
@@ -825,7 +825,7 @@
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
 ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]]
 ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]]
-define void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
   %c = add <2 x i16> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i16> %c, <i16 14, i16 14>
   %ashr = ashr <2 x i16> %shl, <i16 14, i16 14>
@@ -837,7 +837,7 @@
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
 ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]]
 ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]]
-define void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
   %c = add <2 x i16> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i16> %c, <i16 8, i16 8>
   %ashr = ashr <2 x i16> %shl, <i16 8, i16 8>
@@ -852,7 +852,7 @@
 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
-define void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
   %c = add <3 x i16> %a, %b ; add to prevent folding into extload
   %shl = shl <3 x i16> %c, <i16 8, i16 8, i16 8>
   %ashr = ashr <3 x i16> %shl, <i16 8, i16 8, i16 8>
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index d5d2f6b..8e18ab5 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -13,7 +13,7 @@
 
 ; SI: s_sub
 
-define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -52,7 +52,7 @@
 ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
 ; SI: buffer_store_dword
 ; SI-NEXT: s_endpgm
-define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -79,7 +79,7 @@
 ; SI: s_add_i32 [[SGPR:s[0-9]+]]
 ; SI-NOT: s_add_i32 [[SGPR]]
 
-define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid_f = uitofp i32 %tid to float
@@ -116,7 +116,7 @@
 ; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp eq i32 %tid, 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index f44ae6e..fb0bbaa 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -6,7 +6,7 @@
 
 ; SI-LABEL: {{^}}test_dup_operands:
 ; SI: v_add_i32_e32
-define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
+define amdgpu_kernel void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %lo = extractelement <2 x i32> %a, i32 0
   %hi = extractelement <2 x i32> %a, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index 79bb5af..5c20e9a 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -265,7 +265,7 @@
 ; CHECK: buffer_load_dword
 ; CHECK: v_add
 ; CHECK: s_endpgm
-define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
+define amdgpu_kernel void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
 entry:
   %tmp = load float, float addrspace(1)* %in0
   %tmp1 = fcmp oeq float %tmp, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 48bbc32..b3cb19a 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -11,7 +11,7 @@
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -33,7 +33,7 @@
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -55,7 +55,7 @@
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -77,7 +77,7 @@
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -100,7 +100,7 @@
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index b85714e..744c1c2 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -8,7 +8,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -25,7 +25,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -41,7 +41,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -57,7 +57,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -73,7 +73,7 @@
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -89,7 +89,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -105,7 +105,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -121,7 +121,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -137,7 +137,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -155,7 +155,7 @@
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -171,7 +171,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -188,7 +188,7 @@
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -204,7 +204,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
 ; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -220,7 +220,7 @@
 ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
 ; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
-define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -236,7 +236,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 ; GCN: buffer_store_dword v[[SHIFT]]
-define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -252,7 +252,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
 ; GCN: buffer_store_dword [[BFE]]
-define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -268,7 +268,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
 ; GCN: buffer_store_dword [[BFE]]
-define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -286,7 +286,7 @@
 ; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
 ; GCN-NOT: v[[SHRLO]]
 ; GCN: buffer_store_dword v[[SHRLO]]
-define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -306,7 +306,7 @@
 ; GCN-NOT: v[[SHRLO]]
 ; GCN-NOT: v[[SHRHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
-define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -327,7 +327,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -347,7 +347,7 @@
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
-define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -365,7 +365,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 ; GCN: buffer_store_dword v[[ZERO]]
-define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 28a7b92..a803849 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -8,7 +8,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 35
   store i64 %shl, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 63
   store i64 %shl, i64 addrspace(1)* %out
@@ -32,7 +32,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 33
   store i64 %shl, i64 addrspace(1)* %out
@@ -43,7 +43,7 @@
 ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 32
   store i64 %shl, i64 addrspace(1)* %out
@@ -58,7 +58,7 @@
 ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23
 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff
   %shl = lshr i64 %and, 40
@@ -73,7 +73,7 @@
 ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]]
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 35
   store i64 %shl, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@
 ; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 32
   store i64 %shl, i64 addrspace(1)* %out
@@ -96,7 +96,7 @@
 ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]]
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 63
   store i64 %shl, i64 addrspace(1)* %out
@@ -106,7 +106,7 @@
 ; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x)
 
 ; GCN-LABEL: {{^}}ashr_i64_const_32:
-define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = ashr i64 %val, 32
   store i64 %shl, i64 addrspace(1)* %out
@@ -114,7 +114,7 @@
 }
 
 ; GCN-LABEL: {{^}}ashr_i64_const_63:
-define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = ashr i64 %val, 63
   store i64 %shl, i64 addrspace(1)* %out
@@ -125,7 +125,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]]
 ; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 31
   %trunc = trunc i64 %shl to i32
@@ -137,7 +137,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
 ; GCN: buffer_store_short [[SHL]]
-define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 15
   %trunc = trunc i64 %shl to i16
@@ -149,7 +149,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
 ; GCN: buffer_store_short [[SHL]]
-define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %val = load i32, i32 addrspace(1)* %in
   %shl = shl i32 %val, 15
   %trunc = trunc i32 %shl to i16
@@ -161,7 +161,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]]
 ; GCN: buffer_store_byte [[SHL]]
-define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 7
   %trunc = trunc i64 %shl to i8
@@ -174,7 +174,7 @@
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
 ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]]
 ; GCN: buffer_store_byte [[AND]]
-define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 1
   %trunc = trunc i64 %shl to i2
@@ -186,7 +186,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
 ; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 1
   %trunc = trunc i64 %shl to i32
@@ -198,7 +198,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]]
 ; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 16
   %trunc = trunc i64 %shl to i32
@@ -209,7 +209,7 @@
 ; GCN-LABEL: {{^}}trunc_shl_33_i32_i64:
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[ZERO]]
-define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 33
   %trunc = trunc i64 %shl to i32
@@ -222,7 +222,7 @@
 ; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}}
 ; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
-define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %shl = shl <2 x i64> %val, <i64 16, i64 16>
   %trunc = trunc <2 x i64> %shl to <2 x i32>
@@ -235,7 +235,7 @@
 ; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31
 ; GCN: buffer_store_dword v[[RESLO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
-define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 31
   %trunc = trunc i64 %shl to i32
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 8844273..f6520ee 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -17,7 +17,7 @@
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +44,7 @@
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -57,7 +57,7 @@
 ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %a = load i16, i16 addrspace(1)* %in
   %b = load i16, i16 addrspace(1)* %b_ptr
@@ -70,7 +70,7 @@
 ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 
 ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
   %a = load i16, i16 addrspace(1)* %in
   %result = shl i16 %a, %b
   store i16 %result, i16 addrspace(1)* %out
@@ -81,7 +81,7 @@
 ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
   %a = load i16, i16 addrspace(1)* %in
   %b.add = add i16 %b, 3
   %result = shl i16 %a, %b.add
@@ -92,7 +92,7 @@
 ; GCN-LABEL: {{^}}shl_i16_computed_amount:
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 3, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, [[ADD]], v{{[0-9]+}}
-define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
@@ -107,7 +107,7 @@
 
 ; GCN-LABEL: {{^}}shl_i16_i_s:
 ; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 12
-define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
+define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
   %result = shl i16 %a, 12
   store i16 %result, i16 addrspace(1)* %out
   ret void
@@ -116,7 +116,7 @@
 ; GCN-LABEL: {{^}}shl_v2i16:
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -133,7 +133,7 @@
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
@@ -160,7 +160,7 @@
 ; GCN-LABEL: {{^}}shl_i64:
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 ; VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
   %b = load i64, i64 addrspace(1)* %b_ptr
@@ -199,7 +199,7 @@
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
-define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -262,7 +262,7 @@
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
-define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -277,7 +277,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = shl i64 %a, 32
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -287,7 +287,7 @@
 ; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
-define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -299,7 +299,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_constant_i64
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %shl = shl i64 281474976710655, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -311,7 +311,7 @@
 ; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}}
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
 ; SI: buffer_store_dwordx2
-define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %shl = shl i64 1231231234567, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -323,7 +323,7 @@
 ; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}}
 ; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}}
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
-define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %shl = shl i64 1234567, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -332,7 +332,7 @@
 
 ; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64:
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}}
-define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %shl = shl i64 64, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -341,7 +341,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}}
-define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 64, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -349,7 +349,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}}
-define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 1, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -357,7 +357,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4607182418800017408, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -365,7 +365,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13830554455654793216, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -373,7 +373,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}}
-define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4602678819172646912, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -381,7 +381,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13826050856027422720, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -389,7 +389,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4611686018427387904, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -397,7 +397,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13835058055282163712, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -405,7 +405,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4616189618054758400, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -413,7 +413,7 @@
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13839561654909534208, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -427,7 +427,7 @@
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 1082130432, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -439,7 +439,7 @@
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
 ; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 -1065353216, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -450,7 +450,7 @@
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4647714815446351872, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -460,7 +460,7 @@
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13871086852301127680, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -468,7 +468,7 @@
 
 ; FUNC-LABEL: {{^}}test_mul2:
 ; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-define void @test_mul2(i32 %p) {
+define amdgpu_kernel void @test_mul2(i32 %p) {
    %i = mul i32 %p, 2
    store volatile i32 %i, i32 addrspace(1)* undef
    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index bf4a623..04145f7 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -13,7 +13,7 @@
 ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CIVI: v_or_b32_e32
-define void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
   %result = shl <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -55,7 +55,7 @@
 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -70,7 +70,7 @@
 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -84,7 +84,7 @@
 ; GCN-LABEL: {{^}}shl_imm_v_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
-define void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -98,7 +98,7 @@
 ; GCN-LABEL: {{^}}shl_v_imm_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
-define void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -115,7 +115,7 @@
 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -133,7 +133,7 @@
 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
index 9b5f9fe..9da4bc0 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -9,7 +9,7 @@
 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
   %val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -25,7 +25,7 @@
 ; SI-DAG: buffer_store_dword [[ADDREG]]
 ; SI-DAG: buffer_store_dword [[SHLREG]]
 ; SI: s_endpgm
-define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
   %val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -43,7 +43,7 @@
 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
   %val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -61,7 +61,7 @@
 ; SI: s_addk_i32 [[RESULT]], 0x3d8
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
 ; SI: buffer_store_dword [[VRESULT]]
-define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %shl, %y
@@ -78,7 +78,7 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]]
 ; SI: buffer_store_dword [[VRESULT]]
 
-define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %y, %shl
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 6e45759..9147eb5 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -19,7 +19,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
 ; SI: s_endpgm
-define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -39,7 +39,7 @@
 ; SI-DAG: buffer_store_dword [[RESULT]]
 ; SI-DAG: buffer_store_dword [[ADDUSE]]
 ; SI: s_endpgm
-define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -55,7 +55,7 @@
 ; SI-LABEL: {{^}}load_shl_base_lds_max_offset
 ; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
 ; SI: s_endpgm
-define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 65535
   %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
@@ -73,7 +73,7 @@
 ; SI: s_mov_b32 m0, -1
 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
 ; SI: s_endpgm
-define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -89,7 +89,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -104,7 +104,7 @@
 
 @lds2 = addrspace(3) global [512 x i32] undef, align 4
 
-; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ;   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 ;   %idx.0 = add nsw i32 %tid.x, 2
 ;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -119,7 +119,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
+define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -134,7 +134,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -148,7 +148,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -162,7 +162,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -176,7 +176,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -190,7 +190,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -204,7 +204,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -214,7 +214,7 @@
   ret void
 }
 
-; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ;   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 ;   %idx.0 = add nsw i32 %tid.x, 2
 ;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -228,7 +228,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -242,7 +242,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -256,7 +256,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -270,7 +270,7 @@
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 08d44cc..14ca635 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_64:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
-define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -23,7 +23,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]]
 ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
 ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
-define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -40,7 +40,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_64_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
-define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -54,7 +54,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_65:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]]
-define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -68,7 +68,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_65_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]]
-define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -82,7 +82,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]]
-define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -96,7 +96,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]]
-define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -110,7 +110,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]]
-define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -124,7 +124,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]]
-define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -138,7 +138,7 @@
 ; GCN-LABEL: {{^}}s_test_i32_x_sub_64:
 ; GCN: s_load_dword [[X:s[0-9]+]]
 ; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64
-define void @s_test_i32_x_sub_64(i32 %x) #0 {
+define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
   %result = sub i32 %x, 64
   call void asm sideeffect "; use $0", "s"(i32 %result)
   ret void
@@ -147,7 +147,7 @@
 ; GCN-LABEL: {{^}}v_test_i16_x_sub_64:
 ; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
-define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
@@ -166,7 +166,7 @@
 
 ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
 ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
-define void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index 1988a14..6248d8a 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -7,7 +7,7 @@
 # resume crashes
 
 --- |
-  define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -20,7 +20,7 @@
     ret void
   }
 
-  define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -33,7 +33,7 @@
     ret void
   }
 
-  define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -46,7 +46,7 @@
     ret void
   }
 
-  define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -59,7 +59,7 @@
     ret void
   }
 
-  define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -72,7 +72,7 @@
     ret void
   }
 
-  define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
index ef616eb..2bc734c 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -9,7 +9,7 @@
 ; GCN: s_cbranch_vccnz
 ; GCN-NOT: s_endpgm
 ; GCN: .Lfunc_end0
-define void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -40,7 +40,7 @@
 ; GCN: s_cbranch_scc1
 ; GCN: s_endpgm
 ; GCN: .Lfunc_end1
-define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
index 50c3887..e50c595 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
@@ -11,7 +11,7 @@
 ; GCN: s_and_saveexec_b64
 ; GCN-NOT: s_endpgm
 ; GCN: .Lfunc_end0
-define void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index d658b22..a4b6d1f 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -10,7 +10,7 @@
 ; SI: s_andn2_b64
 ; SI: s_cbranch_execnz [[LOOP_LABEL]]
 ; SI: s_endpgm
-define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
 main_body:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %0 = and i32 %a, %tid
@@ -40,7 +40,7 @@
 ; SI: s_cbranch_execnz [[LOOP_LABEL]]
 ; SI: s_endpgm
 
-define void @phi_cond_outside_loop(i32 %b) {
+define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %0 = icmp eq i32 %tid , 0
@@ -68,7 +68,7 @@
 ; CHECK-LABEL: {{^}}switch_unreachable:
 ; CHECK-NOT: s_endpgm
 ; CHECK: .Lfunc_end2
-define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
 centry:
   switch i32 %x, label %sw.default [
     i32 0, label %sw.bb
@@ -100,7 +100,7 @@
 
 ; SI: [[ENDPGM]]:
 ; SI: s_endpgm
-define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
+define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
 entry:
   %cmp = icmp sgt i32 %c0, 0
   br label %while.cond.outer
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
index 025a3d8f..b0473f3 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -6,7 +6,7 @@
 ; CHECK: s_or_b64 exec, exec
 ; CHECK: s_andn2_b64 exec, exec
 ; CHECK: s_cbranch_execnz
-define void @test(i32 %arg, i32 %arg1) {
+define amdgpu_kernel void @test(i32 %arg, i32 %arg1) {
 bb:
   %tmp = icmp ne i32 %arg, 0
   %tmp7 = icmp ne i32 %arg1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
index 0c08deb..20052e8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies %s -o - | FileCheck %s -check-prefixes=GCN
 
 --- |
-  define void @phi_visit_order() { ret void }
+  define amdgpu_kernel void @phi_visit_order() { ret void }
 
 name: phi_visit_order
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index 0d1de66..580268d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -5,7 +5,7 @@
 
 ; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec
 
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load volatile i32, i32 addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 8d66df2..89c1eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -14,7 +14,7 @@
 ; GCN: [[UNREACHABLE]]:
 ; GCN: ds_write_b32
 ; GCN: s_waitcnt
-define void @lower_control_flow_unreachable_terminator() #0 {
+define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
 bb:
   %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp63 = icmp eq i32 %tmp15, 32
@@ -41,7 +41,7 @@
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: ds_write_b32
 ; GCN: s_waitcnt
-define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
+define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
 bb:
   %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp63 = icmp eq i32 %tmp15, 32
diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index f5ae4f5..114c97b 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -24,7 +24,7 @@
 
 ; SMEM: s_dcache_wb
 ; ALL: s_endpgm
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 313f5ee..8a4cee2 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
-define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -33,7 +33,7 @@
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
 ; CI: buffer_store_dword
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -53,7 +53,7 @@
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
 ; CI: buffer_store_dword
-define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -77,7 +77,7 @@
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
 ; CI: buffer_store_dword
-define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -100,7 +100,7 @@
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
-define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -122,7 +122,7 @@
 ; CI: s_load_dword
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
-define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
   %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
@@ -141,7 +141,7 @@
 ; CI: buffer_load_dword
 ; CI: buffer_load_dword
 ; CI: buffer_store_dword
-define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
   %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
 
@@ -161,7 +161,7 @@
 ; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
-define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
   %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
   %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
@@ -187,7 +187,7 @@
 ; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
-define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
   %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
   %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102
@@ -221,7 +221,7 @@
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}}
 ; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
-define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
+define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
index dd8783d..7990990 100644
--- a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -12,7 +12,7 @@
 ; CHECK: buffer_store_byte
 ; ModuleID = 'radeon'
 
-define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+define amdgpu_kernel void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
 entry:
   %0 = load i8, i8 addrspace(1)* %in0, align 1
   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 875351c..3e452c2 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -4,7 +4,7 @@
 ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
 ; GCN: v_cndmask_b32_e64
 ; GCN: s_endpgm
-define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -14,7 +14,7 @@
 ; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
 ; GCN: s_ashr_i32
 ; GCN: s_endpgm
-define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
 entry:
   %mul = mul i32 %a, %b
   %add = add i32 %mul, %c
@@ -28,7 +28,7 @@
 ; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
 ; GCN: s_endpgm
-define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
@@ -38,7 +38,7 @@
 ; GCN-LABEL: {{^}}s_sext_i32_to_i64:
 ; GCN: s_ashr_i32
 ; GCN: s_endpgm
-define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
   %sext = sext i32 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
   ret void
@@ -47,7 +47,7 @@
 ; GCN-LABEL: {{^}}v_sext_i32_to_i64:
 ; GCN: v_ashr
 ; GCN: s_endpgm
-define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %sext = sext i32 %val to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
@@ -56,7 +56,7 @@
 
 ; GCN-LABEL: {{^}}s_sext_i16_to_i64:
 ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
-define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
+define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
   %sext = sext i16 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
   ret void
@@ -65,7 +65,7 @@
 ; GCN-LABEL: {{^}}s_sext_i1_to_i16:
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
 ; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i16
   store i16 %sext, i16 addrspace(1)* %out
@@ -79,7 +79,7 @@
 ; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and:
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
 ; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
   %cmp0 = icmp eq i32 %a, %b
   %cmp1 = icmp eq i32 %c, %d
   %cmp = and i1 %cmp0, %cmp1
@@ -91,7 +91,7 @@
 ; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and:
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
 ; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %cmp0 = icmp eq i32 %a, %tid
   %cmp1 = icmp eq i32 %b, %c
@@ -130,7 +130,7 @@
 ; GCN-DAG: buffer_store_dword [[VEXT3]]
 
 ; GCN: s_endpgm
-define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
   %cast = bitcast i32 %a to <4 x i8>
   %ext = sext <4 x i8> %cast to <4 x i32>
   %elt0 = extractelement <4 x i32> %ext, i32 0
@@ -162,7 +162,7 @@
 ; GCN: buffer_store_dword [[EXT1]]
 ; GCN: buffer_store_dword [[EXT2]]
 ; GCN: buffer_store_dword [[EXT3]]
-define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32, i32 addrspace(1)* %in
   %cast = bitcast i32 %a to <4 x i8>
   %ext = sext <4 x i8> %cast to <4 x i32>
@@ -184,7 +184,7 @@
 ; GCN-DAG: s_sext_i32_i16
 ; GCN-DAG: s_sext_i32_i16
 ; GCN: s_endpgm
-define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
   %cast = bitcast i64 %a to <4 x i16>
   %ext = sext <4 x i16> %cast to <4 x i32>
   %elt0 = extractelement <4 x i32> %ext, i32 0
@@ -206,7 +206,7 @@
 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN: s_endpgm
-define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %a = load i64, i64 addrspace(1)* %in
   %cast = bitcast i64 %a to <4 x i16>
   %ext = sext <4 x i16> %cast to <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 68dc3c6..f98a716 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -4,7 +4,7 @@
 
 ; SI-LABEL: {{^}}sint_to_fp_i32_to_f64
 ; SI: v_cvt_f64_i32_e32
-define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
   %result = sitofp i32 %in to double
   store double %result, double addrspace(1)* %out
   ret void
@@ -19,7 +19,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; SI: s_endpgm
-define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = sitofp i1 %cmp to double
   store double %fp, double addrspace(1)* %out, align 4
@@ -31,14 +31,14 @@
 ; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[IRESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
   %fp = sitofp i1 %in to double
   store double %fp, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; SI-LABEL: @s_sint_to_fp_i64_to_f64
-define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
   %result = sitofp i64 %in to double
   store double %result, double addrspace(1)* %out
   ret void
@@ -51,7 +51,7 @@
 ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %val = load i64, i64 addrspace(1)* %gep, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 5df8105..04cd199 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -4,7 +4,7 @@
 ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f16:
-define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
   %result = sitofp i64 %in to half
   store half %result, half addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@
 ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
 ; GCN: v_cvt_f16_f32_e32 [[SIGN_SEL_F16:v[0-9]+]], [[SIGN_SEL]]
 ; GCN: {{buffer|flat}}_store_short {{.*}}[[SIGN_SEL_F16]]
-define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -39,7 +39,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32:
-define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
   %result = sitofp i64 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -62,7 +62,7 @@
 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
 ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]]
-define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -74,14 +74,14 @@
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f32:
 ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1,
-define void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0 {
   %result = sitofp <2 x i64> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f32:
-define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -93,14 +93,14 @@
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f16:
 ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1,
-define void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0 {
   %result = sitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f16:
-define void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
index 4c8fea1..8e85d99 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -6,7 +6,7 @@
 ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}}
 
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %result = sitofp i32 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@
 ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
 
 ; R600: INT_TO_FLT
-define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -32,7 +32,7 @@
 
 ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
   %result = sitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = sitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -81,7 +81,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
   store float %fp, float addrspace(1)* %out
@@ -92,7 +92,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
   %fp = sitofp i1 %in to float
   store float %fp, float addrspace(1)* %out
   ret void
@@ -105,7 +105,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
 ; SI: s_endpgm
-define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index b216cbb..bbe8d7c 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -7,7 +7,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sitofp_i16_to_f16(
+define amdgpu_kernel void @sitofp_i16_to_f16(
     half addrspace(1)* %r,
     i16 addrspace(1)* %a) {
 entry:
@@ -23,7 +23,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sitofp_i32_to_f16(
+define amdgpu_kernel void @sitofp_i32_to_f16(
     half addrspace(1)* %r,
     i32 addrspace(1)* %a) {
 entry:
@@ -45,7 +45,7 @@
 ; GCN-DAG: v_or_b32_e32
 ; GCN:     buffer_store_dword
 ; GCN:     s_endpgm
-define void @sitofp_v2i16_to_v2f16(
+define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a) {
 entry:
@@ -65,7 +65,7 @@
 ; GCN-DAG: v_or_b32_e32
 ; GCN:     buffer_store_dword
 ; GCN:     s_endpgm
-define void @sitofp_v2i32_to_v2f16(
+define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i32> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index 16d7439..8665ab6 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
 ; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -25,7 +25,7 @@
 ; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32:
 ; GCN: v_max_i32
 ; GCN: v_min_i32
-define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -45,7 +45,7 @@
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32:
 ; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
 ; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -64,7 +64,7 @@
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32:
 ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
 ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -83,7 +83,7 @@
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64:
 ; GCN: v_cmp_lt_i64
 ; GCN: v_cmp_gt_i64
-define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -102,7 +102,7 @@
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
 ; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
@@ -174,7 +174,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -186,7 +186,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -198,7 +198,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -210,7 +210,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -222,7 +222,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -234,7 +234,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -246,7 +246,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -258,7 +258,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -270,7 +270,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -282,7 +282,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -294,7 +294,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -306,7 +306,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -318,7 +318,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -330,7 +330,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -342,7 +342,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -354,7 +354,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -370,7 +370,7 @@
 ; GCN: s_sext_i32_i16
 ; GCN: s_sext_i32_i16
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
 bb:
   %tmp0 = call i16 @smin16(i16 %x, i16 %y)
   %tmp1 = call i16 @smax16(i16 %x, i16 %y)
@@ -385,7 +385,7 @@
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
 bb:
   %tmp0 = call i8 @smin8(i8 %x, i8 %y)
   %tmp1 = call i8 @smax8(i8 %x, i8 %y)
@@ -397,7 +397,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0:
 ; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -410,7 +410,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1:
 ; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -423,7 +423,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2:
 ; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -436,7 +436,7 @@
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -457,7 +457,7 @@
 ; VI: v_max_i16
 
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll
index ce5d924..827d672 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll
@@ -7,7 +7,7 @@
 ; GCN: s_add_i32
 
 ; EG: MAX_INT
-define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
   %res = select i1 %cond, i32 %val, i32 %neg
@@ -22,7 +22,7 @@
 ; GCN: v_add_i32
 
 ; EG: MAX_INT
-define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
   %val = load i32, i32 addrspace(1)* %src, align 4
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
@@ -36,7 +36,7 @@
 ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
 ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
 ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]]
-define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
   %val = load i32, i32 addrspace(1)* %src, align 4
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
@@ -54,7 +54,7 @@
 
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -79,7 +79,7 @@
 
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -109,7 +109,7 @@
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
   %z0 = insertelement <4 x i32> undef, i32 0, i32 0
   %z1 = insertelement <4 x i32> %z0, i32 0, i32 1
   %z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -146,7 +146,7 @@
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
   %z0 = insertelement <4 x i32> undef, i32 0, i32 0
   %z1 = insertelement <4 x i32> %z0, i32 0, i32 1
   %z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -170,7 +170,7 @@
 
 ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
 ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
-define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
+define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
   %cond0 = icmp sgt i32 %val0, %val1
   %sel0 = select i1 %cond0, i32 %val0, i32 %val1
   %sel1 = select i1 %cond0, i32 %val1, i32 %val0
@@ -186,7 +186,7 @@
 
 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
 ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
-define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
   %val0 = load volatile i32, i32 addrspace(1)* %ptr0
   %val1 = load volatile i32, i32 addrspace(1)* %ptr1
 
@@ -208,7 +208,7 @@
 ; GCN-DAG: s_max_i32
 ; GCN-DAG: s_max_i32
 ; GCN-DAG: s_max_i32
-define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
+define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
   %cond0 = icmp sgt <4 x i32> %val0, %val1
   %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1
   %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0
@@ -223,7 +223,7 @@
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
-define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
   %val0 = load volatile i32, i32 addrspace(1)* %ptr0
   %val1 = load volatile i32, i32 addrspace(1)* %ptr1
 
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 01d19a0..8da5ada 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -17,7 +17,7 @@
 ; CIVI: v_add_i32_e32
 ; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
 ; CIVI: v_or_b32_e32
-define void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
   %neg = sub <2 x i16> zeroinitializer, %val
   %cond = icmp sgt <2 x i16> %val, %neg
   %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
@@ -41,7 +41,7 @@
 ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
 ; VI-NOT: v_and_b32
 ; VI: v_or_b32_e32
-define void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -59,7 +59,7 @@
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
-define void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
   %t0 = insertelement <2 x i16> undef, i16 2, i16 0
@@ -77,7 +77,7 @@
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
-define void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
   %t0 = insertelement <2 x i16> undef, i16 2, i16 0
@@ -101,7 +101,7 @@
 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]]
 ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
-define void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
+define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
   %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
@@ -128,7 +128,7 @@
 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
 ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
-define void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
+define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
   %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
@@ -147,7 +147,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_min_max_v2i16:
-define void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
+define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
   %cond0 = icmp sgt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
@@ -158,7 +158,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_min_max_v2i16:
-define void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
   %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
   %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
 
@@ -172,7 +172,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_min_max_v4i32:
-define void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
+define amdgpu_kernel void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
   %cond0 = icmp sgt <4 x i16> %val0, %val1
   %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1
   %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0
@@ -183,7 +183,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_min_max_v2i16_user:
-define void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
   %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
   %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
 
@@ -200,7 +200,7 @@
 ; GCN-LABEL: {{^}}u_min_max_v2i16:
 ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 ; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
+define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
   %cond0 = icmp ugt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
diff --git a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index daac5b9..343211b 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -12,7 +12,7 @@
 ; GCN: buffer_store_dword
 ; GCN: [[EXIT]]:
 ; GCN: s_endpgm
-define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
+define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
 entry:
   %cnd = fcmp oeq float 0.0, %cond
   %sgpr = load volatile i32, i32 addrspace(2)* %in
@@ -32,7 +32,7 @@
 ; GCN: buffer_store_dword
 ; GCN: [[EXIT]]:
 ; GCN: s_endpgm
-define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
   %vgpr = load volatile float, float addrspace(1)* %in
   %cnd = fcmp oeq float 0.0, %vgpr
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 2384b74..50f72c6 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
   %tmp1 = load i32, i32 addrspace(2)* %tmp
@@ -18,7 +18,7 @@
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
   %tmp1 = load i32, i32 addrspace(2)* %tmp
@@ -33,7 +33,7 @@
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
   %tmp1 = load i32, i32 addrspace(2)* %tmp
@@ -48,7 +48,7 @@
 ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; GCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296
   %tmp1 = load i32, i32 addrspace(2)* %tmp
@@ -62,7 +62,7 @@
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
   %tmp1 = load i32, i32 addrspace(2)* %tmp
@@ -76,7 +76,7 @@
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
   %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
   %tmp1 = load i32, i32 addrspace(2)* %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
index 74acc5b..c0f773ca 100644
--- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
@@ -9,7 +9,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}
-define void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -25,7 +25,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 32767
   br i1 %cmp0, label %endif, label %if
@@ -41,7 +41,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max_p1:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -57,7 +57,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ne_i32_simm16_max_p1:
 ; GCN: s_cmpk_lg_u32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -73,7 +73,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, -32768
   br i1 %cmp0, label %endif, label %if
@@ -89,7 +89,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min_m1:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, -32769
   br i1 %cmp0, label %endif, label %if
@@ -105,7 +105,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm15_max:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65535
   br i1 %cmp0, label %endif, label %if
@@ -121,7 +121,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65535
   br i1 %cmp0, label %endif, label %if
@@ -137,7 +137,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max_p1:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0x10000{{$}}
-define void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65536
   br i1 %cmp0, label %endif, label %if
@@ -154,7 +154,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -170,7 +170,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ne_i32:
 ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -186,7 +186,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -202,7 +202,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 32767
   br i1 %cmp0, label %endif, label %if
@@ -218,7 +218,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max_p1:
 ; GCN: s_cmp_gt_i32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -234,7 +234,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_sge_i32:
 ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sge i32 %cond, %size
@@ -251,7 +251,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_slt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp slt i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -267,7 +267,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_sle_i32:
 ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sle i32 %cond, %size
@@ -284,7 +284,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ugt_i32:
 ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ugt i32 %cond, %size
@@ -301,7 +301,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_uge_i32:
 ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp uge i32 %cond, %size
@@ -318,7 +318,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -334,7 +334,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16:
 ; GCN: s_cmp_lt_u32 s2, 0xffff8000
-define void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, -32768
   br i1 %cmp0, label %endif, label %if
@@ -350,7 +350,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16_m1:
 ; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, -32769
   br i1 %cmp0, label %endif, label %if
@@ -366,7 +366,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ule_i32:
 ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ule i32 %cond, %size
@@ -383,7 +383,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_eq_i32:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp eq i32 %size, %cond
@@ -400,7 +400,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_ne_i32:
 ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ne i32 %size, %cond
@@ -417,7 +417,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_sgt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sgt i32 %size, %cond
@@ -434,7 +434,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_sge_i32:
 ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sge i32 %size, %cond
@@ -451,7 +451,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_slt_i32:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp slt i32 %size, %cond
@@ -468,7 +468,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_sle_i32:
 ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sle i32 %size, %cond
@@ -485,7 +485,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_ugt_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ugt i32 %size, %cond
@@ -502,7 +502,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_uge_i32:
 ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp uge i32 %size, %cond
@@ -519,7 +519,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_ult_i32:
 ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ult i32 %size, %cond
@@ -536,7 +536,7 @@
 
 ; GCN-LABEL: {{^}}commute_br_scc_ule_i32:
 ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ule i32 %size, %cond
@@ -553,7 +553,7 @@
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16:
 ; GCN: s_cmp_lt_u32 s2, 0xfffff7ff
-define void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %not.size = xor i32 %size, -1
@@ -573,7 +573,7 @@
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
 
 ; SI: v_cmp_eq_u64_e64
-define void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i64 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -593,7 +593,7 @@
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; SI: v_cmp_eq_u64_e32
-define void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i64 %cond, 1234
   br i1 %cmp0, label %endif, label %if
@@ -611,7 +611,7 @@
 ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
 
 ; SI: v_cmp_ne_u64_e64
-define void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i64 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -631,7 +631,7 @@
 ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; SI: v_cmp_ne_u64_e32
-define void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i64 %cond, 1234
   br i1 %cmp0, label %endif, label %if
diff --git a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
index ff94298..63ea21b 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -4,7 +4,7 @@
 ; allocate scratch registers correctly. Check that this test compiles without
 ; error.
 ; TONGA-LABEL: test
-define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
 entry:
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
index 686c831..1ca0919 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -13,7 +13,7 @@
 ; CHECK-NEXT: s_or_b64 exec
 ; CHECK: buffer_
 
-define void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp14 = load i32, i32 addrspace(1)* %arg, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 3d88b9a..0e715c4 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -43,7 +43,7 @@
 ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
 
 ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1
-define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
   %cmp0 = icmp eq i32 %cond, 0
@@ -136,7 +136,7 @@
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
 ; GCN-NOT: s_buffer_load_dword m0
-define void @m0_unavailable_spill(i32 %m0.arg) #0 {
+define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 {
 main_body:
   %m0 = call i32 asm sideeffect "; def $0, 1", "={M0}"() #0
   %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg)
@@ -189,7 +189,7 @@
 
 ; TOSMEM: s_dcache_wb
 ; TOSMEM: s_endpgm
-define void @restore_m0_lds(i32 %arg) {
+define amdgpu_kernel void @restore_m0_lds(i32 %arg) {
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
   %sval = load volatile i64, i64 addrspace(2)* undef
   %cmp = icmp eq i32 %arg, 0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 9b3dfab..c05021a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -11,7 +11,7 @@
 
 ; Just test that it compiles successfully.
 ; CHECK-LABEL: test
-define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
 entry:
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
diff --git a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index d4e2dc8..5d7d29d 100644
--- a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -10,7 +10,7 @@
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
 ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
 ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
-define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
   %v.val = load volatile i32, i32 addrspace(1)* %in
   %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
@@ -23,7 +23,7 @@
 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0:
 ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f
 ; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0
-define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
   %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
@@ -35,7 +35,7 @@
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
   %v.val = load volatile i32, i32 addrspace(1)* %in
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
@@ -48,7 +48,7 @@
 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1:
 ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}}
-define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
@@ -61,7 +61,7 @@
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2:
 ; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc
-define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %load = load i32, i32 addrspace(1)* %gep
diff --git a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index da6c720..c242699 100644
--- a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -29,7 +29,7 @@
 ; GCN-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24
 
 ; GCN: s_endpgm
-define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 {
+define amdgpu_kernel void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 {
 entry:
   %tmp = tail call i32 @llvm.r600.read.local.size.y()
   %tmp1 = tail call i32 @llvm.r600.read.local.size.z()
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit.mir b/llvm/test/CodeGen/AMDGPU/splitkit.mir
index 1120ad7..ee58138 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit.mir
@@ -1,7 +1,7 @@
 # RUN: llc -o - %s -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
 --- |
-  define void @func0() #0 { ret void }
-  define void @func1() #0 { ret void }
+  define amdgpu_kernel void @func0() #0 { ret void }
+  define amdgpu_kernel void @func1() #0 { ret void }
 
   attributes #0 = { "amdgpu-num-sgpr"="12" }
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index ad7c86f..25590c9 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -13,7 +13,7 @@
 
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -37,7 +37,7 @@
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -51,7 +51,7 @@
 ; global load we end up with the vector instructions rather than scalar.
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
@@ -67,7 +67,7 @@
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
   %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
@@ -80,7 +80,7 @@
 ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
 
 ; EG: ASHR
-define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %in.ext = sext i32 %in to i64
   %ashr = ashr i64 %in.ext, 8
@@ -105,7 +105,7 @@
 ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
@@ -143,7 +143,7 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -156,7 +156,7 @@
 ; XFUNC-LABEL: {{^}}s_ashr_v2i64:
 ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
 ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
-; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
+; define amdgpu_kernel void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
 ;   %result = ashr <2 x i64> %a, %b
 ;   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
 ;   ret void
@@ -221,7 +221,7 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -235,7 +235,7 @@
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}}
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}}
-define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 32
   %add = add i64 %result, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -247,7 +247,7 @@
 ; VI: flat_load_dword v[[HI:[0-9]+]]
 ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}}
-define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -262,7 +262,7 @@
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
 ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
-define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 63
   %add = add i64 %result, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -275,7 +275,7 @@
 ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
 ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]]
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}}
-define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index c78fd54..c89f798 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s
 
-define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in
   %den = load i32, i32 addrspace(1) * %den_ptr
@@ -11,7 +11,7 @@
   ret void
 }
 
-define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = srem i32 %num, 4
   store i32 %result, i32 addrspace(1)* %out
@@ -24,14 +24,14 @@
 ; SI: v_mul_lo_i32
 ; SI: v_sub_i32
 ; SI: s_endpgm
-define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = srem i32 %num, 7
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
@@ -40,14 +40,14 @@
   ret void
 }
 
-define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %result = srem <2 x i32> %num, <i32 4, i32 4>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
@@ -56,14 +56,14 @@
   ret void
 }
 
-define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %num = load i64, i64 addrspace(1) * %in
   %den = load i64, i64 addrspace(1) * %den_ptr
@@ -72,14 +72,14 @@
   ret void
 }
 
-define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %num = load i64, i64 addrspace(1) * %in
   %result = srem i64 %num, 4
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
   %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr
@@ -88,14 +88,14 @@
   ret void
 }
 
-define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
   %result = srem <2 x i64> %num, <i64 4, i64 4>
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
 }
 
-define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
   %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr
@@ -104,7 +104,7 @@
   ret void
 }
 
-define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
   %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4>
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 6b006fd..1daf4bb 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -8,7 +8,7 @@
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -26,7 +26,7 @@
 
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -50,7 +50,7 @@
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -74,7 +74,7 @@
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}}
 ; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]]
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
-define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
   %b = load i64, i64 addrspace(1)* %b_ptr
@@ -112,7 +112,7 @@
 ; EG-DAG: CNDE_INT {{.*}}, 0.0
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -178,7 +178,7 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -193,7 +193,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = lshr i64 %a, 32
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -203,7 +203,7 @@
 ; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
-define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index 26884a1..1356323 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -6,7 +6,7 @@
 declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
 
 ; FUNC-LABEL: {{^}}ssubo_i64_zext:
-define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
@@ -17,7 +17,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_ssubo_i32:
-define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %ssub, 0
   %carry = extractvalue { i32, i1 } %ssub, 1
@@ -27,7 +27,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ssubo_i32:
-define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -41,7 +41,7 @@
 ; FUNC-LABEL: {{^}}s_ssubo_i64:
 ; SI: s_sub_u32
 ; SI: s_subb_u32
-define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
@@ -53,7 +53,7 @@
 ; FUNC-LABEL: {{^}}v_ssubo_i64:
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
-define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64, i64 addrspace(1)* %aptr, align 4
   %b = load i64, i64 addrspace(1)* %bptr, align 4
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/store-barrier.ll b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
index 57a93cc..afa4e94 100644
--- a/llvm/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
@@ -12,7 +12,7 @@
 ; CHECK: s_barrier
 ; CHECK: s_endpgm
 ; Function Attrs: nounwind
-define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
+define amdgpu_kernel void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
 bb:
   %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
   %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll
index 5d49795..160e921 100644
--- a/llvm/test/CodeGen/AMDGPU/store-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-global.ll
@@ -11,7 +11,7 @@
 ; CM-NOT: MEM_RAT MSKOR
 
 ; GCN: buffer_store_byte
-define void @store_i1(i1 addrspace(1)* %out) {
+define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) {
 entry:
   store i1 true, i1 addrspace(1)* %out
   ret void
@@ -42,7 +42,7 @@
 
 ; GCN: buffer_store_byte
 
-define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) {
 entry:
   store i8 %in, i8 addrspace(1)* %out
   ret void
@@ -75,7 +75,7 @@
 ; EG: MOV * T[[RW_GPR]].Z, 0.0
 
 ; GCN: buffer_store_short
-define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) {
 entry:
   store i16 %in, i16 addrspace(1)* %out
   ret void
@@ -88,7 +88,7 @@
 
 ; EG: MEM_RAT MSKOR
 ; EG: MEM_RAT MSKOR
-define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) {
 entry:
   store i24 %in, i24 addrspace(1)* %out
   ret void
@@ -104,7 +104,7 @@
 
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 ; CM-NOT: MEM_RAT
-define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) {
 entry:
   store i25 %in, i25 addrspace(1)* %out
   ret void
@@ -119,7 +119,7 @@
 ; CM-NOT: MEM_RAT MSKOR
 
 ; GCN: buffer_store_short
-define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(1)* %out
@@ -136,7 +136,7 @@
 ; CM-NOT: MEM_RAT MSKOR
 
 ; SI: buffer_store_byte
-define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
@@ -150,7 +150,7 @@
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dword
-define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(1)* %out
@@ -170,7 +170,7 @@
 
 ; SI: buffer_store_short
 ; SI: buffer_store_short
-define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
@@ -183,7 +183,7 @@
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dword
-define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(1)* %out
@@ -210,7 +210,7 @@
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
@@ -231,7 +231,7 @@
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
@@ -246,7 +246,7 @@
 
 ; GCN: buffer_store_dword
 
-define void @store_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) {
   store float %in, float addrspace(1)* %out
   ret void
 }
@@ -257,7 +257,7 @@
 ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}
 
 ; GCN: buffer_store_dwordx2
-define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
   store <4 x i16> %0, <4 x i16> addrspace(1)* %out
@@ -272,7 +272,7 @@
 
 ; GCN: buffer_store_dwordx2
 
-define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
   %1 = insertelement <2 x float> %0, float %b, i32 1
@@ -286,7 +286,7 @@
 
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}},
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}},
-define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
+define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
   store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
   ret void
 }
@@ -299,7 +299,7 @@
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx4
-define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
@@ -313,7 +313,7 @@
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; SI: buffer_store_dwordx4
-define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -328,7 +328,7 @@
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx4
-define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %1 = load <4 x float>, <4 x float> addrspace(1) * %in
   store <4 x float> %1, <4 x float> addrspace(1)* %out
   ret void
@@ -340,7 +340,7 @@
 ; CM: MEM_RAT MSKOR
 
 ; GCN: buffer_store_byte
-define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
   store i8 %0, i8 addrspace(1)* %out
@@ -350,7 +350,7 @@
 ; FUNC-LABEL: {{^}}store_i64_i16:
 ; EG: MEM_RAT MSKOR
 ; GCN: buffer_store_short
-define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
   store i16 %0, i16 addrspace(1)* %out
@@ -369,7 +369,7 @@
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx2
-define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
   %0 = load i32, i32 addrspace(2)* %mem, align 4
   %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
@@ -388,7 +388,7 @@
 ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X
 
 ; GCN: buffer_store_dwordx4
-define void @i128-const-store(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) {
 entry:
   store i32 1, i32 addrspace(1)* %out, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.ll b/llvm/test/CodeGen/AMDGPU/store-local.ll
index 03fd30c..c144bf2 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.ll
@@ -9,7 +9,7 @@
 ; CM: LDS_BYTE_WRITE
 
 ; GCN: ds_write_b8
-define void @store_local_i1(i1 addrspace(3)* %out) {
+define amdgpu_kernel void @store_local_i1(i1 addrspace(3)* %out) {
 entry:
   store i1 true, i1 addrspace(3)* %out
   ret void
@@ -21,7 +21,7 @@
 ; CM: LDS_BYTE_WRITE
 
 ; GCN: ds_write_b8
-define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
+define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
   ret void
 }
@@ -32,7 +32,7 @@
 ; CM: LDS_SHORT_WRITE
 
 ; GCN: ds_write_b16
-define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
+define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
   ret void
 }
@@ -43,7 +43,7 @@
 ; CM: LDS_WRITE
 
 ; GCN: ds_write_b32
-define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(3)* %out
   ret void
@@ -55,7 +55,7 @@
 ; CM: LDS_WRITE
 
 ; GCN: ds_write_b32
-define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out
   ret void
@@ -78,7 +78,7 @@
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
-define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1
   ret void
@@ -95,7 +95,7 @@
 
 ; GCN: ds_write_b16
 ; GCN: ds_write_b16
-define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2
   ret void
@@ -111,7 +111,7 @@
 ; CM-NOT: LDS_WRITE
 
 ; GCN: ds_write_b64
-define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(3)* %out
   ret void
@@ -129,7 +129,7 @@
 ; CM: LDS_WRITE
 
 ; GCN: ds_write2_b64
-define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out
   ret void
@@ -148,7 +148,7 @@
 
 ; GCN: ds_write2_b32
 ; GCN: ds_write2_b32
-define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
   ret void
@@ -157,7 +157,7 @@
 ; FUNC-LABEL: {{^}}store_local_i64_i8:
 ; EG: LDS_BYTE_WRITE
 ; GCN: ds_write_b8
-define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
   store i8 %0, i8 addrspace(3)* %out
@@ -167,7 +167,7 @@
 ; FUNC-LABEL: {{^}}store_local_i64_i16:
 ; EG: LDS_SHORT_WRITE
 ; GCN: ds_write_b16
-define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
   store i16 %0, i16 addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll
index 33d27f2..ab73ada 100644
--- a/llvm/test/CodeGen/AMDGPU/store-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-private.ll
@@ -15,7 +15,7 @@
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define void @store_i1(i1 addrspace(0)* %out) {
+define amdgpu_kernel void @store_i1(i1 addrspace(0)* %out) {
 entry:
   store i1 true, i1 addrspace(0)* %out
   ret void
@@ -44,7 +44,7 @@
 
 ; SI: buffer_store_byte
 
-define void @store_i8(i8 addrspace(0)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(i8 addrspace(0)* %out, i8 %in) {
 entry:
   store i8 %in, i8 addrspace(0)* %out
   ret void
@@ -72,7 +72,7 @@
 ; EG: MOV * T(0 + AR.x).X+, [[RES]]
 
 ; SI: buffer_store_short
-define void @store_i16(i16 addrspace(0)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(i16 addrspace(0)* %out, i16 %in) {
 entry:
   store i16 %in, i16 addrspace(0)* %out
   ret void
@@ -102,7 +102,7 @@
 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
 ; CM: MOVA_INT
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
-define void @store_i24(i24 addrspace(0)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(i24 addrspace(0)* %out, i24 %in) {
 entry:
   store i24 %in, i24 addrspace(0)* %out
   ret void
@@ -120,7 +120,7 @@
 ; CM: MOVA_INT
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 ; CM-NOT: MOVA_INT
-define void @store_i25(i25 addrspace(0)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(i25 addrspace(0)* %out, i25 %in) {
 entry:
   store i25 %in, i25 addrspace(0)* %out
   ret void
@@ -141,7 +141,7 @@
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_short
-define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(0)* %out
@@ -172,7 +172,7 @@
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1
@@ -191,7 +191,7 @@
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_dword
-define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(0)* %out
@@ -223,7 +223,7 @@
 
 ; SI: buffer_store_short
 ; SI: buffer_store_short
-define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2
@@ -240,7 +240,7 @@
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_dword
-define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(0)* %out
@@ -299,7 +299,7 @@
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1
@@ -410,7 +410,7 @@
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) {
+define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) {
 entry:
   %0 = trunc <8 x i32> %in to <8 x i8>
   store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1
@@ -443,7 +443,7 @@
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2
@@ -460,7 +460,7 @@
 
 ; SI: buffer_store_dword
 
-define void @store_f32(float addrspace(0)* %out, float %in) {
+define amdgpu_kernel void @store_f32(float addrspace(0)* %out, float %in) {
   store float %in, float addrspace(0)* %out
   ret void
 }
@@ -480,7 +480,7 @@
 ; XSI: buffer_store_dwordx2
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
   store <4 x i16> %0, <4 x i16> addrspace(0)* %out
@@ -504,7 +504,7 @@
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 
-define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
   %1 = insertelement <2 x float> %0, float %b, i32 1
@@ -533,7 +533,7 @@
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 
-define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
+define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
   store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16
   ret void
 }
@@ -563,7 +563,7 @@
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(0)* %out
   ret void
@@ -594,7 +594,7 @@
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4
   ret void
@@ -626,7 +626,7 @@
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) {
+define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) {
   %1 = load <4 x float>, <4 x float> addrspace(0) * %in
   store <4 x float> %1, <4 x float> addrspace(0)* %out
   ret void
@@ -644,7 +644,7 @@
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
   store i8 %0, i8 addrspace(0)* %out
@@ -663,7 +663,7 @@
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_short
-define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
   store i16 %0, i16 addrspace(0)* %out
@@ -689,7 +689,7 @@
 ; XSI: buffer_store_dwordx2
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
   %0 = load i32, i32 addrspace(2)* %mem, align 4
   %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
@@ -727,7 +727,7 @@
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @i128-const-store(i32 addrspace(0)* %out) {
+define amdgpu_kernel void @i128-const-store(i32 addrspace(0)* %out) {
 entry:
   store i32 1, i32 addrspace(0)* %out, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
index 78db2d3..7518e88 100644
--- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}global_store_v3i64:
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
   ret void
 }
@@ -40,7 +40,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
   ret void
 }
@@ -48,7 +48,7 @@
 ; GCN-LABEL: {{^}}local_store_v3i64:
 ; GCN: ds_write2_b64
 ; GCN: ds_write_b64
-define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
   ret void
 }
@@ -83,7 +83,7 @@
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
-define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
   ret void
 }
@@ -91,7 +91,7 @@
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
 ; GCN-DAG: buffer_store_dwordx2
 ; GCN-DAG: buffer_store_dword v
-define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i32>
   store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
   ret void
@@ -100,7 +100,7 @@
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16:
 ; GCN-DAG: buffer_store_short
 ; GCN-DAG: buffer_store_dword v
-define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i16>
   store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out
   ret void
@@ -110,7 +110,7 @@
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8:
 ; GCN-DAG: buffer_store_short
 ; GCN-DAG: buffer_store_byte v
-define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i8>
   store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out
   ret void
@@ -120,7 +120,7 @@
 ; GCN-DAG: buffer_store_byte v
 ; GCN-DAG: buffer_store_byte v
 ; GCN-DAG: buffer_store_byte v
-define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i1>
   store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
index d5af3b2..507f07d 100644
--- a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
@@ -5,7 +5,7 @@
 ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
 ; scratch loads and stores.
 ; CHECK-LABEL: {{^}}store_vector_ptrs:
-define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
+define amdgpu_kernel void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
   %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   store <4 x i32*> %p, <4 x i32*>* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/store_typed.ll b/llvm/test/CodeGen/AMDGPU/store_typed.ll
index 515fcf0..eaa2161 100644
--- a/llvm/test/CodeGen/AMDGPU/store_typed.ll
+++ b/llvm/test/CodeGen/AMDGPU/store_typed.ll
@@ -6,7 +6,7 @@
 ; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1
 ; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}
 
-define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
+define amdgpu_kernel void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
   call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0)
   ret void
 }
@@ -16,7 +16,7 @@
 ; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1
 ; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}
 
-define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) {
+define amdgpu_kernel void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) {
   call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/structurize.ll b/llvm/test/CodeGen/AMDGPU/structurize.ll
index 174e64e..3cceb2d 100644
--- a/llvm/test/CodeGen/AMDGPU/structurize.ll
+++ b/llvm/test/CodeGen/AMDGPU/structurize.ll
@@ -45,7 +45,7 @@
 ; CHECK: CF_END
 
 
-define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
 %0 = icmp ne i32 %a, 0
   br i1 %0, label %diamond_head, label %branch_from
diff --git a/llvm/test/CodeGen/AMDGPU/structurize1.ll b/llvm/test/CodeGen/AMDGPU/structurize1.ll
index db0f502..2e7d0e6 100644
--- a/llvm/test/CodeGen/AMDGPU/structurize1.ll
+++ b/llvm/test/CodeGen/AMDGPU/structurize1.ll
@@ -19,7 +19,7 @@
 ; CHECK-LABEL: {{^}}if_inside_loop:
 ; CHECK: LOOP_START_DX10
 ; CHECK: END_LOOP
-define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   br label %for.body
 
diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
index b2797ce..ada7214 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
@@ -7,7 +7,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -24,7 +24,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -39,7 +39,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -54,7 +54,7 @@
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -70,7 +70,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -90,7 +90,7 @@
 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -110,7 +110,7 @@
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -131,7 +131,7 @@
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -149,7 +149,7 @@
 ; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
 ; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
-define void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %size.trunc = trunc i32 %size to i16
   call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 5816345..f366029 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -8,7 +8,7 @@
 ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -25,7 +25,7 @@
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -45,7 +45,7 @@
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -55,7 +55,7 @@
 }
 
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
     %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
     %a = load i16, i16 addrspace(1)* %in 
     %b = load i16, i16 addrspace(1)* %b_ptr
@@ -69,7 +69,7 @@
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
     %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
     %a = load <2 x i16>, <2 x i16> addrspace(1) * %in 
     %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
@@ -85,7 +85,7 @@
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
     %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
     %a = load <4 x i16>, <4 x i16> addrspace(1) * %in 
     %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
@@ -103,7 +103,7 @@
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT {{[* ]*}}
-define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
   %result = sub i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -118,7 +118,7 @@
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT {{[* ]*}}
-define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
+define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@@ -134,7 +134,7 @@
 ; SI: v_subb_u32_e32
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
-define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@@ -154,7 +154,7 @@
 ; SI: v_subb_u32_e32
 ; SI: v_subrev_i32_e32
 ; SI: v_subb_u32_e32
-define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index b282b51..6736f89 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -7,7 +7,7 @@
 
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -27,7 +27,7 @@
 
 ; VI: s_sub_i32
 ; VI: s_sub_i32
-define void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
   %add = sub <2 x i16> %a, %b
@@ -38,7 +38,7 @@
 ; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
 ; GCN: buffer_store_dword [[ZERO]]
-define void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %add = sub <2 x i16> %a, %a
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -51,7 +51,7 @@
 
 ; VI: v_subrev_i32_e32
 ; VI: v_subrev_i32_e32
-define void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
   %add = sub <2 x i16> %a, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
   ret void
@@ -63,7 +63,7 @@
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
-define void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -80,7 +80,7 @@
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
-define void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -99,7 +99,7 @@
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -118,7 +118,7 @@
 ; VI-NOT: v_subrev_i16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -138,7 +138,7 @@
 ; VI-NOT: v_subrev_i16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -170,7 +170,7 @@
 ; VI-NOT: and
 ; VI-NOT: shl
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
-define void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -205,7 +205,7 @@
 ; VI-DAG: v_subrev_u16_e32
 
 ; VI: buffer_store_dwordx4
-define void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -231,7 +231,7 @@
 ; VI: v_subrev_u16_e32
 ; VI: v_subrev_u16_e32
 ; VI: buffer_store_dwordx2
-define void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -259,7 +259,7 @@
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-define void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
index 0ec45c6..c2d04ab 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL:{{^}}row_filter_C1_D0:
-define void @row_filter_C1_D0() #0 {
+define amdgpu_kernel void @row_filter_C1_D0() #0 {
 entry:
   br i1 undef, label %for.inc.1, label %do.body.preheader
 
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 72a1f1e..35615c4 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -20,7 +20,7 @@
 ; CHECK-NEXT: s_mov_b32 s6, -1
 ; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
 ; CHECK-NEXT: s_endpgm
-define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
 entry:
   %v0 = insertelement <4 x float> undef, float %a0, i32 0
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
index 8bd995a..57c267e 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
@@ -5,7 +5,7 @@
 ; Just make sure this test doesn't crash.
 ; CHECK-LABEL: foobar:
 ; CHECK: s_endpgm
-define void @foobar() {
+define amdgpu_kernel void @foobar() {
   %v0 = icmp eq <4 x i32> undef, <i32 0, i32 1, i32 2, i32 3>
   %v3 = sext <4 x i1> %v0 to <4 x i32>
   %v4 = extractelement <4 x i32> %v3, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir b/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir
index c4e0021..c477fe9 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir
@@ -10,8 +10,8 @@
 # CHECK-LABEL: Machine code for function test1:
 
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
 ...
 ---
 name: test0
diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
index cf80ff3..466e89e 100644
--- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll
+++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
@@ -14,7 +14,7 @@
 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_none() #0 {
+define amdgpu_kernel void @target_none() #0 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -30,7 +30,7 @@
 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_tahiti() #1 {
+define amdgpu_kernel void @target_tahiti() #1 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -46,7 +46,7 @@
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 ; CHECK: s_dcache_inv_vol
-define void @target_bonaire() #3 {
+define amdgpu_kernel void @target_bonaire() #3 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -63,7 +63,7 @@
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
 ; CHECK: flat_store_dword
 ; CHECK: s_dcache_wb{{$}}
-define void @target_fiji() #4 {
+define amdgpu_kernel void @target_fiji() #4 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -79,7 +79,7 @@
 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
 ; CHECK: ds_read_b32
 ; CHECK: ; LDSByteSize: 5120
-define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -93,7 +93,7 @@
 ; CHECK: SCRATCH_RSRC_DWORD0
 ; CHECK: SCRATCH_RSRC_DWORD1
 ; CHECK: ScratchSize: 24
-define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 15f055e..77ad895 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -38,7 +38,7 @@
 ; TRAP-BIT: enable_trap_handler = 1
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-MESA-TRAP: s_endpgm
-define void @hsa_trap() {
+define amdgpu_kernel void @hsa_trap() {
   call void @llvm.trap()
   ret void
 }
@@ -64,7 +64,7 @@
 ; TRAP-BIT: enable_trap_handler = 1
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-MESA-TRAP: s_endpgm
-define void @hsa_debugtrap() {
+define amdgpu_kernel void @hsa_debugtrap() {
   call void @llvm.debugtrap()
   ret void
 }
@@ -75,7 +75,7 @@
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-HSA-TRAP: s_endpgm
 ; NO-MESA-TRAP: s_endpgm
-define void @trap() {
+define amdgpu_kernel void @trap() {
   call void @llvm.trap()
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index a331475..f900403 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %bc = bitcast <2 x i32> %ld to i64
   %trunc = trunc i64 %bc to i32
@@ -15,7 +15,7 @@
 ; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
   %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
   %bc = bitcast <3 x i32> %ld to i96
   %trunc = trunc i96 %bc to i32
@@ -26,7 +26,7 @@
 ; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %bc = bitcast <4 x i32> %ld to i128
   %trunc = trunc i128 %bc to i32
@@ -38,7 +38,7 @@
 ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %bc = bitcast <2 x i16> %ld to i32
   %trunc = trunc i32 %bc to i16
@@ -54,7 +54,7 @@
 ; SI: buffer_load_dword v[[VAL:[0-9]+]]
 ; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
 ; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %bc = bitcast <4 x i16> %ld to i64
   %trunc = trunc i64 %bc to i16
@@ -66,7 +66,7 @@
 ; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
 ; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
   %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %bc = bitcast <2 x i8> %ld to i16
   %trunc = trunc i16 %bc to i8
@@ -77,7 +77,7 @@
 ; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
   %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %bc = bitcast <4 x i8> %ld to i32
   %trunc = trunc i32 %bc to i8
@@ -88,7 +88,7 @@
 ; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   %bc = bitcast <3 x i8> %ld to i24
   %trunc = trunc i24 %bc to i8
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index 7a4bced..cb8d365 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -9,7 +9,7 @@
 ; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}}
 ; SI: v_cndmask_b32_e64
 ; SI: buffer_store_byte
-define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
@@ -25,7 +25,7 @@
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
@@ -36,7 +36,7 @@
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
@@ -48,7 +48,7 @@
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
@@ -60,7 +60,7 @@
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
@@ -71,7 +71,7 @@
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
@@ -84,7 +84,7 @@
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
@@ -96,7 +96,7 @@
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
@@ -107,7 +107,7 @@
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
@@ -122,7 +122,7 @@
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
@@ -137,7 +137,7 @@
 ; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}}
 ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
 ; XSI-NEXT: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
@@ -148,7 +148,7 @@
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
@@ -162,7 +162,7 @@
 ; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}}
 ; SI-NEXT: v_cndmask_b32_e64
 ; SI: {{buffer|flat}}_store_byte
-define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x
   %load = load i8, i8 addrspace(1)* %in.ptr
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
index 03b8af0..d67b8f9 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
 ; GCN: s_endpgm
-define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %val = load double, double addrspace(1)* %in
   %cvt = fptrunc double %val to half
   store half %cvt, half addrspace(1)* %out
@@ -11,7 +11,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in
   %cvt = fptrunc <2 x double> %val to <2 x half>
   store <2 x half> %cvt, <2 x half> addrspace(1)* %out
@@ -20,7 +20,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
   %val = load <3 x double>, <3 x double> addrspace(1)* %in
   %cvt = fptrunc <3 x double> %val to <3 x half>
   store <3 x half> %cvt, <3 x half> addrspace(1)* %out
@@ -29,7 +29,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
   %val = load <4 x double>, <4 x double> addrspace(1)* %in
   %cvt = fptrunc <4 x double> %val to <4 x half>
   store <4 x half> %cvt, <4 x half> addrspace(1)* %out
@@ -38,7 +38,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
   %val = load <8 x double>, <8 x double> addrspace(1)* %in
   %cvt = fptrunc <8 x double> %val to <8 x half>
   store <8 x half> %cvt, <8 x half> addrspace(1)* %out
@@ -47,7 +47,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
   %val = load <16 x double>, <16 x double> addrspace(1)* %in
   %cvt = fptrunc <16 x double> %val to <16 x half>
   store <16 x half> %cvt, <16 x half> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
index da2a5b4..4ea2352 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -7,7 +7,7 @@
 ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; SI: buffer_store_byte [[VREG]],
-define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
   %trunc = trunc i32 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
@@ -15,7 +15,7 @@
 
 ; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
 ; SI: buffer_store_byte
-define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
   %trunc = trunc i64 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
@@ -26,13 +26,13 @@
 ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; SI: buffer_store_byte [[VREG]],
-define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
   %trunc = trunc i16 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 ; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
-define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
+define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
   %add = add i16 %val0, %val1
   %trunc = trunc i16 %add to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
index c6727e1..f45de67 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
 ; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
   %trunc = trunc <16 x i32> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
   ret void
@@ -11,7 +11,7 @@
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
 ; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
   %trunc = trunc <16 x i64> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
index 878ea3f..3dbc10d 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
@@ -6,7 +6,7 @@
 
 ; CHECK-LABEL: {{^}}test:
 ; CHECK: MEM_RAT_CACHELESS STORE_RAW
-define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %if, label %done
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index a8f10cc..0c91d52 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
 ; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
 ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1],
 ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
@@ -28,7 +28,7 @@
 ; SI: buffer_store_dword [[VSHL]]
 ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
 
-define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
   %b = shl i64 %a, 2
   %result = trunc i64 %b to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -46,7 +46,7 @@
 ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
 ; GCN: v_mov_b32_e32
 ; GCN: v_mov_b32_e32
-define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
   %result = trunc i64 %b to i32
@@ -57,7 +57,7 @@
 
 ; GCN-LABEL: {{^}}trunc_i32_to_i1:
 ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
-define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
+define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
   %a = load i32, i32 addrspace(1)* %ptr, align 4
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
@@ -67,7 +67,7 @@
 
 ; GCN-LABEL: {{^}}trunc_i8_to_i1:
 ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
-define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
+define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
   %a = load i8, i8 addrspace(1)* %ptr, align 4
   %trunc = trunc i8 %a to i1
   %result = select i1 %trunc, i8 1, i8 0
@@ -77,7 +77,7 @@
 
 ; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
-define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
+define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
   %trunc = trunc i16 %a to i1
   %result = select i1 %trunc, i16 1, i16 0
   store i16 %result, i16 addrspace(1)* %out, align 4
@@ -86,7 +86,7 @@
 
 ; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
-define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -99,7 +99,7 @@
 ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
 ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
-define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
   store i32 %sel, i32 addrspace(1)* %out
@@ -112,7 +112,7 @@
 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
-define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
index 76c32af..7c369a3 100644
--- a/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
+++ b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
@@ -19,7 +19,7 @@
 ; CHECK: store i8 0, i8 addrspace(1)*
 ; CHECK-NOT: store i8 0, i8 addrspace(1)*
 ; CHECK: ret void
-define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
 entry:
   %add = add nsw i32 %b, 4
   %cmp = icmp sgt i32 %add, %a
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index d62e4e0..632ccaa 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -9,7 +9,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADDC_UINT
-define void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -27,7 +27,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -42,7 +42,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -63,7 +63,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -85,7 +85,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -100,7 +100,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
@@ -118,7 +118,7 @@
 ; FUNC-LABEL: {{^}}v_uaddo_i16:
 ; VI: v_add_u16_e32
 ; VI: v_cmp_lt_u16_e32
-define void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index ed791bc..2874a0cd 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -10,7 +10,7 @@
 ; EG: CF_END
 
 ; SI: v_rcp_iflag_f32_e32
-define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -21,7 +21,7 @@
 
 ; FUNC-LABEL: {{^}}s_udiv_i32:
 ; SI: v_rcp_iflag_f32_e32
-define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = udiv i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@
 ; SI: v_rcp_iflag_f32_e32
 ; SI: v_rcp_iflag_f32_e32
 ; SI: s_endpgm
-define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -50,7 +50,7 @@
 ; FUNC-LABEL: {{^}}udiv_v4i32:
 ; EG: CF_END
 ; SI: s_endpgm
-define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -63,7 +63,7 @@
 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 16
@@ -77,7 +77,7 @@
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 34259182
@@ -91,7 +91,7 @@
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 34259183
@@ -103,7 +103,7 @@
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -117,7 +117,7 @@
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in
   %den = load i16, i16 addrspace(1) * %den_ptr
@@ -131,7 +131,7 @@
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
   %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
   %num = load i23, i23 addrspace(1) * %in
   %den = load i23, i23 addrspace(1) * %den_ptr
@@ -143,7 +143,7 @@
 
 ; FUNC-LABEL: {{^}}v_udiv_i24:
 ; SI-NOT: v_rcp_f32
-define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
   %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
   %num = load i24, i24 addrspace(1) * %in
   %den = load i24, i24 addrspace(1) * %den_ptr
@@ -159,7 +159,7 @@
 ; SI: v_mul_hi_u32
 ; SI: v_mul_hi_u32
 
-define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
   %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
@@ -168,7 +168,7 @@
 
 ; FUNC-LABEL: {{^}}test_udiv2:
 ; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-define void @test_udiv2(i32 %p) {
+define amdgpu_kernel void @test_udiv2(i32 %p) {
   %i = udiv i32 %p, 2
   store volatile i32 %i, i32 addrspace(1)* undef
   ret void
@@ -178,7 +178,7 @@
 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
 ; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
 ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-define void @test_udiv_3_mulhu(i32 %p) {
+define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
    %i = udiv i32 %p, 3
    store volatile i32 %i, i32 addrspace(1)* undef
    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 17f4ebf..9507a49 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -51,7 +51,7 @@
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
+define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out0
   %result1 = urem i32 %x, %y
@@ -158,7 +158,7 @@
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
   %result1 = urem <2 x i32> %x, %y
@@ -340,7 +340,7 @@
 ; SI-DAG: v_subrev_i32_e32
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
   %result0 = udiv <4 x i32> %x, %y
   store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
   %result1 = urem <4 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
index 6d145f1..6f144dc 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
@@ -12,7 +12,7 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -31,7 +31,7 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -50,7 +50,7 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -67,7 +67,7 @@
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -84,7 +84,7 @@
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -101,7 +101,7 @@
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -121,7 +121,7 @@
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -141,7 +141,7 @@
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -161,7 +161,7 @@
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -184,7 +184,7 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -203,7 +203,7 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -215,7 +215,7 @@
 ; FUNC-LABEL: {{^}}urem24_i32:
 ; SI-NOT: v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -235,7 +235,7 @@
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -255,7 +255,7 @@
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -275,7 +275,7 @@
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -294,7 +294,7 @@
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
 
 ; EG: RECIP_IEEE
-define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -313,7 +313,7 @@
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
 
 ; EG: RECIP_IEEE
-define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem64.ll b/llvm/test/CodeGen/AMDGPU/udivrem64.ll
index da61a84..bd29792 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem64.ll
@@ -70,7 +70,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -144,7 +144,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -159,7 +159,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = udiv i64 %1, %2
@@ -176,7 +176,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = urem i64 %1, %2
@@ -195,7 +195,7 @@
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = udiv i64 %1, %2
@@ -214,7 +214,7 @@
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = urem i64 %1, %2
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index a4e18eb..62943ae 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,7 +9,7 @@
 ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %val = load i64, i64 addrspace(1)* %gep, align 8
@@ -19,21 +19,21 @@
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64
-define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
   %cast = uitofp i64 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64
-define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
   %cast = uitofp <2 x i64> %in to <2 x double>
   store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64
-define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
   %cast = uitofp <4 x i64> %in to <4 x double>
   store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -42,7 +42,7 @@
 ; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
   %cast = uitofp i32 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
@@ -52,7 +52,7 @@
 ; SI: v_cvt_f64_u32_e32
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
   %cast = uitofp <2 x i32> %in to <2 x double>
   store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -64,7 +64,7 @@
 ; SI: v_cvt_f64_u32_e32
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
   %cast = uitofp <4 x i32> %in to <4 x double>
   store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -79,7 +79,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to double
   store double %fp, double addrspace(1)* %out, align 4
@@ -91,7 +91,7 @@
 ; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to double
   store double %fp, double addrspace(1)* %out, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index cd816b2..4168326 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -4,7 +4,7 @@
 ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f16:
-define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
   %result = uitofp i64 %in to half
   store half %result, half addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]]
 ; GCN: {{buffer|flat}}_store_short {{.*}}[[VR_F16]]
-define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -35,7 +35,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
   %result = uitofp i64 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -54,7 +54,7 @@
 
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
-define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -65,14 +65,14 @@
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f32:
-define void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0 {
   %result = uitofp <2 x i64> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f32:
-define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -83,14 +83,14 @@
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f16:
-define void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0 {
   %result = uitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f16:
-define void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
index 3003226..2e99187 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -6,7 +6,7 @@
 ; SI: v_cvt_f32_u32_e32
 
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %result = uitofp i32 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@
 ; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
 
 ; R600: INT_TO_FLT
-define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -32,7 +32,7 @@
 
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
   %result = uitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -81,7 +81,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
   store float %fp, float addrspace(1)* %out
@@ -92,7 +92,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
   %fp = uitofp i1 %in to float
   store float %fp, float addrspace(1)* %out
   ret void
@@ -105,7 +105,7 @@
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
 ; SI: s_endpgm
-define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -126,7 +126,7 @@
 ; R600-DAG: SETGT_UINT
 ; R600-DAG: SETE_INT
 
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
 entry:
   %cvt = uitofp i64 %in to float
   store float %cvt, float addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 18bd3488..4df2681 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -8,7 +8,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @uitofp_i16_to_f16(
+define amdgpu_kernel void @uitofp_i16_to_f16(
     half addrspace(1)* %r,
     i16 addrspace(1)* %a) {
 entry:
@@ -24,7 +24,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @uitofp_i32_to_f16(
+define amdgpu_kernel void @uitofp_i32_to_f16(
     half addrspace(1)* %r,
     i32 addrspace(1)* %a) {
 entry:
@@ -48,7 +48,7 @@
 ; GCN-DAG: v_or_b32_e32
 ; GCN:     buffer_store_dword
 ; GCN:     s_endpgm
-define void @uitofp_v2i16_to_v2f16(
+define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a) {
 entry:
@@ -68,7 +68,7 @@
 ; GCN-DAG: v_or_b32_e32
 ; GCN:     buffer_store_dword
 ; GCN:     s_endpgm
-define void @uitofp_v2i32_to_v2f16(
+define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i32> addrspace(1)* %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 326a446..5a579f3 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
 ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -25,7 +25,7 @@
 ; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
 ; GCN: v_max_u32
 ; GCN: v_min_u32
-define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -45,7 +45,7 @@
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
 ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -64,7 +64,7 @@
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
 ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -83,7 +83,7 @@
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
 ; GCN: v_cmp_lt_u64
 ; GCN: v_cmp_gt_u64
-define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -102,7 +102,7 @@
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
 ; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
@@ -173,7 +173,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -185,7 +185,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -197,7 +197,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -209,7 +209,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -221,7 +221,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -233,7 +233,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -245,7 +245,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -257,7 +257,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -269,7 +269,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -281,7 +281,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -293,7 +293,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -305,7 +305,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -317,7 +317,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -329,7 +329,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -341,7 +341,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -353,7 +353,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -368,7 +368,7 @@
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
 bb:
   %tmp0 = call i16 @umin16(i16 %x, i16 %y)
   %tmp1 = call i16 @umax16(i16 %x, i16 %y)
@@ -383,7 +383,7 @@
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
 bb:
   %tmp0 = call i8 @umin8(i8 %x, i8 %y)
   %tmp1 = call i8 @umax8(i8 %x, i8 %y)
@@ -395,7 +395,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -408,7 +408,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -421,7 +421,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -434,7 +434,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -447,7 +447,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 1, i32 %y)
   %tmp1 = call i32 @umax(i32 1, i32 %y)
@@ -459,7 +459,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 2)
   %tmp1 = call i32 @umax(i32 %x, i32 2)
@@ -471,7 +471,7 @@
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9
-define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -491,7 +491,7 @@
 ; VI: v_max_u16
 
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 0f76a54..68aacd0 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -8,7 +8,7 @@
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
   %v = load i16, i16 addrspace(3)* %p, align 1
   store i16 %v, i16 addrspace(3)* %r, align 1
   ret void
@@ -23,7 +23,7 @@
 ; UNALIGNED: buffer_load_ushort
 ; UNALIGNED: buffer_store_short
 ; SI: s_endpgm
-define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
   %v = load i16, i16 addrspace(1)* %p, align 1
   store i16 %v, i16 addrspace(1)* %r, align 1
   ret void
@@ -42,7 +42,7 @@
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
   %v = load i32, i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
   ret void
@@ -60,7 +60,7 @@
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(1)* %p, align 1
   store i32 %v, i32 addrspace(1)* %r, align 1
   ret void
@@ -74,7 +74,7 @@
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(1)* %p, align 2
   store i32 %v, i32 addrspace(1)* %r, align 2
   ret void
@@ -85,7 +85,7 @@
 ; GCN: ds_read_u16
 ; GCN: ds_write_b16
 ; GCN: ds_write_b16
-define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
   %v = load i32, i32 addrspace(3)* %p, align 2
   store i32 %v, i32 addrspace(3)* %r, align 2
   ret void
@@ -132,7 +132,7 @@
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
   %v = load i64, i64 addrspace(3)* %p, align 1
   store i64 %v, i64 addrspace(3)* %r, align 1
   ret void
@@ -179,7 +179,7 @@
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
   %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
   store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
   ret void
@@ -209,7 +209,7 @@
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(1)* %p, align 2
   store i64 %v, i64 addrspace(1)* %r, align 2
   ret void
@@ -239,7 +239,7 @@
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(1)* %p, align 1
   store i64 %v, i64 addrspace(1)* %r, align 1
   ret void
@@ -286,7 +286,7 @@
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: s_endpgm
-define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
@@ -329,7 +329,7 @@
 
 ; UNALIGNED: buffer_load_dwordx4
 ; UNALIGNED: buffer_store_dwordx4
-define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
   ret void
@@ -337,7 +337,7 @@
 
 ; FUNC-LABEL: {{^}}local_load_i64_align_4:
 ; GCN: ds_read2_b32
-define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %val = load i64, i64 addrspace(3)* %in, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -345,7 +345,7 @@
 
 ; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
-define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
   %val = load i64, i64 addrspace(3)* %ptr, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -356,7 +356,7 @@
 ; This tests the case where the lo offset is 8 bits, but the hi offset is 9 bits
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
 ; GCN: s_endpgm
-define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
   %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -375,7 +375,7 @@
 ; GCN: ds_read_u8
 ; GCN: ds_read_u8
 ; GCN: store_dwordx2
-define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %val = load i64, i64 addrspace(3)* %in, align 1
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -383,7 +383,7 @@
 
 ; FUNC-LABEL: {{^}}local_store_i64_align_4:
 ; GCN: ds_write2_b32
-define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
   store i64 %val, i64 addrspace(3)* %out, align 4
   ret void
 }
@@ -391,7 +391,7 @@
 ; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
 ; GCN: s_endpgm
-define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
   %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
   store i64 0, i64 addrspace(3)* %ptr, align 4
   ret void
@@ -401,7 +401,7 @@
 ; This tests the case where the lo offset is 8 bits, but the hi offset is 9 bits
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; GCN: s_endpgm
-define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
   %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
   %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -418,7 +418,7 @@
 ; UNALIGNED: s_load_dword
 
 ; SI: buffer_store_dword
-define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(2)* %p, align 1
   store i32 %v, i32 addrspace(1)* %r, align 4
   ret void
@@ -430,7 +430,7 @@
 
 ; UNALIGNED: s_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(2)* %p, align 2
   store i32 %v, i32 addrspace(1)* %r, align 4
   ret void
@@ -444,7 +444,7 @@
 
 ; UNALIGNED: s_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(2)* %p, align 2
   store i64 %v, i64 addrspace(1)* %r, align 4
   ret void
@@ -453,7 +453,7 @@
 ; SI-LABEL: {{^}}constant_align4_load_i64:
 ; SI: s_load_dwordx2
 ; SI: buffer_store_dwordx2
-define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(2)* %p, align 4
   store i64 %v, i64 addrspace(1)* %r, align 4
   ret void
@@ -462,7 +462,7 @@
 ; SI-LABEL: {{^}}constant_align4_load_v4i32:
 ; SI: s_load_dwordx4
 ; SI: buffer_store_dwordx4
-define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
   ret void
@@ -482,7 +482,7 @@
 ; UNALIGNED: buffer_load_dwordx2
 
 ; SI: buffer_store_dwordx2
-define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
   %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
   store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
   ret void
@@ -512,7 +512,7 @@
 ; UNALIGNED: buffer_load_dwordx4
 
 ; SI: buffer_store_dwordx4
-define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
   ret void
@@ -521,7 +521,7 @@
 ; SI-LABEL: {{^}}constant_align4_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 4
   store i8 %v, i8 addrspace(1)* %r, align 4
   ret void
@@ -530,7 +530,7 @@
 ; SI-LABEL: {{^}}constant_align2_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 2
   store i8 %v, i8 addrspace(1)* %r, align 2
   ret void
@@ -541,7 +541,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
   %v0 = load i32, i32 addrspace(2)* %p, align 4
   %v1 = load i32, i32 addrspace(2)* %gep0, align 4
@@ -571,7 +571,7 @@
 ; SI: ds_read_u8
 
 ; SI: ScratchSize: 0{{$}}
-define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
   store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
   ret void
@@ -596,7 +596,7 @@
 ; SI: ds_write_b8
 
 ; SI: ScratchSize: 0{{$}}
-define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
   store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index f92a847..3e80fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -4,7 +4,7 @@
 
 
 ; CHECK-LABEL: {{^}}func:
-define void @func() #0 {
+define amdgpu_kernel void @func() #0 {
 B0:
   br i1 undef, label %B1, label %B2
 
@@ -72,7 +72,7 @@
 ; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
 
 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
-define void @partially_undef_copy() #0 {
+define amdgpu_kernel void @partially_undef_copy() #0 {
   %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"()
   %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"()
 
diff --git a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
index d96ee6d..60ab763 100644
--- a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -5,7 +5,7 @@
 ; SI hits an assertion at -O0; evergreen hits a "not implemented" unreachable.
 
 ; COMMON-LABEL: {{^}}branch_true:
-define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph
 
@@ -42,7 +42,7 @@
 ; SI: s_cbranch_vccnz
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
 
@@ -79,7 +79,7 @@
 ; SI: s_cbranch_scc1
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph
 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 44330e6..a9d45d7 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -12,7 +12,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -40,7 +40,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %if, label %else
@@ -68,7 +68,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -96,7 +96,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %else, label %if
@@ -123,7 +123,7 @@
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -148,7 +148,7 @@
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -176,7 +176,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_endpgm
-define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+define amdgpu_kernel void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
@@ -209,7 +209,7 @@
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: buffer_store_dword [[THREE]]
 ; GCN: s_endpgm
-define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
+define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
@@ -233,7 +233,7 @@
 ; GCN: buffer_store_dword
 ; GCN: [[LABEL]]:
 ; GCN: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
@@ -258,7 +258,7 @@
 ; GCN: {{^}}[[BODY]]:
 ; GCN: buffer_store
 ; GCN: s_endpgm
-define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp sgt i32 %cond0, 0
@@ -284,7 +284,7 @@
 ; SI: s_cmp_lg_u32 [[I]], 0
 ; SI: s_cbranch_scc1 [[LOOP_LABEL]]
 ; SI: s_endpgm
-define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
 entry:
   br label %loop
 
@@ -310,7 +310,7 @@
 ; GCN: {{^}}[[IF_UNIFORM_LABEL]]:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp ult i32 %tid, 16
@@ -338,7 +338,7 @@
 ; GCN: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %u_cmp = icmp eq i32 %cond, 0
   br i1 %u_cmp, label %if, label %endif
@@ -370,7 +370,7 @@
 ; GCN: [[IF_UNIFORM]]:
 ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; GCN: buffer_store_dword [[TWO]]
-define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp eq i32 %tid, 0
@@ -410,7 +410,7 @@
 
 ; GCN: BB[[FNNUM]]_3:
 ; GCN: s_endpgm
-define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp sgt i32 %cond, 0
@@ -445,7 +445,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -477,7 +477,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -505,7 +505,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp sgt i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -524,7 +524,7 @@
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_eq:
 ; GCN: v_cmp_eq_u64_e32
-define void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
   %cond = load volatile i64, i64 addrspace(3)* undef
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -543,7 +543,7 @@
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_ne:
 ; GCN: v_cmp_ne_u64_e32
-define void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
   %cond = load volatile i64, i64 addrspace(3)* undef
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
index cfbb2af..028199e 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
@@ -6,7 +6,7 @@
 ; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
 ; GCN: [[LABEL]]:
 ; GCN-NEXT: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
@@ -25,7 +25,7 @@
 ; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]:
 ; GCN: s_cbranch_scc1 [[LOOP]]
 ; GCN: {{^}}[[BB0]]:
-define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1)  {
+define amdgpu_kernel void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) {
 bb:
   %cnd = trunc i32 %arg to i1
   br i1 %cnd, label %bb2, label %bb5
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 2c3a098..e0067f9 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -38,7 +38,7 @@
 ; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz
-define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 main_body:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %cc = icmp eq i32 %tid, 0
diff --git a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
index 941f4c6..25a700a 100644
--- a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -13,7 +13,7 @@
 ; GCN: ScratchSize: 8{{$}}
 
 ; R600: MOV
-define void @foo() {
+define amdgpu_kernel void @foo() {
   %alloca = alloca i32, align 4
   store volatile i32 0, i32* %alloca
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/unroll.ll b/llvm/test/CodeGen/AMDGPU/unroll.ll
index 411a15a..e171944 100644
--- a/llvm/test/CodeGen/AMDGPU/unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/unroll.ll
@@ -9,7 +9,7 @@
 ; CHECK-LABEL: @test
 ; CHECK-NOT: alloca
 ; CHECK: store i32 5, i32 addrspace(1)* %out
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   %0 = alloca [32 x i32]
   br label %loop.header
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll
index d120111..68e91e8 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll
@@ -6,7 +6,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define void @slt(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp slt i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -18,7 +18,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp ult i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -31,7 +31,7 @@
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define void @ult_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -43,7 +43,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ult_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
@@ -55,7 +55,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @olt(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -67,7 +67,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define void @sle(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sle i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -79,7 +79,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp ule i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -92,7 +92,7 @@
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define void @ule_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -104,7 +104,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ule_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
@@ -116,7 +116,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT:1084227584(5.000000e+00)
-define void @ole(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll
index 9e2cfa3..fd7f8fa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem.ll
@@ -9,7 +9,7 @@
 ; FUNC-LABEL: {{^}}test_urem_i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -26,7 +26,7 @@
 ; SI: v_sub_i32
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = urem i32 %num, 7
   store i32 %result, i32 addrspace(1)* %out
@@ -36,7 +36,7 @@
 ; FUNC-LABEL: {{^}}test_urem_v2i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -48,7 +48,7 @@
 ; FUNC-LABEL: {{^}}test_urem_v4i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -60,7 +60,7 @@
 ; FUNC-LABEL: {{^}}test_urem_i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
   %b = load i64, i64 addrspace(1)* %b_ptr
@@ -72,7 +72,7 @@
 ; FUNC-LABEL: {{^}}test_urem_v2i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -84,7 +84,7 @@
 ; FUNC-LABEL: {{^}}test_urem_v4i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 82bdc26..f8e6b7e 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -11,7 +11,7 @@
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
   %dbl = fadd float %a, %a
   store float %dbl, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -58,7 +58,7 @@
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
   %va0 = load volatile float, float addrspace(1)* %in
   %va1 = load volatile float, float addrspace(1)* %in
   %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1
@@ -76,7 +76,7 @@
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -90,7 +90,7 @@
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -100,7 +100,7 @@
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -110,7 +110,7 @@
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -121,7 +121,7 @@
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
   %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -132,7 +132,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -143,7 +143,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT0]]
-define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -158,7 +158,7 @@
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -171,7 +171,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -186,7 +186,7 @@
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -199,7 +199,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -214,7 +214,7 @@
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -234,7 +234,7 @@
 
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -259,7 +259,7 @@
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
-define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
   %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
   %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
   store volatile double %fma0, double addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 9391eda..d1f454f0 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -9,7 +9,7 @@
 
 ; EG: SUBB_UINT
 ; EG: ADDC_UINT
-define void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -27,7 +27,7 @@
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -42,7 +42,7 @@
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -63,7 +63,7 @@
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -87,7 +87,7 @@
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -104,7 +104,7 @@
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
@@ -122,7 +122,7 @@
 ; FUNC-LABEL: {{^}}v_usubo_i16:
 ; VI: v_subrev_u16_e32
 ; VI: v_cmp_gt_u16_e32
-define void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr
diff --git a/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
index a48e7ac..b7d766a 100644
--- a/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}kernel_arg_i64:
-define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; i64 arg works, v1i64 arg does not.
 ; CHECK-LABEL: {{^}}kernel_arg_v1i64:
-define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index 78d534b..d4a68a4 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -9,7 +9,7 @@
 ; GCN-DAG: v{{[0-9]}}
 ; All NaN values are converted to 0xffffffff
 ; GCN: s_endpgm
-define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
   %f = load float, float addrspace(1)* %f.gep
@@ -30,7 +30,7 @@
 ; GCN-DAG: v{{[0-9]}}
 ; All NaN values are converted to 0xffffffff
 ; GCN: s_endpgm
-define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
   store float %select, float addrspace(1)* %out
@@ -47,7 +47,7 @@
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -62,7 +62,7 @@
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -77,7 +77,7 @@
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -92,7 +92,7 @@
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -107,7 +107,7 @@
 ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
-define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -124,7 +124,7 @@
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
 ; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
-define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -142,7 +142,7 @@
 ; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -159,7 +159,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -178,7 +178,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
-define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -203,7 +203,7 @@
 ; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s
 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s
-define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -226,7 +226,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -249,7 +249,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -275,7 +275,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -298,7 +298,7 @@
 ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
 ; GCN: store_byte
-define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -321,7 +321,7 @@
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -343,7 +343,7 @@
 ; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -364,7 +364,7 @@
 
 ; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
-define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -386,7 +386,7 @@
 ; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
index 9246ce3..2cda52a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -40,7 +40,7 @@
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
   %result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0
   %result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0
@@ -51,7 +51,7 @@
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll
index 290753c..2b96f7d 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll
@@ -8,7 +8,7 @@
 ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
 ; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
 ; GCN: buffer_store_dword [[C]]
-define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -26,7 +26,7 @@
 ; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
-define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
 entry:
   %tmp0 = fmul float 0.5, %in
   %tmp1 = fadd float %tmp0, 0.5
@@ -37,7 +37,7 @@
 ; GCN-LABEL: {{^}}mad_vvs:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
+define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
 
@@ -52,7 +52,7 @@
 
 ; GCN-LABEL: {{^}}mac_ssv:
 ; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
+define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
 entry:
   %c = load float, float addrspace(1)* %in
 
@@ -65,7 +65,7 @@
 ; GCN-LABEL: {{^}}mac_mad_same_add:
 ; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
-define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -96,7 +96,7 @@
 ; GCN-LABEL: {{^}}mad_neg_src0:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -116,7 +116,7 @@
 ; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -136,7 +136,7 @@
 ; GCN-LABEL: {{^}}safe_mad_sub0_src0:
 ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
 ; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
-define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -156,7 +156,7 @@
 ; GCN-LABEL: {{^}}mad_neg_src1:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -176,7 +176,7 @@
 ; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -196,7 +196,7 @@
 ; GCN-LABEL: {{^}}mad_neg_src2:
 ; GCN-NOT: v_mac
 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -222,7 +222,7 @@
 
 ; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
 ; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -257,7 +257,7 @@
 
 ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index f5304bd..6a11284 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -14,7 +14,7 @@
 ; VI:  v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
 ; VI:  buffer_store_short v[[C_F16]]
 ; GCN: s_endpgm
-define void @mac_f16(
+define amdgpu_kernel void @mac_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -38,7 +38,7 @@
 ; VI:  v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; VI:  v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_same_add(
+define amdgpu_kernel void @mac_f16_same_add(
     half addrspace(1)* %r0,
     half addrspace(1)* %r1,
     half addrspace(1)* %a,
@@ -73,7 +73,7 @@
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_a(
+define amdgpu_kernel void @mac_f16_neg_a(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -100,7 +100,7 @@
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_b(
+define amdgpu_kernel void @mac_f16_neg_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -127,7 +127,7 @@
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_c(
+define amdgpu_kernel void @mac_f16_neg_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -151,7 +151,7 @@
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
 ; GCN: s_endpgm
-define void @mac_f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -175,7 +175,7 @@
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -199,7 +199,7 @@
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -226,7 +226,7 @@
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_a_nsz_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -253,7 +253,7 @@
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_b_nsz_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -280,7 +280,7 @@
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_c_nsz_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -328,7 +328,7 @@
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
 ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @mac_v2f16(
+define amdgpu_kernel void @mac_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -355,7 +355,7 @@
 ; VI:  v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_v2f16_same_add(
+define amdgpu_kernel void @mac_v2f16_same_add(
     <2 x half> addrspace(1)* %r0,
     <2 x half> addrspace(1)* %r1,
     <2 x half> addrspace(1)* %a,
@@ -392,7 +392,7 @@
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_a(
+define amdgpu_kernel void @mac_v2f16_neg_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -421,7 +421,7 @@
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_b(
+define amdgpu_kernel void @mac_v2f16_neg_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -454,7 +454,7 @@
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_c(
+define amdgpu_kernel void @mac_v2f16_neg_c(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -482,7 +482,7 @@
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -510,7 +510,7 @@
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -538,7 +538,7 @@
 ; VI:  v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -571,7 +571,7 @@
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_a_nsz_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -604,7 +604,7 @@
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_b_nsz_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -637,7 +637,7 @@
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_c_nsz_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index df87a94c..bfb1050 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}}
 ; VI:  buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @madak_f16(
+define amdgpu_kernel void @madak_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -28,7 +28,7 @@
 ; VI:  v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @madak_f16_use_2(
+define amdgpu_kernel void @madak_f16_use_2(
     half addrspace(1)* %r0,
     half addrspace(1)* %r1,
     half addrspace(1)* %a,
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index e25589c..41220ff 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -29,7 +29,7 @@
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
 ; SI-NEXT: ; mask branch
 ;
-define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   switch i32 %tid, label %default [
@@ -76,7 +76,7 @@
 ; SI: BB1_2:
 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
 ; SI: s_endpgm
-define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
   br i1 %is.0, label %store, label %exit
@@ -106,7 +106,7 @@
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
 
-define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
@@ -173,7 +173,7 @@
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
 
-define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp4 = sext i32 %tmp to i64
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 03e473e..5e54658 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,7 +1,7 @@
 # RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
 --- |
 
-  define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     %cmp0 = fcmp oeq float %cond, 0.000000e+00
     br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
@@ -20,7 +20,7 @@
     ret void
   }
 
-  define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
 
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
index 7dcf36f..03cf725 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -15,7 +15,7 @@
 ; EG: MOV
 ; EG: MOV
 ; EG: MOVA_INT
-define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -44,7 +44,7 @@
 ; EG: MOV
 ; EG: MOVA_INT
 ; EG: MOVA_INT
-define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -71,7 +71,7 @@
 
 ; FUNC-LABEL: {{^}}bitcast_gep:
 ; EG: STORE_RAW
-define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -93,7 +93,7 @@
 ; OPT-LABEL: @vector_read_bitcast_gep(
 ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
 ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -121,7 +121,7 @@
 ; OPT: store float
 ; OPT: store float
 ; OPT: load float
-define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]*
diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
index 2d39f82..ab2bfcf 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -13,7 +13,7 @@
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -30,7 +30,7 @@
 ; GCN: v_movreld_b32
 ; GCN: v_movrels_b32
 ; GCN: buffer_store_dword v
-define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
+define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -49,7 +49,7 @@
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -68,7 +68,7 @@
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
index 7c254ab..90cf34e 100644
--- a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -12,7 +12,7 @@
 ; CHECK-DAG: ds_write2_b32
 ; CHECK-DAG: ds_write2_b32
 
-define void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) {
+define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) {
 bb:
   %tmp = load i32, i32 addrspace(1)* %arg, align 4
   store i32 %tmp, i32 addrspace(3)* %arg1, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
index 3d71062..46a1c87 100644
--- a/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
+++ b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@@ -6,7 +6,7 @@
 ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %v = load i32, i32 addrspace(1)* %in
   store i32 %v, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@
 ; EG: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x00,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
   store <4 x i32> %v, <4 x i32> addrspace(1)* %out
   ret void
@@ -26,7 +26,7 @@
 ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
+define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
   %v = load i32, i32 addrspace(7)* %in
   store i32 %v, i32 addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@
 
 @t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3]
 
-define void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
   %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in
   %v = load i32, i32 addrspace(2)* %a
   store i32 %v, i32 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index f04cbcb..e82e548 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -45,7 +45,7 @@
 ; GCN: ScratchSize: 1536
 
 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
-define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
+define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
 bb:
   %tmp = add i32 %arg1, %arg2
   %tmp7 = extractelement <4 x float> %arg6, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
index 7825e36..8d66c34 100644
--- a/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
@@ -4,7 +4,7 @@
 
 declare float @llvm.amdgcn.rsq.legacy(float) #0
 
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
index ae8ec58..d2708b06 100644
--- a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -8,7 +8,7 @@
 
 ; ModuleID = 'vop-shrink.ll'
 
-define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
 entry:
   %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp = icmp eq i32 %cond, 0
@@ -35,7 +35,7 @@
 
 ; FUNC-LABEL: {{^}}add_fold:
 ; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
-define void @add_fold(float addrspace(1)* %out) {
+define amdgpu_kernel void @add_fold(float addrspace(1)* %out) {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = uitofp i32 %tmp to float
diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index fe5be75..bb62347 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -10,7 +10,7 @@
 ; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e32
 
-define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
+define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
 entry:
   %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
   %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
@@ -28,7 +28,7 @@
 ;SI: v_cndmask_b32_e64
 ;SI: v_cndmask_b32_e32
 
-define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
 entry:
   %0 = load <2 x float>, <2 x float> addrspace(1)* %in0
   %1 = load <2 x float>, <2 x float> addrspace(1)* %in1
@@ -52,7 +52,7 @@
 ; SI: v_cndmask_b32
 ; SI: v_cndmask_b32
 
-define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
+define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
 entry:
   %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
   %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
@@ -68,7 +68,7 @@
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
 entry:
   %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
   %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
diff --git a/llvm/test/CodeGen/AMDGPU/vselect64.ll b/llvm/test/CodeGen/AMDGPU/vselect64.ll
index ef85ebe..4a04355 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect64.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect64.ll
@@ -5,7 +5,7 @@
 ; Make sure the vectors aren't being stored on the stack.  We know they are
 ; being stored on the stack if the shader uses at least 10 registers.
 ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
-define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
 entry:
        %cmp = icmp ne  <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
        %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
diff --git a/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
index 4584d6e..4c5eb3d 100644
--- a/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -10,7 +10,7 @@
 ; CHECK-NOT: ALU_POP_AFTER
 ; CHECK: TEX
 ; CHECK-NEXT: POP
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %endif, label %if
diff --git a/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
index 912e258..c4b619b 100644
--- a/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -9,7 +9,7 @@
 ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
 ; CHECK: Fetch clause
 ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
-define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
+define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
 entry:
   %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0
   %1 = load i32, i32 addrspace(1)* %0
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll
index d29bae4..5d86b12 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -9,7 +9,7 @@
 ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
 ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
   store volatile i32 0, i32 addrspace(1)* %out
   %val = load volatile i32, i32 addrspace(1)* %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt.mir
index afe651a..38662e8 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt.mir
@@ -1,18 +1,18 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waits  %s -o - | FileCheck %s
 
 --- |
-  define void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
+  define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
                                  <4 x i32> addrspace(1)* %global16,
                                  i32 addrspace(4)* %flat4,
                                  <4 x i32> addrspace(4)* %flat16) {
     ret void
   }
 
-  define void @single_fallthrough_successor_no_end_block_wait() {
+  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
     ret void
   }
 
-  define void @single_branch_successor_not_next_block() {
+  define amdgpu_kernel void @single_branch_successor_not_next_block() {
     ret void
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
index deac809..b1ee016 100644
--- a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
@@ -10,7 +10,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 
-define void @write_vgpr_into_sgpr() {
+define amdgpu_kernel void @write_vgpr_into_sgpr() {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.write_register.i32(metadata !0, i32 %tid)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/write_register.ll b/llvm/test/CodeGen/AMDGPU/write_register.ll
index 88660ba..9c62e00 100644
--- a/llvm/test/CodeGen/AMDGPU/write_register.ll
+++ b/llvm/test/CodeGen/AMDGPU/write_register.ll
@@ -4,7 +4,7 @@
 declare void @llvm.write_register.i64(metadata, i64) #0
 
 ; CHECK-LABEL: {{^}}test_write_m0:
-define void @test_write_m0(i32 %val) #0 {
+define amdgpu_kernel void @test_write_m0(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !0, i32 0)
   call void @llvm.write_register.i32(metadata !0, i32 -1)
   call void @llvm.write_register.i32(metadata !0, i32 %val)
@@ -15,7 +15,7 @@
 ; CHECK: s_mov_b64 exec, 0
 ; CHECK: s_mov_b64 exec, -1
 ; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_exec(i64 %val) #0 {
+define amdgpu_kernel void @test_write_exec(i64 %val) #0 {
   call void @llvm.write_register.i64(metadata !1, i64 0)
   call void @llvm.write_register.i64(metadata !1, i64 -1)
   call void @llvm.write_register.i64(metadata !1, i64 %val)
@@ -26,7 +26,7 @@
 ; CHECK: s_mov_b64 flat_scratch, 0
 ; CHECK: s_mov_b64 flat_scratch, -1
 ; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_flat_scratch(i64 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch(i64 %val) #0 {
   call void @llvm.write_register.i64(metadata !2, i64 0)
   call void @llvm.write_register.i64(metadata !2, i64 -1)
   call void @llvm.write_register.i64(metadata !2, i64 %val)
@@ -36,7 +36,7 @@
 ; CHECK-LABEL: {{^}}test_write_flat_scratch_lo:
 ; CHECK: s_mov_b32 flat_scratch_lo, 0
 ; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}}
-define void @test_write_flat_scratch_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !3, i32 0)
   call void @llvm.write_register.i32(metadata !3, i32 %val)
   ret void
@@ -45,7 +45,7 @@
 ; CHECK-LABEL: {{^}}test_write_flat_scratch_hi:
 ; CHECK: s_mov_b32 flat_scratch_hi, 0
 ; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}}
-define void @test_write_flat_scratch_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !4, i32 0)
   call void @llvm.write_register.i32(metadata !4, i32 %val)
   ret void
@@ -54,7 +54,7 @@
 ; CHECK-LABEL: {{^}}test_write_exec_lo:
 ; CHECK: s_mov_b32 exec_lo, 0
 ; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}}
-define void @test_write_exec_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !5, i32 0)
   call void @llvm.write_register.i32(metadata !5, i32 %val)
   ret void
@@ -63,7 +63,7 @@
 ; CHECK-LABEL: {{^}}test_write_exec_hi:
 ; CHECK: s_mov_b32 exec_hi, 0
 ; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}}
-define void @test_write_exec_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !6, i32 0)
   call void @llvm.write_register.i32(metadata !6, i32 %val)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
index 7f6b8045..3653236 100644
--- a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
+++ b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -4,7 +4,7 @@
 ;CHECK: {{^}}fill3d:
 ;CHECK-NOT: MULLO_INT T[0-9]+
 
-define void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %x.i = tail call i32 @llvm.r600.read.global.size.x() #1
   %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1
diff --git a/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll b/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
index babae9e..88ef9fd 100644
--- a/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
@@ -5,7 +5,7 @@
 
 ; TODO: enable doubles
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double, double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
@@ -14,7 +14,7 @@
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -30,7 +30,7 @@
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index bf02d4c..57a082a 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -10,7 +10,7 @@
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1
   %result = xor <2 x i32> %a, %b
@@ -29,7 +29,7 @@
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1
   %result = xor <4 x i32> %a, %b
@@ -46,7 +46,7 @@
 ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float, float addrspace(1) * %in0
   %b = load float, float addrspace(1) * %in1
   %acmp = fcmp oge float %a, 0.000000e+00
@@ -63,7 +63,7 @@
 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
   %a = load volatile i1, i1 addrspace(1)* %in0
   %b = load volatile i1, i1 addrspace(1)* %in1
   %xor = xor i1 %a, %b
@@ -73,7 +73,7 @@
 
 ; FUNC-LABEL: {{^}}vector_xor_i32:
 ; SI: v_xor_b32_e32
-define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32, i32 addrspace(1)* %in0
   %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, %b
@@ -83,7 +83,7 @@
 
 ; FUNC-LABEL: {{^}}scalar_xor_i32:
 ; SI: s_xor_b32
-define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = xor i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -91,7 +91,7 @@
 
 ; FUNC-LABEL: {{^}}scalar_not_i32:
 ; SI: s_not_b32
-define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
   %result = xor i32 %a, -1
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@
 
 ; FUNC-LABEL: {{^}}vector_not_i32:
 ; SI: v_not_b32
-define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32, i32 addrspace(1)* %in0
   %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, -1
@@ -111,7 +111,7 @@
 ; SI: v_xor_b32_e32
 ; SI: v_xor_b32_e32
 ; SI: s_endpgm
-define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64, i64 addrspace(1)* %in0
   %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, %b
@@ -122,7 +122,7 @@
 ; FUNC-LABEL: {{^}}scalar_xor_i64:
 ; SI: s_xor_b64
 ; SI: s_endpgm
-define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = xor i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -130,7 +130,7 @@
 
 ; FUNC-LABEL: {{^}}scalar_not_i64:
 ; SI: s_not_b64
-define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = xor i64 %a, -1
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -139,7 +139,7 @@
 ; FUNC-LABEL: {{^}}vector_not_i64:
 ; SI: v_not_b32
 ; SI: v_not_b32
-define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64, i64 addrspace(1)* %in0
   %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, -1
@@ -153,7 +153,7 @@
 
 ; FUNC-LABEL: {{^}}xor_cf:
 ; SI: s_xor_b64
-define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
+define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
@@ -178,7 +178,7 @@
 ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -192,7 +192,7 @@
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
 
@@ -211,7 +211,7 @@
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: xor_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -220,7 +220,7 @@
 ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI: s_xor_b64 [[VAL]], [[VAL]], -8
-define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, -8
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -231,7 +231,7 @@
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]]
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}}
 ; SI: s_endpgm
-define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = xor i64 %loada, -8
   store i64 %or, i64 addrspace(1)* %out
@@ -243,7 +243,7 @@
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = xor i64 %loada, 22470723082367
   store i64 %or, i64 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index 5720996..f256d89 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -9,7 +9,7 @@
 ; SI: {{^}}s_mad_zext_i32_to_i64:
 ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
 ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
-define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %tmp0 = mul i32 %a, %b
   %tmp1 = add i32 %tmp0, %c
@@ -20,7 +20,7 @@
 
 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32
 ; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %tmp0 = icmp eq i32 %a, %b
   %tmp1 = zext i1 %tmp0 to i32
@@ -29,7 +29,7 @@
 }
 
 ; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
-define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
+define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
   %ext = zext i1 %arg to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
@@ -39,7 +39,7 @@
 ; SI: s_mov_b32 s{{[0-9]+}}, 0
 ; SI: v_cmp_eq_u32
 ; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
@@ -49,7 +49,7 @@
 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; SI: buffer_store_short [[RESULT]]
-define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = zext i1 %tmp0 to i16
   store i16 %tmp1, i16 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
index 842c30b..a902234 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -11,7 +11,7 @@
 ; GCN-NOT: v[[HI]]
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %ld.64 = load volatile i64, i64 addrspace(1)* %in0
   %ld.32 = load volatile i32, i32 addrspace(1)* %in1
   %ext = zext i32 %ld.32 to i64
@@ -31,7 +31,7 @@
 ; GCN-NOT: _or_
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %ld.64 = load volatile i64, i64 addrspace(1)* %in0
   %ld.32 = load volatile i32, i32 addrspace(1)* %in1
   %ext = zext i32 %ld.32 to i64
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir b/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
index 84d3baa..5da98fb 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
@@ -6,7 +6,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
index 3277d37..7cef01c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
@@ -1,6 +1,6 @@
 # RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
 --- |
-  define void @add_f32_1.0_one_f16_use() #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -11,7 +11,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_f32_1.0_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -22,7 +22,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -33,7 +33,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -46,7 +46,7 @@
     ret void
   }
 
-  define void @add_i32_1_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_i32_1_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f16.add0 = fadd half %f16.val0, 0xH0001
@@ -56,7 +56,7 @@
     ret void
   }
 
-  define void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
+  define amdgpu_kernel void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -69,7 +69,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_multi_f32_use() #0 {
+  define amdgpu_kernel void @add_f16_1.0_multi_f32_use() #0 {
     %f32.val0 = load volatile float, float addrspace(1)* undef
     %f32.val1 = load volatile float, float addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -80,7 +80,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile half, half addrspace(1)* undef
@@ -91,7 +91,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile half, half addrspace(1)* undef
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir b/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir
index f43266e..68950a4 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir
@@ -2,7 +2,7 @@
 
 --- |
 
-  define void @use_intrin() {
+  define amdgpu_kernel void @use_intrin() {
     ret void
   }
 
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir b/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
index fec7a5d..8cffc86 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
@@ -6,7 +6,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
index 10c8128..32669de 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
@@ -7,7 +7,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
@@ -15,7 +15,7 @@
     ret void
   }
 
-  define void @float2(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float2(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
index 6cec253..2bcb3a9 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
@@ -5,7 +5,7 @@
 ; ASC-NOT: ptrtoint
 ; ASC-NOT: inttoptr
 
-define void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
 bb:
   %tmp = getelementptr inbounds float, float addrspace(3)* %arg2, i32 16
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll b/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
index aa4fb8e..36c7bd9 100644
--- a/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
+++ b/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
@@ -14,7 +14,7 @@
 ; CHECK-LABEL: @indvar_32_bit(
 ; CHECK-NOT: sext i32
 ; CHECK: phi i32
-define void @indvar_32_bit(i32 %n, i32* nocapture %output) {
+define amdgpu_kernel void @indvar_32_bit(i32 %n, i32* nocapture %output) {
 entry:
   %cmp5 = icmp sgt i32 %n, 0
   br i1 %cmp5, label %for.body.preheader, label %for.end
@@ -46,7 +46,7 @@
 ; CHECK-NOT: ashr i64
 ; CHECK-NOT: mul nsw i64
 ; CHECK-NOT: add nsw i64
-define void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   br label %for.body
 
@@ -72,7 +72,7 @@
 ; be legalized anyway.
 
 ; CHECK-LABEL: @indvar_48_bit(
-define void @indvar_48_bit(i48 %n, i48* nocapture %output) {
+define amdgpu_kernel void @indvar_48_bit(i48 %n, i48* nocapture %output) {
 entry:
   %cmp5 = icmp sgt i48 %n, 0
   br i1 %cmp5, label %for.body.preheader, label %for.end
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
index 67b4ccd..b566c14 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
@@ -45,7 +45,7 @@
 ; CHECK-LABEL: @store_global_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
 ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0
-define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
   store float 0.0, float addrspace(1)* %tmp0
   ret void
@@ -54,7 +54,7 @@
 ; CHECK-LABEL: @store_group_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
 ; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0
-define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
   store float 0.0, float addrspace(3)* %tmp0
   ret void
@@ -63,7 +63,7 @@
 ; CHECK-LABEL: @store_private_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
 ; CHECK-NEXT: store float 0.000000e+00, float* %tmp0
-define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
   store float 0.0, float* %tmp0
   ret void
@@ -74,7 +74,7 @@
 ; CHECK-NEXT: %val = load i32, i32 addrspace(1)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -87,7 +87,7 @@
 ; CHECK-NEXT: %val = load i32, i32 addrspace(3)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -100,7 +100,7 @@
 ; CHECK-NEXT: %val = load i32, i32* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -113,7 +113,7 @@
 ; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
   %val = load i32, i32 addrspace(4)* %input, align 4
   store i32 %val, i32 addrspace(4)* %output, align 4
   ret void
@@ -122,7 +122,7 @@
 ; CHECK-LABEL: @store_addrspacecast_ptr_value(
 ; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
 ; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
-define void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
   %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
   ret void
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
index aad9db6..52067cd 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
@@ -28,7 +28,7 @@
 ; CHECK: store float %v, float addrspace(3)* %tmp7, align 4
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK: ret void
-define void @load_store_lds_f32(i32 %i, float %v) #0 {
+define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 {
 bb:
   %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4
   call void @use(float %tmp)
@@ -83,7 +83,7 @@
 
 ; CHECK-LABEL: @nested_const_expr(
 ; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4
-define void @nested_const_expr() #0 {
+define amdgpu_kernel void @nested_const_expr() #0 {
   store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4
   ret void
 }
@@ -93,7 +93,7 @@
 ; CHECK-NEXT: %v = load float, float addrspace(1)* %addr
 ; CHECK-NEXT: store float %v, float addrspace(1)* %addr
 ; CHECK-NEXT: ret void
-define void @rauw(float addrspace(1)* %input) #0 {
+define amdgpu_kernel void @rauw(float addrspace(1)* %input) #0 {
 bb:
   %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)*
   %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10
@@ -117,7 +117,7 @@
 ; CHECK: %exit_cond = icmp eq float addrspace(3)* %i2, %end
 
 ; CHECK: br i1 %exit_cond, label %exit, label %loop
-define void @loop() #0 {
+define amdgpu_kernel void @loop() #0 {
 entry:
   %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
   %end = getelementptr float, float addrspace(4)* %p, i64 10
@@ -150,7 +150,7 @@
 ; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)*
 ; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end
 ; CHECK: br i1 %exit_cond, label %exit, label %loop
-define void @loop_with_generic_bound() #0 {
+define amdgpu_kernel void @loop_with_generic_bound() #0 {
 entry:
   %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
   %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
index afd1493..557a80f 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: @memset_group_to_flat(
 ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+define amdgpu_kernel void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -10,7 +10,7 @@
 
 ; CHECK-LABEL: @memset_global_to_flat(
 ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -18,7 +18,7 @@
 
 ; CHECK-LABEL: @memset_group_to_flat_no_md(
 ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
-define void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
   ret void
@@ -26,7 +26,7 @@
 
 ; CHECK-LABEL: @memset_global_to_flat_no_md(
 ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
-define void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 {
   %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
   ret void
@@ -34,7 +34,7 @@
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -42,7 +42,7 @@
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group(
 ; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 {
   %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -50,7 +50,7 @@
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group(
 ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
@@ -59,7 +59,7 @@
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global(
 ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)*
   %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
@@ -68,7 +68,7 @@
 
 ; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global(
 ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 {
+define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 {
   %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -76,7 +76,7 @@
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7
-define void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7
   ret void
@@ -84,7 +84,7 @@
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
-define void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
   ret void
@@ -93,7 +93,7 @@
 ; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
-define void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
@@ -103,14 +103,14 @@
 ; Check for iterator problems if the pointer has 2 uses in the same call
 ; CHECK-LABEL: @memcpy_group_flat_to_flat_self(
 ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 {
+define amdgpu_kernel void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
 }
 ; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group(
 ; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
index 17997052..3231b6c 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
@@ -9,7 +9,7 @@
 
 ; CHECK-LABEL: @generic_address_bitcast_const(
 ; CHECK: %vecload1 = load <2 x double>, <2 x double> addrspace(1)* bitcast (double addrspace(1)* getelementptr inbounds ([100 x double], [100 x double] addrspace(1)* @data, i64 0, i64 4) to <2 x double> addrspace(1)*), align 8
-define void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 {
+define amdgpu_kernel void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 {
 entry:
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp1 to i64
@@ -39,7 +39,7 @@
 ; CHECK: %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
 ; CHECK: %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2
 ; CHECK: %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4
-define void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 {
+define amdgpu_kernel void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
@@ -55,7 +55,7 @@
 ; CHECK: br i1
 ; CHECK: load float, float addrspace(4)*
 ; CHECK: br label
-define void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 {
+define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 {
 entry:
   %ptr = alloca float addrspace(4)*, align 8
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -85,7 +85,7 @@
 ; CHECK-LABEL: @generic_address_opt_phi_bug9776_simple_phi_kernel(
 ; CHECK: phi i32 addrspace(3)*
 ; CHECK: store i32 %i.03, i32 addrspace(3)* %
-define void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 {
+define amdgpu_kernel void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 {
 entry:
   %cmp1 = icmp eq i32 %numElems, 0
   br i1 %cmp1, label %for.end, label %for.body.lr.ph
@@ -110,7 +110,7 @@
 ; CHECK-LABEL: @generic_address_bug9899(
 ; CHECK: %vecload = load <2 x i32>, <2 x i32> addrspace(3)*
 ; CHECK: store <2 x i32> %tmp16, <2 x i32> addrspace(3)*
-define void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 {
+define amdgpu_kernel void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 {
 entry:
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp1 to i64
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll
index bcbca16..08edc20 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll
@@ -18,7 +18,7 @@
 ; CHECK-LABEL: @store_select_group_flat(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+define amdgpu_kernel void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
@@ -43,7 +43,7 @@
 ; CHECK: %2 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* %2
 ; CHECK: store i32 -1, i32 addrspace(4)* %select
-define void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 {
+define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %cast1 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
@@ -73,7 +73,7 @@
 ; CHECK-LABEL: @store_select_group_flat_null(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null
   store i32 -1, i32 addrspace(4)* %select
@@ -83,7 +83,7 @@
 ; CHECK-LABEL: @store_select_group_flat_null_swap(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* null, i32 addrspace(4)* %cast0
   store i32 -1, i32 addrspace(4)* %select
@@ -93,7 +93,7 @@
 ; CHECK-LABEL: @store_select_group_flat_undef(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* undef
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* undef
   store i32 -1, i32 addrspace(4)* %select
@@ -103,7 +103,7 @@
 ; CHECK-LABEL: @store_select_group_flat_undef_swap(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* undef, i32 addrspace(3)* %group.ptr.0
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* undef, i32 addrspace(4)* %cast0
   store i32 -1, i32 addrspace(4)* %select
@@ -114,7 +114,7 @@
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
 ; CHECK: %gep = getelementptr i32, i32 addrspace(3)* %select, i64 16
 ; CHECK: store i32 -1, i32 addrspace(3)* %gep
-define void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null
   %gep = getelementptr i32, i32 addrspace(4)* %select, i64 16
@@ -127,7 +127,7 @@
 ; CHECK-LABEL: @store_select_group_flat_constexpr(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* @lds1
 ; CHECK: store i32 7, i32 addrspace(3)* %select
-define void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -137,7 +137,7 @@
 ; CHECK-LABEL: @store_select_group_flat_inttoptr_flat(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) to i32 addrspace(3)*)
 ; CHECK: store i32 7, i32 addrspace(3)* %select
-define void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -147,7 +147,7 @@
 ; CHECK-LABEL: @store_select_group_flat_inttoptr_group(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*)
 ; CHECK-NEXT: store i32 7, i32 addrspace(3)* %select
-define void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -158,7 +158,7 @@
 ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -169,7 +169,7 @@
 ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %1
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %cast0
   store i32 7, i32 addrspace(4)* %select
@@ -179,7 +179,7 @@
 ; CHECK-LABEL: @store_select_group_global_mismatch_null_null(
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_global_mismatch_null_null(i1 %c) #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_null_null(i1 %c) #0 {
   %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
   ret void
@@ -187,42 +187,42 @@
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_null_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_null_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_null_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_gv_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_gv_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_gv_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_null_gv_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_null_gv_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_null_gv_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_flat_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4
-define void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr(
 ; CHECK: store i32 7, i32 addrspace(3)* null
-define void @store_select_group_global_mismatch_undef_undef_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* undef to i32 addrspace(4)*)), align 4
   ret void
 }
@@ -233,7 +233,7 @@
 ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*)
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -248,7 +248,7 @@
 ; CHECK: %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1
 ; CHECK: store i32 -1, i32 addrspace(4)* %extract0
 ; CHECK: store i32 -2, i32 addrspace(4)* %extract1
-define void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 {
+define amdgpu_kernel void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 {
   %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*>
   %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*>
   %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
index d9b80e9..79bf926 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: @volatile_load_flat_from_global(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(1)*
-define void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -16,7 +16,7 @@
 ; CHECK-LABEL: @volatile_load_flat_from_constant(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(1)*
-define void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(2)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -27,7 +27,7 @@
 ; CHECK-LABEL: @volatile_load_flat_from_group(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(3)*
-define void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -38,7 +38,7 @@
 ; CHECK-LABEL: @volatile_load_flat_from_private(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32*
-define void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -49,7 +49,7 @@
 ; CHECK-LABEL: @volatile_store_flat_to_global(
 ; CHECK: load i32, i32 addrspace(1)*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -60,7 +60,7 @@
 ; CHECK-LABEL: @volatile_store_flat_to_group(
 ; CHECK: load i32, i32 addrspace(3)*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -71,7 +71,7 @@
 ; CHECK-LABEL: @volatile_store_flat_to_private(
 ; CHECK: load i32, i32*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -119,7 +119,7 @@
 ; CHECK-LABEL: @volatile_memset_group_to_flat(
 ; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
 ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
-define void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+define amdgpu_kernel void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
   ret void
@@ -128,7 +128,7 @@
 ; CHECK-LABEL: @volatile_memset_global_to_flat(
 ; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
 ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
-define void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+define amdgpu_kernel void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
   ret void
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
index e6904ee..4b2dab4 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
@@ -15,7 +15,7 @@
 ; NOSCOPE: load float
 ; NOSCOPE: store float
 ; NOSCOPE: store float
-define void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   store float 0.0, float addrspace(1)* %a, align 4, !noalias !0
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
index d8f72a8..368dc6a 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -10,7 +10,7 @@
 
 ; ALIGNED: load i8, i8* %ptr0, align 1{{$}}
 ; ALIGNED: load i8, i8* %ptr1, align 1{{$}}
-define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i8], align 1
   %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
   %val0 = load i8, i8* %ptr0, align 1
@@ -27,7 +27,7 @@
 
 ; ALIGNED: load i16, i16* %ptr0, align 1{{$}}
 ; ALIGNED: load i16, i16* %ptr1, align 1{{$}}
-define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i16], align 1
   %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
   %val0 = load i16, i16* %ptr0, align 1
@@ -47,7 +47,7 @@
 
 ; ALIGNED: load i32, i32* %ptr0, align 1
 ; ALIGNED: load i32, i32* %ptr1, align 1
-define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   %val0 = load i32, i32* %ptr0, align 1
@@ -68,7 +68,7 @@
 ; FIXME: Should change alignment
 ; ALIGNED: load i32
 ; ALIGNED: load i32
-define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   %val0 = load i32, i32* %ptr0, align 1
@@ -85,7 +85,7 @@
 
 ; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}}
 ; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}}
-define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i8], align 1
   %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
   store i8 9, i8* %ptr0, align 1
@@ -100,7 +100,7 @@
 
 ; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}}
 ; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}}
-define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i16], align 1
   %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
   store i16 9, i16* %ptr0, align 1
@@ -119,7 +119,7 @@
 
 ; ALIGNED: store i32 9, i32* %ptr0, align 1
 ; ALIGNED: store i32 10, i32* %ptr1, align 1
-define void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   store i32 9, i32* %ptr0, align 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
index 25abb98..8a75b87 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
@@ -8,7 +8,7 @@
 ; CHECK: sext i32 %id.x to i64
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float> zeroinitializer
-define void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %sext.id.x = sext i32 %id.x to i64
@@ -32,7 +32,7 @@
 ; CHECK: zext i32 %id.x to i64
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %zext.id.x = zext i32 %id.x to i64
@@ -54,7 +54,7 @@
 ; CHECK-LABEL: @merge_op_zext_index(
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %shl = shl i32 %id.x, 2
@@ -81,7 +81,7 @@
 ; CHECK-LABEL: @merge_op_sext_index(
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %shl = shl i32 %id.x, 2
@@ -112,7 +112,7 @@
 ; CHECK: loop:
 ; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32>
-define void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
+define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
 entry:
   %cmp0 = icmp eq i32 %n, 0
   br i1 %cmp0, label %exit, label %loop
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
index 2b2f9cb..6182c09 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -11,7 +11,7 @@
 ; CHECK: load <2 x float>
 ; CHECK: %w = add i32 %y, 9
 ; CHECK: %foo = add i32 %z, %w
-define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
   %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
   %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
@@ -38,7 +38,7 @@
 ; CHECK: %w = add i32 %y, 9
 ; CHECK: store <2 x float>
 ; CHECK: %foo = add i32 %z, %w
-define void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
   %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
   %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
index 4d6240a..3f6d7ee 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
@@ -8,7 +8,7 @@
 ; CHECK: store double 0.000000e+00, double addrspace(1)* %a,
 ; CHECK: load double
 ; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1
-define void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
   %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
index c85be87..0fcdc7b 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -17,7 +17,7 @@
 ; ELT8-UNALIGNED: store <2 x i32>
 
 ; ELT16-UNALIGNED: store <4 x i32>
-define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
   %out.gep.3 = getelementptr i32, i32* %out, i32 3
@@ -44,7 +44,7 @@
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
-define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
   %out.gep.3 = getelementptr i32, i32* %out, i32 3
@@ -71,7 +71,7 @@
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
-define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
   %out.gep.3 = getelementptr i32, i32* %out, i32 3
@@ -85,7 +85,7 @@
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
 ; ALL: store <4 x i8>
-define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
   %out.gep.1 = getelementptr i8, i8* %out, i32 1
   %out.gep.2 = getelementptr i8, i8* %out, i32 2
   %out.gep.3 = getelementptr i8, i8* %out, i32 3
@@ -104,7 +104,7 @@
 ; ALIGNED: store i8
 
 ; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1
-define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
   %out.gep.1 = getelementptr i8, i8* %out, i32 1
   %out.gep.2 = getelementptr i8, i8* %out, i32 2
   %out.gep.3 = getelementptr i8, i8* %out, i32 3
@@ -118,7 +118,7 @@
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
 ; ALL: store <2 x i16>
-define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 4
@@ -131,7 +131,7 @@
 ; ALIGNED: store i16
 
 ; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2
-define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 2
@@ -144,7 +144,7 @@
 ; ALIGNED: store i16
 
 ; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1
-define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 1
@@ -154,7 +154,7 @@
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
 ; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8
-define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 8
@@ -179,7 +179,7 @@
 ; ELT16-ALIGNED: store i32
 
 ; ELT16-UNALIGNED: store <3 x i32>
-define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
 
@@ -202,7 +202,7 @@
 ; ELT8-UNALIGNED: store i32
 
 ; ELT16-UNALIGNED: store <3 x i32>
-define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
 
@@ -218,7 +218,7 @@
 ; ALIGNED: store i8
 
 ; UNALIGNED: store <3 x i8>
-define void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
   %out.gep.1 = getelementptr i8, i8* %out, i8 1
   %out.gep.2 = getelementptr i8, i8* %out, i8 2
 
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
index d32387f..dbb7068 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -10,7 +10,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i8(
 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
-define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -20,7 +20,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
 ; CHECK: store <2 x i8>
-define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -30,7 +30,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i16
 ; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -40,7 +40,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_0_i16
 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 0, i16 addrspace(1)* %out.gep.1
@@ -50,7 +50,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
 ; CHECK: store <2 x i16>
-define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -60,7 +60,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
 ; CHECK: store <2 x half>
-define void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 
   store half 2.0, half addrspace(1)* %out.gep.1
@@ -70,7 +70,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -80,7 +80,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
 ; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
   store float 1.0, float addrspace(1)* %out.gep.1.bc
@@ -90,7 +90,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
 ; CHECK: store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}}
-define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
   store i32 123, i32 addrspace(1)* %out.gep.1.bc
@@ -100,7 +100,7 @@
 
 ; CHECK-LABEL: @merge_global_store_4_constants_i32
 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -114,7 +114,7 @@
 
 ; CHECK-LABEL: @merge_global_store_4_constants_f32_order
 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
-define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -129,7 +129,7 @@
 ; First store is out of order.
 ; CHECK-LABEL: @merge_global_store_4_constants_f32
 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -143,7 +143,7 @@
 
 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
 ; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -160,7 +160,7 @@
 
 ; CHECK-LABEL: @merge_global_store_3_constants_i32
 ; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 
@@ -172,7 +172,7 @@
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i64
 ; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 
   store i64 123, i64 addrspace(1)* %out.gep.1
@@ -183,7 +183,7 @@
 ; CHECK-LABEL: @merge_global_store_4_constants_i64
 ; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
 ; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
@@ -202,7 +202,7 @@
 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
 ; CHECK: store <2 x i32> [[INSERT1]]
-define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -220,7 +220,7 @@
 ; CHECK: insertelement
 ; CHECK: insertelement
 ; CHECK: store <2 x i32>
-define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 
@@ -241,7 +241,7 @@
 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0
 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
 ; CHECK: store <2 x i32> [[INSERT1]]
-define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -256,7 +256,7 @@
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -279,7 +279,7 @@
 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
 ; CHECK: load <3 x i32>
 ; CHECK: store <3 x i32>
-define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -298,7 +298,7 @@
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
 ; CHECK: load <4 x float>
 ; CHECK: store <4 x float>
-define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -321,7 +321,7 @@
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
@@ -346,7 +346,7 @@
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -373,7 +373,7 @@
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -408,7 +408,7 @@
 ; CHECK: insertelement <4 x i8>
 ; CHECK: insertelement <4 x i8>
 ; CHECK: store <4 x i8>
-define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -431,7 +431,7 @@
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
 ; CHECK: load <4 x i8>
 ; CHECK: store <4 x i8>
-define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -454,7 +454,7 @@
 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -474,7 +474,7 @@
 
 ; CHECK-LABEL: @merge_local_store_2_constants_i8
 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
-define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 
   store i8 123, i8 addrspace(3)* %out.gep.1
@@ -484,7 +484,7 @@
 
 ; CHECK-LABEL: @merge_local_store_2_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
-define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1
@@ -495,7 +495,7 @@
 ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
 ; CHECK: store i32
 ; CHECK: store i32
-define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1, align 2
@@ -506,7 +506,7 @@
 ; CHECK-LABEL: @merge_local_store_4_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)*
 ; CHECK: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)*
-define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
@@ -521,7 +521,7 @@
 ; CHECK-LABEL: @merge_global_store_5_constants_i32
 ; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store i32
-define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
   store i32 9, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 12, i32 addrspace(1)* %idx1, align 4
@@ -537,7 +537,7 @@
 ; CHECK-LABEL: @merge_global_store_6_constants_i32
 ; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
   store i32 13, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 15, i32 addrspace(1)* %idx1, align 4
@@ -555,7 +555,7 @@
 ; CHECK-LABEL: @merge_global_store_7_constants_i32
 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -575,7 +575,7 @@
 ; CHECK-LABEL: @merge_global_store_8_constants_i32
 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -597,7 +597,7 @@
 ; CHECK-LABEL: @copy_v3i32_align4
 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
-define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
   ret void
@@ -606,7 +606,7 @@
 ; CHECK-LABEL: @copy_v3i64_align4
 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
-define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
   ret void
@@ -615,7 +615,7 @@
 ; CHECK-LABEL: @copy_v3f32_align4
 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 ; CHECK: store <3 x float>
-define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
@@ -625,7 +625,7 @@
 ; CHECK-LABEL: @copy_v3f64_align4
 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
-define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index 8885d61..226147d 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: @merge_v2i32_v2i32(
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32> zeroinitializer
-define void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1
@@ -22,7 +22,7 @@
 ; CHECK-LABEL: @merge_v1i32_v1i32(
 ; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32> zeroinitializer
-define void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1
@@ -41,7 +41,7 @@
 ; CHECK: load <3 x i32>
 ; CHECK: store <3 x i32> zeroinitializer
 ; CHECK: store <3 x i32> zeroinitializer
-define void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1
@@ -58,7 +58,7 @@
 ; CHECK-LABEL: @merge_v2i16_v2i16(
 ; CHECK: load <4 x i16>
 ; CHECK: store <4 x i16> zeroinitializer
-define void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1
@@ -76,7 +76,7 @@
 ; CHECK-LABEL: @merge_load_i32_v2i16(
 ; CHECK: load i32,
 ; CHECK: load <2 x i16>
-define void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1
   %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)*
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
index ba792f7..f353106 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
@@ -7,7 +7,7 @@
 
 ; CHECK-LABEL: @load_keep_base_alignment_missing_align(
 ; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
+define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
   %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
   %val0 = load float, float addrspace(3)* %ptr0
 
@@ -21,7 +21,7 @@
 
 ; CHECK-LABEL: @store_keep_base_alignment_missing_align(
 ; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define void @store_keep_base_alignment_missing_align() {
+define amdgpu_kernel void @store_keep_base_alignment_missing_align() {
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
   store float 0.0, float addrspace(3)* %arrayidx0
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
index 88eca36..8a78f3d 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -11,7 +11,7 @@
 ; CHECK: store i32 0
 ; CHECK: store i32 0
 
-define void @no_crash(i32 %arg) {
+define amdgpu_kernel void @no_crash(i32 %arg) {
   %tmp2 = add i32 %arg, 14
   %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
   %tmp4 = add i32 %arg, 15
@@ -37,7 +37,7 @@
 ; CHECK: load i32
 ; CHECK: load i32
 
-define void @interleave_get_longest(i32 %arg) {
+define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
   %a1 = add i32 %arg, 1
   %a2 = add i32 %arg, 2
   %a3 = add i32 %arg, 3
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
index 4a42953..8181895 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
@@ -5,7 +5,7 @@
 ; CHECK: store i32
 ; CHECK: store i32
 ; CHECK: store i32
-define void @no_implicit_float(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
index 141e20a..28d29f8 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: @optnone(
 ; CHECK: store i32
 ; CHECK: store i32
-define void @optnone(i32 addrspace(1)* %out) noinline optnone {
+define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -13,7 +13,7 @@
 
 ; CHECK-LABEL: @do_opt(
 ; CHECK: store <2 x i32>
-define void @do_opt(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
index 202e988..65200b9 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
@@ -9,7 +9,7 @@
 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
 ; CHECK: store <2 x i64> zeroinitializer
-define void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1
@@ -28,7 +28,7 @@
 ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
 ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
 ; CHECK: store <2 x i32> zeroinitializer
-define void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1
   %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1
@@ -46,7 +46,7 @@
 ; CHECK: load <2 x i64>
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
   %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
@@ -61,7 +61,7 @@
 ; CHECK: load <2 x i64>
 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
 ; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)*
-define void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -76,7 +76,7 @@
 ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
 ; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0
 ; CHECK: store <2 x i64>
-define void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -92,7 +92,7 @@
 ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)*
@@ -107,7 +107,7 @@
 ; CHECK: load <2 x i32>
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)*
-define void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
   %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)*
@@ -122,7 +122,7 @@
 ; CHECK: load <2 x i32>
 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0
 ; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)*
-define void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
@@ -137,7 +137,7 @@
 ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32
 ; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 ; CHECK: store <2 x i32>
-define void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
 entry:
   %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
@@ -152,7 +152,7 @@
 ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32
 ; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1
 ; CHECK: store <2 x i32>
-define void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1
   %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)*
@@ -166,7 +166,7 @@
 ; CHECK-LABEL: @no_merge_store_ptr32_i64(
 ; CHECK: store i8 addrspace(3)*
 ; CHECK: store i64
-define void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
+define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -181,7 +181,7 @@
 ; CHECK-LABEL: @no_merge_store_i64_ptr32(
 ; CHECK: store i64
 ; CHECK: store i8 addrspace(3)*
-define void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
+define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
 entry:
   %a.1 =  getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)*
@@ -195,7 +195,7 @@
 ; CHECK-LABEL: @no_merge_load_i64_ptr32(
 ; CHECK: load i64,
 ; CHECK: load i8 addrspace(3)*,
-define void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
   %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)*
@@ -209,7 +209,7 @@
 ; CHECK-LABEL: @no_merge_load_ptr32_i64(
 ; CHECK: load i8 addrspace(3)*,
 ; CHECK: load i64,
-define void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
   %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -226,7 +226,7 @@
 ; CHECK: load <2 x i8 addrspace(1)*>
 ; CHECK: store <2 x i8 addrspace(1)*>
 ; CHECK: store <2 x i8 addrspace(1)*>
-define void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
+define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1
@@ -245,7 +245,7 @@
 ; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)*
 ; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: bitcast i64 [[ELT1_INT]] to double
-define void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 =  getelementptr inbounds double, double addrspace(1)* %a, i64 1
@@ -262,7 +262,7 @@
 ; CHECK: bitcast i64 [[ELT0]] to double
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
   %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
@@ -279,7 +279,7 @@
 ; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
 entry:
   %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
@@ -296,7 +296,7 @@
 ; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)*
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
index d70c449..63e688e 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
@@ -9,7 +9,7 @@
 ; CHECK: store <4 x float>
 
 ; Function Attrs: nounwind
-define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
+define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
 bb:
   %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
   %tmp1 = load float, float addrspace(1)* %tmp, align 4
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
index 18f62be..412d201 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
@@ -16,7 +16,7 @@
 ; CHECK-LABEL: @merge_store_2_constants_i1(
 ; CHECK: store i1
 ; CHECK: store i1
-define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
   store i1 true, i1 addrspace(1)* %out.gep.1
   store i1 false, i1 addrspace(1)* %out
@@ -26,7 +26,7 @@
 ; CHECK-LABEL: @merge_store_2_constants_i2(
 ; CHECK: store i2 1
 ; CHECK: store i2 -1
-define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
   store i2 1, i2 addrspace(1)* %out.gep.1
   store i2 -1, i2 addrspace(1)* %out
@@ -36,7 +36,7 @@
 ; CHECK-LABEL: @merge_different_store_sizes_i1_i8(
 ; CHECK: store i1 true
 ; CHECK: store i8 123
-define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
   %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i1 true, i1 addrspace(1)* %out.i1
@@ -47,7 +47,7 @@
 ; CHECK-LABEL: @merge_different_store_sizes_i8_i1(
 ; CHECK: store i8 123
 ; CHECK: store i1 true
-define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
   %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -58,7 +58,7 @@
 ; CHECK-LABEL: @merge_store_2_constant_structs(
 ; CHECK: store %struct.foo
 ; CHECK: store %struct.foo
-define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
   store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1
   store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out
@@ -69,7 +69,7 @@
 ; CHECK-LABEL: @merge_store_2_constants_v2i2(
 ; CHECK: store <2 x i2>
 ; CHECK: store <2 x i2>
-define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
   store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1
   store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out
@@ -81,7 +81,7 @@
 ; CHECK-LABEL: @merge_store_2_constants_v4i2(
 ; CHECK: store <4 x i2>
 ; CHECK: store <4 x i2>
-define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
   store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1
   store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out
@@ -91,7 +91,7 @@
 ; CHECK-LABEL: @merge_load_2_constants_i1(
 ; CHECK: load i1
 ; CHECK: load i1
-define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
   %x = load i1, i1 addrspace(1)* %out.gep.1
   %y = load i1, i1 addrspace(1)* %out
@@ -103,7 +103,7 @@
 ; CHECK-LABEL: @merge_load_2_constants_i2(
 ; CHECK: load i2
 ; CHECK: load i2
-define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
   %x = load i2, i2 addrspace(1)* %out.gep.1
   %y = load i2, i2 addrspace(1)* %out
@@ -115,7 +115,7 @@
 ; CHECK-LABEL: @merge_different_load_sizes_i1_i8(
 ; CHECK: load i1
 ; CHECK: load i8
-define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
   %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   %x = load i1, i1 addrspace(1)* %out.i1
@@ -128,7 +128,7 @@
 ; CHECK-LABEL: @merge_different_load_sizes_i8_i1(
 ; CHECK: load i8
 ; CHECK: load i1
-define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
   %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
   %x = load i8, i8 addrspace(1)* %out.gep.1
@@ -141,7 +141,7 @@
 ; CHECK-LABEL: @merge_load_2_constant_structs(
 ; CHECK: load %struct.foo
 ; CHECK: load %struct.foo
-define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
   %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1
   %y = load %struct.foo, %struct.foo addrspace(1)* %out
@@ -153,7 +153,7 @@
 ; CHECK-LABEL: @merge_load_2_constants_v2i2(
 ; CHECK: load <2 x i2>
 ; CHECK: load <2 x i2>
-define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
   %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1
   %y = load <2 x i2>, <2 x i2> addrspace(1)* %out
@@ -165,7 +165,7 @@
 ; CHECK-LABEL: @merge_load_2_constants_v4i2(
 ; CHECK: load <4 x i2>
 ; CHECK: load <4 x i2>
-define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
   %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1
   %y = load <4 x i2>, <4 x i2> addrspace(1)* %out
@@ -177,7 +177,7 @@
 ; CHECK-LABEL: @merge_store_2_constants_i9(
 ; CHECK: store i9 3
 ; CHECK: store i9 -5
-define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1
   store i9 3, i9 addrspace(1)* %out.gep.1
   store i9 -5, i9 addrspace(1)* %out
@@ -187,7 +187,7 @@
 ; CHECK-LABEL: @merge_load_2_constants_v2i9(
 ; CHECK: load <2 x i9>
 ; CHECK: load <2 x i9>
-define void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1
   %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1
   %y = load <2 x i9>, <2 x i9> addrspace(1)* %out
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
index ccad351..054c61d 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -17,7 +17,7 @@
 ; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst
 ; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst
 ; OPT: br i1 %exitcond
-define void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -54,7 +54,7 @@
 ; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
 ; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %scevgep4, i32 undef, i32 undef seq_cst monotonic
-define void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
index bf61112a..c5ea1b9 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -10,7 +10,7 @@
 ; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095
 ; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1
-define void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -48,7 +48,7 @@
 ; OPT: {{^}}.lr.ph:
 ; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1
-define void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -83,7 +83,7 @@
 ; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535
 ; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1
-define void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -122,7 +122,7 @@
 ; OPT: {{^}}.lr.ph:
 ; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1
-define void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
index 89b6263..02c3c05 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
@@ -15,7 +15,7 @@
 ;CHECK: buffer_store_dword
 ;CHECK: s_branch [[LOOP_LABEL]]
 
-define void @foo() {
+define amdgpu_kernel void @foo() {
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
index 8c83df5..67b1926 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
@@ -16,7 +16,7 @@
 ; CHECK: bb:
 ; CHECK: inttoptr i32 %lsr.iv.next2 to i8 addrspace(3)*
 ; CHECK: %c1 = icmp ne i8 addrspace(3)*
-define void @local_cmp_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -47,7 +47,7 @@
 ; CHECK: bb:
 ; CHECK: inttoptr i64 %lsr.iv.next2 to i8 addrspace(1)*
 ; CHECK: icmp ne i8 addrspace(1)* %t
-define void @global_cmp_user(i64 %arg0) nounwind {
+define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -78,7 +78,7 @@
 ; CHECK: bb:
 ; CHECK: %idxprom = sext i32 %lsr.iv1 to i64
 ; CHECK: getelementptr i8, i8 addrspace(1)* %t, i64 %idxprom
-define void @global_gep_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_gep_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -108,7 +108,7 @@
 
 ; CHECK: bb
 ; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
-define void @global_sext_scale_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_sext_scale_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
index b3b696d..9eba0c3 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
@@ -14,7 +14,7 @@
 
 ; CHECK: %scevgep = getelementptr i32, i32 addrspace(3)* %tmp1, i32 4
 ; CHECK:%tmp14 = load i32, i32 addrspace(3)* %scevgep
-define void @lsr_crash_preserve_addrspace_unknown_type() #0 {
+define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
 bb:
   br label %bb1
 
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
index e732ddc..ca8cc32 100644
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
@@ -6,7 +6,7 @@
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK-NOT: br
-define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
+define amdgpu_kernel void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
index 915a13d..e986c3d 100644
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
@@ -7,7 +7,7 @@
 ; CHECK:       store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
 ; CHECK:       ret void
 
-define void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
 entry:
   %arr = alloca [64 x i32], align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -40,7 +40,7 @@
 ; CHECK:       br i1 %[[exitcond]]
 ; CHECK-NOT:   icmp eq i32 %{{.*}}, 100
 
-define void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
 entry:
   %arr = alloca [64 x i32], align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -82,7 +82,7 @@
 ; CHECK:       icmp eq i32 %{{.*}}, 100
 ; CHECK:       br
 
-define void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
 entry:
   %arr = alloca [256 x i32], align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -116,7 +116,7 @@
 ; CHECK:       icmp eq i32 %{{.*}}, 100
 ; CHECK:       br
 
-define void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
+define amdgpu_kernel void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
 entry:
   %arr = alloca i32, i32 %n, align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
index e70467a..1f106bd 100644
--- a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
+++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
@@ -3,14 +3,14 @@
 ; Check that loop unswitch happened and condition hoisted out of the loop.
 ; Condition is uniform so all targets should perform unswitching.
 
-; CHECK-LABEL: {{^}}define void @uniform_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
 ; CHECK: entry:
 ; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
 ; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
 ; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
 ; CHECK-NEXT: br i1
 
-define void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
+define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
 entry:
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
@@ -42,14 +42,14 @@
 
 ; Check that loop unswitch does not happen if condition is divergent.
 
-; CHECK-LABEL: {{^}}define void @divergent_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @divergent_unswitch
 ; CHECK: entry:
 ; CHECK: icmp
 ; CHECK: [[IF_COND:%[a-z0-9]+]] = icmp {{.*}} 567890
 ; CHECK: br label
 ; CHECK: br i1 [[IF_COND]]
 
-define void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
+define amdgpu_kernel void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
 entry:
   %cmp9 = icmp sgt i32 %n, 0
   br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
index 85ba95c..f303ed5 100644
--- a/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
@@ -7,7 +7,7 @@
 ; CHECK: store i32
 ; CHECK-NOT: store i32
 ; CHECK: ret
-define void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
+define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
 entry:
   %0 = icmp sgt i32 %size, 0
   br i1 %0, label %loop, label %exit
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
index 3576395..63c6d77 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
@@ -9,7 +9,7 @@
 
 
 ; Simple 3-pair chain with loads and stores
-define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
 ; CHECK-LABEL: @test1_as_3_3_3(
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
@@ -29,7 +29,7 @@
   ret void
 }
 
-define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
+define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
 ; CHECK-LABEL: @test1_as_3_0_0(
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
 ; CHECK: load <2 x double>, <2 x double>*
@@ -49,7 +49,7 @@
   ret void
 }
 
-define void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
 ; CHECK-LABEL: @test1_as_0_0_3(
 ; CHECK: load <2 x double>, <2 x double>*
 ; CHECK: load <2 x double>, <2 x double>*
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
index 5815ae6..23ec0ca 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -9,7 +9,7 @@
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
-define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
@@ -42,7 +42,7 @@
 ; IR: add i32 %x, 256
 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
@@ -74,7 +74,7 @@
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383
-define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
   %tmp4 = load float, float addrspace(3)* %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
index f2853ac..9554ae6 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: @slsr_after_reassociate_global_geps_mubuf_max_offset(
 ; CHECK: [[b1:%[0-9]+]] = getelementptr float, float addrspace(1)* %arr, i64 [[bump:%[0-9]+]]
 ; CHECK: [[b2:%[0-9]+]] = getelementptr float, float addrspace(1)* [[b1]], i64 [[bump]]
-define void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 1023
@@ -33,7 +33,7 @@
 ; CHECK: %tmp = sext i32 %j1 to i64
 ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
 ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
-define void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 1024
@@ -61,7 +61,7 @@
 
 ; CHECK: [[B2:%[0-9]+]] = getelementptr float, float addrspace(3)* [[B1]], i32 %i
 ; CHECK: getelementptr inbounds float, float addrspace(3)* [[B2]], i32 16383
-define void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 16383
@@ -86,7 +86,7 @@
 ; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
 ; CHECK: %j2 = add i32 %j1, %i
 ; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
-define void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 16384