AMDGPU: Bulk update some intrinsic tests to opaque pointers
Done entirely with the script.
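
For reference, the conversion is purely mechanical: typed pointer arguments and operands become plain "ptr" (keeping any address space), and the pointer-type mangling suffixes on intrinsic names collapse to just the address-space part. A minimal illustration, drawn from the llvm.amdgcn.atomic.dec test below:

  Before:
    declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1)
    %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)

  After:
    declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1)
    %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
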
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index ae4e8ee..5615a00 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -40,7 +40,7 @@
unreachable
}
-define amdgpu_kernel void @test2(i32* %p, i32 %x) {
+define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX9-LABEL: test2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
@@ -99,7 +99,7 @@
unreachable
else:
- store i32 %x, i32* %p
+ store i32 %x, ptr %p
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/cube.ll b/llvm/test/CodeGen/AMDGPU/cube.ll
index 3fd4ee6..51b7ca1 100644
--- a/llvm/test/CodeGen/AMDGPU/cube.ll
+++ b/llvm/test/CodeGen/AMDGPU/cube.ll
@@ -12,7 +12,7 @@
; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: _store_dwordx4
-define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @cube(ptr addrspace(1) %out, float %a, float %b, float %c) #1 {
%cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
%cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
%cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
@@ -22,7 +22,7 @@
%vec1 = insertelement <4 x float> %vec0, float %cubesc, i32 1
%vec2 = insertelement <4 x float> %vec1, float %cubetc, i32 2
%vec3 = insertelement <4 x float> %vec2, float %cubema, i32 3
- store <4 x float> %vec3, <4 x float> addrspace(1)* %out
+ store <4 x float> %vec3, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
index 71a599a..a3fd636 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
@@ -4,9 +4,9 @@
; GCN-LABEL: {{^}}v_alignbyte_b32:
; GCN: v_alignbyte_b32 {{[vs][0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}
-define amdgpu_kernel void @v_alignbyte_b32(i32 addrspace(1)* %out, i32 %src1, i32 %src2, i32 %src3) #1 {
+define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32 %src2, i32 %src3) #1 {
%val = call i32 @llvm.amdgcn.alignbyte(i32 %src1, i32 %src2, i32 %src3) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
index 5e9b711..e96a354 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1)
-declare i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)*, i32)
+declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
; GCN-LABEL: {{^}}buffer_atomic_csub:
; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc
@@ -22,17 +22,17 @@
; GCN-LABEL: {{^}}global_atomic_csub:
; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
-define amdgpu_kernel void @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) {
+define amdgpu_kernel void @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
main_body:
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %ptr, i32 %data)
+ %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
ret void
}
; GCN-LABEL: {{^}}global_atomic_csub_off4:
; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
-define amdgpu_kernel void @global_atomic_csub_off4(i32 addrspace(1)* %ptr, i32 %data) {
+define amdgpu_kernel void @global_atomic_csub_off4(ptr addrspace(1) %ptr, i32 %data) {
main_body:
- %p = getelementptr i32, i32 addrspace(1)* %ptr, i64 1
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %p, i32 %data)
+ %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
+ %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index dc991ae..578ba58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -2,13 +2,13 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,CIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr nocapture, i32, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32, i32, i1) #2
declare i32 @llvm.amdgcn.workitem.id.x() #1
@@ -18,9 +18,9 @@
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -30,10 +30,10 @@
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -45,8 +45,8 @@
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
-define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -56,9 +56,9 @@
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
-define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+ %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -67,9 +67,9 @@
; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
-define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -79,10 +79,10 @@
; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
-define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -92,8 +92,8 @@
; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -103,9 +103,9 @@
; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
-define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) nounwind {
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -113,13 +113,13 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
- %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out.gep
+ %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
+ %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out.gep
ret void
}
@@ -127,20 +127,20 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
- %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32* %out, i32* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32* %out
+define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr %out
ret void
}
@@ -148,18 +148,18 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}}
-define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32* %out, i32* %ptr) #0 {
- %gep = getelementptr i32, i32* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32* %out
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #0 {
+ %gep = getelementptr i32, ptr %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr %out
ret void
}
; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -167,9 +167,9 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}}
-define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32* %ptr) nounwind {
- %gep = getelementptr i32, i32* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) nounwind {
+ %gep = getelementptr i32, ptr %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -177,13 +177,13 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}}
-define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32* %ptr, i32 %id
- %out.gep = getelementptr i32, i32* %out, i32 %id
- %gep = getelementptr i32, i32* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32* %out.gep
+ %gep.tid = getelementptr i32, ptr %ptr, i32 %id
+ %out.gep = getelementptr i32, ptr %out, i32 %id
+ %gep = getelementptr i32, ptr %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr %out.gep
ret void
}
@@ -191,11 +191,11 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}}
-define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32* %ptr, i32 %id
- %gep = getelementptr i32, i32* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i32, ptr %ptr, i32 %id
+ %gep = getelementptr i32, ptr %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -203,9 +203,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
-define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64* %out
+define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr %out
ret void
}
@@ -214,10 +214,10 @@
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}}
-define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr) #0 {
- %gep = getelementptr i64, i64* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64* %out
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #0 {
+ %gep = getelementptr i64, ptr %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr %out
ret void
}
@@ -225,8 +225,8 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}}
-define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -235,9 +235,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}}
; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}}
-define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind {
- %gep = getelementptr i64, i64* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind {
+ %gep = getelementptr i64, ptr %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -246,13 +246,13 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40 glc{{$}}
-define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64* %ptr, i32 %id
- %out.gep = getelementptr i64, i64* %out, i32 %id
- %gep = getelementptr i64, i64* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64* %out.gep
+ %gep.tid = getelementptr i64, ptr %ptr, i32 %id
+ %out.gep = getelementptr i64, ptr %out, i32 %id
+ %gep = getelementptr i64, ptr %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr %out.gep
ret void
}
@@ -261,11 +261,11 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}}
; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}}
-define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64* %ptr, i32 %id
- %gep = getelementptr i64, i64* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i64, ptr %ptr, i32 %id
+ %gep = getelementptr i64, ptr %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -277,13 +277,13 @@
; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
- %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
- %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, i32 addrspace(1)* %add_use
- store i32 %val0, i32 addrspace(1)* %out
+ %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
+ %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %arrayidx0, i32 9, i32 0, i32 0, i1 false)
+ store i32 %idx.0, ptr addrspace(1) %add_use
+ store i32 %val0, ptr addrspace(1) %out
ret void
}
@@ -294,9 +294,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}}
-define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -307,10 +307,10 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32
-define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -321,8 +321,8 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}}
-define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -333,9 +333,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}}
-define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -346,9 +346,9 @@
; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
-define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -358,10 +358,10 @@
; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
-define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
- %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -371,8 +371,8 @@
; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GFX9: global_atomic_dec_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -382,9 +382,9 @@
; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
; GFX9: global_atomic_dec_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
-define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) nounwind {
+ %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -394,13 +394,13 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
-define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
- %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
- %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out.gep
+ %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
+ %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
+ %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out.gep
ret void
}
@@ -410,11 +410,11 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}}
-define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
- %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
+ %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -426,13 +426,13 @@
; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
- %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
- %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, i32 addrspace(1)* %add_use
- store i64 %val0, i64 addrspace(1)* %out
+ %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
+ %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %arrayidx0, i64 9, i32 0, i32 0, i1 false)
+ store i32 %idx.0, ptr addrspace(1) %add_use
+ store i64 %val0, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
index 84b675d..0bcc4db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
@@ -3,8 +3,8 @@
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
; GFX908: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD
@@ -42,52 +42,52 @@
; GFX90A-LABEL: {{^}}global_atomic_add_f32:
; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
-define amdgpu_ps float @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) {
+define amdgpu_ps float @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
main_body:
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+ %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
ret float %ret
}
; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4:
; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc
-define amdgpu_ps float @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) {
+define amdgpu_ps float @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) {
main_body:
- %p = getelementptr float, float addrspace(1)* %ptr, i64 1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
+ %p = getelementptr float, ptr addrspace(1) %ptr, i64 1
+ %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
ret float %ret
}
; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4:
; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc
-define amdgpu_ps float @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) {
+define amdgpu_ps float @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) {
main_body:
- %p = getelementptr float, float addrspace(1)* %ptr, i64 -1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
+ %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1
+ %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
ret float %ret
}
; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16:
; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
main_body:
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
ret <2 x half> %ret
}
; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) {
main_body:
- %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
+ %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
ret <2 x half> %ret
}
; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) {
main_body:
- %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
+ %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
ret <2 x half> %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
index 635327b..43335ad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
@@ -3,9 +3,9 @@
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float*, float)
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
+declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float)
; GCN-LABEL: {{^}}buffer_atomic_add_f32:
; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen
@@ -41,53 +41,53 @@
; GCN-LABEL: {{^}}global_atomic_add_f32:
; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) {
+define amdgpu_kernel void @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
main_body:
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+ %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
ret void
}
; GCN-LABEL: {{^}}global_atomic_add_f32_off4:
; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) {
+define amdgpu_kernel void @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) {
main_body:
- %p = getelementptr float, float addrspace(1)* %ptr, i64 1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
+ %p = getelementptr float, ptr addrspace(1) %ptr, i64 1
+ %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
ret void
}
; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4:
; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4
-define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) {
+define amdgpu_kernel void @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) {
main_body:
- %p = getelementptr float, float addrspace(1)* %ptr, i64 -1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
+ %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1
+ %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
ret void
}
; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16:
; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_kernel void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
main_body:
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
ret void
}
; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) {
main_body:
- %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
+ %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
ret void
}
; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}}
-define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) {
main_body:
- %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
+ %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
ret void
}
@@ -95,15 +95,15 @@
; the feature set.
; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget:
; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 {
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(ptr addrspace(1) %ptr, float %data) #0 {
+ %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_fadd_f32_wrong_subtarget:
; GCN: flat_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
-define amdgpu_kernel void @flat_atomic_fadd_f32_wrong_subtarget(float* %ptr, float %data) #1 {
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
+define amdgpu_kernel void @flat_atomic_fadd_f32_wrong_subtarget(ptr %ptr, float %data) #1 {
+ %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 23792c6..fcfa671 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -2,13 +2,13 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr nocapture, i32, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32, i32, i1) #2
declare i32 @llvm.amdgcn.workitem.id.x() #1
@@ -18,9 +18,9 @@
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -30,10 +30,10 @@
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -45,8 +45,8 @@
; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
-define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -56,9 +56,9 @@
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
-define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+ %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -66,9 +66,9 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
-define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -76,10 +76,10 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
-define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -87,8 +87,8 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -96,9 +96,9 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
-define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) nounwind {
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -106,13 +106,13 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
- %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32 addrspace(1)* %out.gep
+ %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
+ %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr addrspace(1) %out.gep
ret void
}
@@ -120,11 +120,11 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
- %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -133,13 +133,13 @@
; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
- %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
- %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, i32 addrspace(1)* %add_use
- store i32 %val0, i32 addrspace(1)* %out
+ %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
+ %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %arrayidx0, i32 9, i32 0, i32 0, i1 false)
+ store i32 %idx.0, ptr addrspace(1) %add_use
+ store i32 %val0, ptr addrspace(1) %out
ret void
}
@@ -147,9 +147,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}}
-define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -157,10 +157,10 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32
-define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -168,8 +168,8 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}}
-define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -177,9 +177,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}}
-define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -189,9 +189,9 @@
; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
-define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -201,10 +201,10 @@
; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
-define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
- %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out
ret void
}
@@ -215,8 +215,8 @@
; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GFX9: global_atomic_inc_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -226,9 +226,9 @@
; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
; GFX9: global_atomic_inc_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
-define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) nounwind {
+ %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -238,13 +238,13 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
-define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
- %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
- %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64 addrspace(1)* %out.gep
+ %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
+ %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
+ %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr addrspace(1) %out.gep
ret void
}
@@ -254,20 +254,20 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}}
-define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
- %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
+ %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32* %out, i32* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32* %out
+define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr %out
ret void
}
@@ -275,18 +275,18 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}}
-define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr) #0 {
- %gep = getelementptr i32, i32* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32* %out
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0 {
+ %gep = getelementptr i32, ptr %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr %out
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -294,9 +294,9 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}}
-define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32* %ptr) nounwind {
- %gep = getelementptr i32, i32* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
+ %gep = getelementptr i32, ptr %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -304,13 +304,13 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}}
-define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32* %ptr, i32 %id
- %out.gep = getelementptr i32, i32* %out, i32 %id
- %gep = getelementptr i32, i32* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, i32* %out.gep
+ %gep.tid = getelementptr i32, ptr %ptr, i32 %id
+ %out.gep = getelementptr i32, ptr %out, i32 %id
+ %gep = getelementptr i32, ptr %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
+ store i32 %result, ptr %out.gep
ret void
}
@@ -318,11 +318,11 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}}
-define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, i32* %ptr, i32 %id
- %gep = getelementptr i32, i32* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i32, ptr %ptr, i32 %id
+ %gep = getelementptr i32, ptr %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -331,13 +331,13 @@
; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
- %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
- %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, i32 addrspace(1)* %add_use
- store i64 %val0, i64 addrspace(1)* %out
+ %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
+ %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %arrayidx0, i64 9, i32 0, i32 0, i1 false)
+ store i32 %idx.0, ptr addrspace(1) %add_use
+ store i64 %val0, ptr addrspace(1) %out
ret void
}
@@ -345,9 +345,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
-define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64* %out
+define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr %out
ret void
}
@@ -356,10 +356,10 @@
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}}
-define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 {
- %gep = getelementptr i64, i64* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64* %out
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 {
+ %gep = getelementptr i64, ptr %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr %out
ret void
}
@@ -367,8 +367,8 @@
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}}
-define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -377,9 +377,9 @@
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}}
; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}}
-define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind {
- %gep = getelementptr i64, i64* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
+ %gep = getelementptr i64, ptr %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -388,13 +388,13 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}}
; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40 glc{{$}}
-define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64* %ptr, i32 %id
- %out.gep = getelementptr i64, i64* %out, i32 %id
- %gep = getelementptr i64, i64* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, i64* %out.gep
+ %gep.tid = getelementptr i64, ptr %ptr, i32 %id
+ %out.gep = getelementptr i64, ptr %out, i32 %id
+ %gep = getelementptr i64, ptr %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
+ store i64 %result, ptr %out.gep
ret void
}
@@ -403,11 +403,11 @@
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}}
; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}}
-define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, i64* %ptr, i32 %id
- %gep = getelementptr i64, i64* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+ %gep.tid = getelementptr i64, ptr %ptr, i32 %id
+ %gep = getelementptr i64, ptr %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -415,12 +415,12 @@
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 {
- %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
- %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #0 {
+ %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
+ %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result0, i32 addrspace(1)* %out0
- store i32 %result1, i32 addrspace(1)* %out1
+ store i32 %result0, ptr addrspace(1) %out0
+ store i32 %result1, ptr addrspace(1) %out1
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index dcf06ae..ec68daa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -117,12 +117,12 @@
; CHECK-LABEL: buffer_load_mmo:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
entry:
- store float 0.0, float addrspace(3)* %lds
+ store float 0.0, ptr addrspace(3) %lds
%val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
- store float 0.0, float addrspace(3)* %tmp2
+ %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
+ store float 0.0, ptr addrspace(3) %tmp2
ret float %val
}
@@ -448,7 +448,7 @@
; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
%alloca = alloca i32, addrspace(5)
- %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+ %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32
%ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
ret float %ret.val
@@ -460,7 +460,7 @@
; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]]
define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
%alloca = alloca i32, addrspace(5)
- %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+ %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32
%ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
ret float %ret.val
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
index 05b125f..4e00a70 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
@@ -9,10 +9,10 @@
; VI: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
; GCN: _store_byte
; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @test_buffer_wbinvl1_vol(i8 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_vol(ptr addrspace(1) %ptr) #0 {
call void @llvm.amdgcn.buffer.wbinvl1.vol()
; This used to crash in hazard recognizer
- store i8 0, i8 addrspace(1)* %ptr, align 1
+ store i8 0, ptr addrspace(1) %ptr, align 1
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index b0f8754..89dbe9b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -11,15 +11,15 @@
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16(
- i32 addrspace(1)* %r,
- half addrspace(1)* %a,
- i32 addrspace(1)* %b) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b) {
entry:
- %a.val = load half, half addrspace(1)* %a
- %b.val = load i32, i32 addrspace(1)* %b
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load i32, ptr addrspace(1) %b
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 %b.val)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -32,7 +32,7 @@
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_fabs(
- i32 addrspace(1)* %r,
+ ptr addrspace(1) %r,
[8 x i32],
half %a.val,
[8 x i32],
@@ -41,7 +41,7 @@
%a.val.fabs = call half @llvm.fabs.f16(half %a.val)
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fabs, i32 %b.val)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -54,7 +54,7 @@
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_fneg(
- i32 addrspace(1)* %r,
+ ptr addrspace(1) %r,
[8 x i32],
half %a.val,
[8 x i32],
@@ -63,7 +63,7 @@
%a.val.fneg = fsub half -0.0, %a.val
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fneg, i32 %b.val)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -76,7 +76,7 @@
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_fabs_fneg(
- i32 addrspace(1)* %r,
+ ptr addrspace(1) %r,
[8 x i32],
half %a.val,
[8 x i32],
@@ -86,7 +86,7 @@
%a.val.fabs.fneg = fsub half -0.0, %a.val.fabs
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fabs.fneg, i32 %b.val)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -97,12 +97,12 @@
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_1(
- i32 addrspace(1)* %r,
+ ptr addrspace(1) %r,
half %a.val) {
entry:
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 1)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -113,12 +113,12 @@
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_64(
- i32 addrspace(1)* %r,
+ ptr addrspace(1) %r,
half %a.val) {
entry:
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 64)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -130,12 +130,12 @@
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_full_mask(
- i32 addrspace(1)* %r,
+ ptr addrspace(1) %r,
half %a.val) {
entry:
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 1023)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -147,11 +147,11 @@
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_nine_bit_mask(
- i32 addrspace(1)* %r,
+ ptr addrspace(1) %r,
half %a.val) {
entry:
%r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 511)
%r.val.sext = sext i1 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 311d74b..98b1198 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -14,10 +14,10 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -29,11 +29,11 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -45,11 +45,11 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%a.fneg = fsub float -0.0, %a
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -61,12 +61,12 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%a.fneg.fabs = fsub float -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -76,10 +76,10 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -89,10 +89,10 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -104,10 +104,10 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -118,10 +118,10 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -132,15 +132,15 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.in
+ %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.in
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ store i32 %sext, ptr addrspace(1) %gep.out, align 4
ret void
}
@@ -150,15 +150,15 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %b = load i32, i32 addrspace(1)* %gep.in
+ %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %b = load i32, ptr addrspace(1) %gep.in
%result = call i1 @llvm.amdgcn.class.f32(float 1.0, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ store i32 %sext, ptr addrspace(1) %gep.out, align 4
ret void
}
@@ -170,15 +170,15 @@
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %b = load i32, i32 addrspace(1)* %gep.in
+ %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %b = load i32, ptr addrspace(1) %gep.in
%result = call i1 @llvm.amdgcn.class.f32(float 1024.0, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ store i32 %sext, ptr addrspace(1) %gep.out, align 4
ret void
}
@@ -190,10 +190,10 @@
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -205,11 +205,11 @@
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -221,11 +221,11 @@
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%a.fneg = fsub double -0.0, %a
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -237,32 +237,32 @@
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%a.fneg.fabs = fsub double -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_1_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
-define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_64_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
-define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -275,10 +275,10 @@
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x i32], double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -290,15 +290,15 @@
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %in
+ %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load double, ptr addrspace(1) %in
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ store i32 %sext, ptr addrspace(1) %gep.out, align 4
ret void
}
@@ -306,30 +306,30 @@
; XSI: v_cmp_class_f64_e32 vcc, 1.0,
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
-define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %b = load i32, i32 addrspace(1)* %gep.in
+ %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %b = load i32, ptr addrspace(1) %gep.in
%result = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ store i32 %sext, ptr addrspace(1) %gep.out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
-define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %b = load i32, i32 addrspace(1)* %gep.in
+ %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %b = load i32, ptr addrspace(1) %gep.in
%result = call i1 @llvm.amdgcn.class.f64(double 1024.0, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ store i32 %sext, ptr addrspace(1) %gep.out, align 4
ret void
}
@@ -338,18 +338,18 @@
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.in
+ %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.in
%class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 3) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -358,11 +358,11 @@
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.in
+ %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.in
%class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
@@ -371,7 +371,7 @@
%or.1 = or i1 %or.0, %class2
%sext = sext i1 %or.1 to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -381,11 +381,11 @@
; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.in
+ %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.in
%class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
@@ -407,7 +407,7 @@
%or.7 = or i1 %or.6, %class8
%or.8 = or i1 %or.7, %class9
%sext = sext i1 %or.8 to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -416,18 +416,18 @@
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.in
+ %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.in
%class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
%class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -436,18 +436,18 @@
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.in
+ %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.in
%class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
%class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -456,18 +456,18 @@
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
-define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.in
+ %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.in
%class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
%class1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -476,10 +476,10 @@
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -488,10 +488,10 @@
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -500,10 +500,10 @@
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a, i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
%sext = sext i1 %result to i32
- store i32 %sext, i32 addrspace(1)* %out, align 4
+ store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
@@ -538,7 +538,7 @@
; SI: s_and_b64
define i1 @test_fold_and_ord_multi_use(float %a) {
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
- store volatile i1 %class, i1 addrspace(1)* undef
+ store volatile i1 %class, ptr addrspace(1) undef
%ord = fcmp ord float %a, %a
%and = and i1 %ord, %class
ret i1 %and
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
index 0543886..1f78dd7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
@@ -8,11 +8,11 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @cos_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.cos.f16(half %a.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
index 7209d94..68438ce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -5,9 +5,9 @@
; GCN-LABEL: {{^}}v_cos_f32:
; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_cos_f32(ptr addrspace(1) %out, float %src) #1 {
%cos = call float @llvm.amdgcn.cos.f32(float %src) #0
- store float %cos, float addrspace(1)* %out
+ store float %cos, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
index 2d3e3ee..cb71a99 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -5,9 +5,9 @@
; GCN-LABEL: {{^}}test_cubeid:
; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubeid(ptr addrspace(1) %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
- store float %result, float addrspace(1)* %out
+ store float %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
index 48c55e9..8ab8742 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -5,9 +5,9 @@
; GCN-LABEL: {{^}}test_cubema:
; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubema(ptr addrspace(1) %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
- store float %result, float addrspace(1)* %out
+ store float %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
index 0733646..eee7bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -5,9 +5,9 @@
; GCN-LABEL: {{^}}test_cubesc:
; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubesc(ptr addrspace(1) %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
- store float %result, float addrspace(1)* %out
+ store float %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
index c3a8c9e..28ce72e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -5,9 +5,9 @@
; GCN-LABEL: {{^}}test_cubetc:
; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubetc(ptr addrspace(1) %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
- store float %result, float addrspace(1)* %out
+ store float %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
index bc9e8ef..70f6bc2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
@@ -8,20 +8,20 @@
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
-define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @s_cvt_pk_i16_i32(ptr addrspace(1) %out, i32 %x, i32 %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
-define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(ptr addrspace(1) %out, i32 %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
@@ -30,32 +30,32 @@
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pk_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.gep
- %b = load volatile i32, i32 addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile i32, ptr addrspace(1) %a.gep
+ %b = load volatile i32, ptr addrspace(1) %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
-define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile i32, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -63,15 +63,15 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]]
-define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile i32, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
index 817eb10..59c19d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
@@ -8,20 +8,20 @@
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
-define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @s_cvt_pk_u16_u32(ptr addrspace(1) %out, i32 %x, i32 %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
-define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(ptr addrspace(1) %out, i32 %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
@@ -30,32 +30,32 @@
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pk_u16_u32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.gep
- %b = load volatile i32, i32 addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile i32, ptr addrspace(1) %a.gep
+ %b = load volatile i32, ptr addrspace(1) %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
-define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile i32, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -63,15 +63,15 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]]
-define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile i32, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
index 25980e2..aef8449 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
@@ -8,20 +8,20 @@
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
-define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @s_cvt_pknorm_i16_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
-define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(ptr addrspace(1) %out, float %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
@@ -30,32 +30,32 @@
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
-define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -63,15 +63,15 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]]
-define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -79,18 +79,18 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
-define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.a = fsub float -0.0, %a
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -98,18 +98,18 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
-define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -117,19 +117,19 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
-define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.a = fsub float -0.0, %a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -137,20 +137,20 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
-define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%fabs.a = call float @llvm.fabs.f32(float %a)
%neg.fabs.a = fsub float -0.0, %fabs.a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
index 8f9a494..21f104b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
@@ -8,20 +8,20 @@
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]]
-define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @s_cvt_pknorm_u16_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
-define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(ptr addrspace(1) %out, float %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x)
%r = bitcast <2 x i16> %result to i32
- store i32 %r, i32 addrspace(1)* %out
+ store i32 %r, ptr addrspace(1) %out
ret void
}
@@ -30,32 +30,32 @@
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
-define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -63,15 +63,15 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]]
-define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -79,18 +79,18 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
-define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.a = fsub float -0.0, %a
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -98,18 +98,18 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
-define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -117,19 +117,19 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
-define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.a = fsub float -0.0, %a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
@@ -137,20 +137,20 @@
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
-define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%fabs.a = call float @llvm.fabs.f32(float %a)
%neg.fabs.a = fsub float -0.0, %fabs.a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
- store i32 %r, i32 addrspace(1)* %out.gep
+ store i32 %r, ptr addrspace(1) %out.gep
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 64e442a..bc756b4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -5,7 +5,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
-define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -59,11 +59,11 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
- store <2 x half> %result, <2 x half> addrspace(1)* %out
+ store <2 x half> %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
@@ -119,11 +119,11 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
- store <2 x half> %result, <2 x half> addrspace(1)* %out
+ store <2 x half> %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 {
; GCN-LABEL: s_cvt_pkrtz_undef_undef:
; GCN: ; %bb.0:
; GCN-NEXT: s_endpgm
@@ -136,11 +136,11 @@
; GFX11: ; %bb.0:
; GFX11-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
- store <2 x half> %result, <2 x half> addrspace(1)* %out
+ store <2 x half> %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -230,17 +230,17 @@
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
- store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ store <2 x half> %cvt, ptr addrspace(1) %out.gep
ret void
}
-define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -309,15 +309,15 @@
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
- store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ store <2 x half> %cvt, ptr addrspace(1) %out.gep
ret void
}
-define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -386,15 +386,15 @@
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
- store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ store <2 x half> %cvt, ptr addrspace(1) %out.gep
ret void
}
-define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -484,18 +484,18 @@
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.a = fsub float -0.0, %a
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
- store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ store <2 x half> %cvt, ptr addrspace(1) %out.gep
ret void
}
-define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -585,18 +585,18 @@
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.b = fsub float -0.0, %b
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
- store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ store <2 x half> %cvt, ptr addrspace(1) %out.gep
ret void
}
-define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -686,19 +686,19 @@
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%neg.a = fsub float -0.0, %a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
- store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ store <2 x half> %cvt, ptr addrspace(1) %out.gep
ret void
}
-define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -788,16 +788,16 @@
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
- %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
- %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
- %a = load volatile float, float addrspace(1)* %a.gep
- %b = load volatile float, float addrspace(1)* %b.gep
+ %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+ %a = load volatile float, ptr addrspace(1) %a.gep
+ %b = load volatile float, ptr addrspace(1) %b.gep
%fabs.a = call float @llvm.fabs.f32(float %a)
%neg.fabs.a = fsub float -0.0, %fabs.a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
- store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ store <2 x half> %cvt, ptr addrspace(1) %out.gep
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
index 5279f8e..c4c7ce0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
@@ -9,9 +9,9 @@
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 {
%tmp0 = call i64 @llvm.amdgcn.dispatch.id()
- store i64 %tmp0, i64 addrspace(1)* %out
+ store i64 %tmp0, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 42826b7..06500ad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -6,11 +6,10 @@
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_dispatch_ptr = 1
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
- %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
- %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
- %value = load i32, i32 addrspace(4)* %header_ptr
- store i32 %value, i32 addrspace(1)* %out
+define amdgpu_kernel void @test(ptr addrspace(1) %out) {
+ %dispatch_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
+ %value = load i32, ptr addrspace(4) %dispatch_ptr
+ store i32 %value, ptr addrspace(1) %out
ret void
}
@@ -20,16 +19,15 @@
; GCN: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
; GCN-NOT: load_ushort
; GCN: s_endpgm
-define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
- %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
- %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
- %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
- %v1 = load i16, i16 addrspace(4)* %h1
+define amdgpu_kernel void @test2(ptr addrspace(1) %out) {
+ %dispatch_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
+ %d1 = getelementptr inbounds i8, ptr addrspace(4) %dispatch_ptr, i64 6
+ %v1 = load i16, ptr addrspace(4) %d1
%e1 = zext i16 %v1 to i32
- store i32 %e1, i32 addrspace(1)* %out
+ store i32 %e1, ptr addrspace(1) %out
ret void
}
-declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index 7b36521f1..b7d1d4e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -10,16 +10,16 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @div_fixup_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %b,
- half addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load volatile half, half addrspace(1)* %a
- %b.val = load volatile half, half addrspace(1)* %b
- %c.val = load volatile half, half addrspace(1)* %c
+ %a.val = load volatile half, ptr addrspace(1) %a
+ %b.val = load volatile half, ptr addrspace(1) %b
+ %c.val = load volatile half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -31,14 +31,14 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @div_fixup_f16_imm_a(
- half addrspace(1)* %r,
- half addrspace(1)* %b,
- half addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %b.val = load volatile half, half addrspace(1)* %b
- %c.val = load volatile half, half addrspace(1)* %c
+ %b.val = load volatile half, ptr addrspace(1) %b
+ %c.val = load volatile half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -50,14 +50,14 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @div_fixup_f16_imm_b(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load volatile half, half addrspace(1)* %a
- %c.val = load volatile half, half addrspace(1)* %c
+ %a.val = load volatile half, ptr addrspace(1) %a
+ %c.val = load volatile half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -69,14 +69,14 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @div_fixup_f16_imm_c(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %b) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b) {
entry:
- %a.val = load volatile half, half addrspace(1)* %a
- %b.val = load volatile half, half addrspace(1)* %b
+ %a.val = load volatile half, ptr addrspace(1) %a
+ %b.val = load volatile half, ptr addrspace(1) %b
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -87,12 +87,12 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b(
- half addrspace(1)* %r,
- half addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %c) {
entry:
- %c.val = load volatile half, half addrspace(1)* %c
+ %c.val = load volatile half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -103,12 +103,12 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c(
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half 3.0)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -119,11 +119,11 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c(
- half addrspace(1)* %r,
- half addrspace(1)* %b) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %b) {
entry:
- %b.val = load half, half addrspace(1)* %b
+ %b.val = load half, ptr addrspace(1) %b
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half 3.0)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index 4d9921f..1f58f18 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -18,16 +18,16 @@
; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) nounwind {
%result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fixup_f64:
; GCN: v_div_fixup_f64
-define amdgpu_kernel void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f64(ptr addrspace(1) %out, double %a, double %b, double %c) nounwind {
%result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
- store double %result, double addrspace(1)* %out, align 8
+ store double %result, ptr addrspace(1) %out, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 113f121..22b801c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -23,9 +23,9 @@
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -36,9 +36,9 @@
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
; SI: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -53,9 +53,9 @@
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -70,45 +70,45 @@
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f64:
; GCN: v_div_fmas_f64
-define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
- store double %result, double addrspace(1)* %out, align 8
+ store double %result, ptr addrspace(1) %out, align 8
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) nounwind {
%cmp = icmp eq i32 %i, 0
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
; GCN: s_mov_b64 vcc, 0
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
; GCN: s_mov_b64 vcc, -1
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -123,23 +123,23 @@
; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %d) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
- %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
- %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
+ %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
+ %gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2
+ %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
- %a = load volatile float, float addrspace(1)* %gep.a
- %b = load volatile float, float addrspace(1)* %gep.b
- %c = load volatile float, float addrspace(1)* %gep.c
+ %a = load volatile float, ptr addrspace(1) %gep.a
+ %b = load volatile float, ptr addrspace(1) %gep.b
+ %c = load volatile float, ptr addrspace(1) %gep.c
%cmp0 = icmp eq i32 %tid, 0
%cmp1 = icmp ne i32 %d, 0
%and = and i1 %cmp0, %cmp1
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
- store float %result, float addrspace(1)* %gep.out, align 4
+ store float %result, ptr addrspace(1) %gep.out, align 4
ret void
}
@@ -162,29 +162,29 @@
; SI: buffer_store_dword
; SI: s_endpgm
-define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
- %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
- %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
+ %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
+ %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
+ %gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2
- %a = load float, float addrspace(1)* %gep.a
- %b = load float, float addrspace(1)* %gep.b
- %c = load float, float addrspace(1)* %gep.c
+ %a = load float, ptr addrspace(1) %gep.a
+ %b = load float, ptr addrspace(1) %gep.b
+ %c = load float, ptr addrspace(1) %gep.c
%cmp0 = icmp eq i32 %tid, 0
br i1 %cmp0, label %bb, label %exit
bb:
- %val = load i32, i32 addrspace(1)* %dummy
+ %val = load i32, ptr addrspace(1) %dummy
%cmp1 = icmp ne i32 %val, 0
br label %exit
exit:
%cond = phi i1 [false, %entry], [%cmp1, %bb]
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
- store float %result, float addrspace(1)* %gep.out, align 4
+ store float %result, ptr addrspace(1) %gep.out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 8f3d77b..d8e1da4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -11,17 +11,17 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile float, float addrspace(1)* %gep.0, align 4
- %b = load volatile float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+ %b = load volatile float, ptr addrspace(1) %gep.1, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -31,17 +31,17 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile float, float addrspace(1)* %gep.0, align 4
- %b = load volatile float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+ %b = load volatile float, ptr addrspace(1) %gep.1, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -51,17 +51,17 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile double, double addrspace(1)* %gep.0, align 8
- %b = load volatile double, double addrspace(1)* %gep.1, align 8
+ %a = load volatile double, ptr addrspace(1) %gep.0, align 8
+ %b = load volatile double, ptr addrspace(1) %gep.1, align 8
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -71,17 +71,17 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile double, double addrspace(1)* %gep.0, align 8
- %b = load volatile double, double addrspace(1)* %gep.1, align 8
+ %a = load volatile double, ptr addrspace(1) %gep.0, align 8
+ %b = load volatile double, ptr addrspace(1) %gep.1, align 8
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -91,15 +91,15 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
- %b = load float, float addrspace(1)* %gep, align 4
+ %b = load float, ptr addrspace(1) %gep, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -109,15 +109,15 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
- %b = load float, float addrspace(1)* %gep, align 4
+ %b = load float, ptr addrspace(1) %gep, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -127,15 +127,15 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
- %a = load float, float addrspace(1)* %gep, align 4
+ %a = load float, ptr addrspace(1) %gep, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -145,15 +145,15 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
- %a = load float, float addrspace(1)* %gep, align 4
+ %a = load float, ptr addrspace(1) %gep, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -163,15 +163,15 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, double %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
- %b = load double, double addrspace(1)* %gep, align 8
+ %b = load double, ptr addrspace(1) %gep, align 8
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -181,15 +181,15 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, double %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
- %b = load double, double addrspace(1)* %gep, align 8
+ %b = load double, ptr addrspace(1) %gep, align 8
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -199,15 +199,15 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, double %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
- %a = load double, double addrspace(1)* %gep, align 8
+ %a = load double, ptr addrspace(1) %gep, align 8
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -217,15 +217,15 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, double %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
- %a = load double, double addrspace(1)* %gep, align 8
+ %a = load double, ptr addrspace(1) %gep, align 8
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -236,10 +236,10 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -250,10 +250,10 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -265,10 +265,10 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v[[[VA_LO]]:[[VA_HI]]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -280,10 +280,10 @@
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v[[[VB_LO]]:[[VB_HI]]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
@@ -292,14 +292,14 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.0, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -308,14 +308,14 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %a = load float, ptr addrspace(1) %gep.0, align 4
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -325,19 +325,19 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], -[[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_fneg_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fneg_num(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile float, float addrspace(1)* %gep.0, align 4
- %b = load volatile float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+ %b = load volatile float, ptr addrspace(1) %gep.1, align 4
%a.fneg = fneg float %a
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fneg, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -348,19 +348,19 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile float, float addrspace(1)* %gep.0, align 4
- %b = load volatile float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+ %b = load volatile float, ptr addrspace(1) %gep.1, align 4
%a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -370,19 +370,19 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], -[[B]], -[[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_fneg_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fneg_den(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile float, float addrspace(1)* %gep.0, align 4
- %b = load volatile float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+ %b = load volatile float, ptr addrspace(1) %gep.1, align 4
%b.fneg = fneg float %b
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fneg, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -393,49 +393,49 @@
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
- %a = load volatile float, float addrspace(1)* %gep.0, align 4
- %b = load volatile float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+ %b = load volatile float, ptr addrspace(1) %gep.1, align 4
%b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], v{{[0-9]+}}, [[K]]
-define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %out) #0 {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_div_scale_f32_undef_val_val:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], v{{[0-9]+}}
-define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %out) #0 {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val:
; SI-NOT: v0
; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s0, s0, v0
-define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %out) #0 {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
%result0 = extractvalue { float, i1 } %result, 0
- store float %result0, float addrspace(1)* %out, align 4
+ store float %result0, ptr addrspace(1) %out, align 4
ret void
}
@@ -443,10 +443,10 @@
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000
; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]], v[0:1], s[[[K_LO]]:[[K_HI]]]
-define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 {
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
%result0 = extractvalue { double, i1 } %result, 0
- store double %result0, double addrspace(1)* %out, align 8
+ store double %result0, ptr addrspace(1) %out, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
index 7675102..9c0bacd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
@@ -20,7 +20,7 @@
ret void
}
-define amdgpu_gs void @test_add_32_use(i32 %arg, i32 addrspace(1)* %out) {
+define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_add_32_use:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -34,7 +34,7 @@
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
- store i32 %res, i32 addrspace(1)* %out, align 4
+ store i32 %res, ptr addrspace(1) %out, align 4
ret void
}
@@ -53,7 +53,7 @@
ret void
}
-define amdgpu_gs void @test_add_64_use(i32 %arg, i64 addrspace(1)* %out) {
+define amdgpu_gs void @test_add_64_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_add_64_use:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -67,6 +67,6 @@
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
- store i64 %res, i64 addrspace(1)* %out, align 4
+ store i64 %res, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
index 3005437..40c014d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
@@ -13,9 +13,9 @@
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_append_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_append_lds(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %lds, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -25,10 +25,10 @@
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_append_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
- %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_append_lds_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
+ %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -44,11 +44,11 @@
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_append_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
- %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
- %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_append_no_fold_offset_si(ptr addrspace(4) %lds.ptr, ptr addrspace(1) %out) #0 {
+ %lds = load ptr addrspace(3), ptr addrspace(4) %lds.ptr, align 4
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 4
+ %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -65,10 +65,10 @@
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_append_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
- %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_append_lds_over_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16384
+ %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -81,9 +81,9 @@
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define void @ds_append_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define void @ds_append_lds_vgpr_addr(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %lds, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -93,9 +93,9 @@
; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_append_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
- %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gds, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_append_gds(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
+ %val = call i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) %gds, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -105,19 +105,19 @@
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_append_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
- %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_append_gds_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16383
+ %val = call i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
; GCN-NOT: buffer_wbinvl1
-define amdgpu_kernel void @ds_append_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
- %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_append_gds_over_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16384
+ %val = call i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -130,10 +130,10 @@
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
-define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %val0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
- store i32 %val0, i32 addrspace(1)* %out
- %val1 = load volatile i32, i32 addrspace(3)* %lds
+define amdgpu_kernel void @ds_append_lds_m0_restore(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %val0 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %lds, i1 false)
+ store i32 %val0, ptr addrspace(1) %out
+ %val1 = load volatile i32, ptr addrspace(3) %lds
ret void
}
@@ -142,14 +142,14 @@
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
-define amdgpu_kernel void @ds_append_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
- %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
+define amdgpu_kernel void @ds_append_lds_no_use(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
+ %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
ret void
}
-declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1
-declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1
+declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #1
+declare i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) nocapture, i1 immarg) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index f6f3a36d..90e18a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -4,49 +4,49 @@
; CHECK-LABEL: {{^}}ds_bpermute:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute(ptr addrspace(1) %out, i32 %index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
- store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
-define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_offset(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
- store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
-define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_index(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
- store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}ds_bpermute_add_shl:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_add_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define void @ds_bpermute_add_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
%index = add i32 %base_index, 1
%byte_index = shl i32 %index, 2
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
- store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}ds_bpermute_or_shl:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_or_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define void @ds_bpermute_or_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
%masked = and i32 %base_index, 62
%index = or i32 %masked, 1
%byte_index = shl i32 %index, 2
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
- store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
index 085eb9d..20c4b04 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
@@ -4,7 +4,7 @@
declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32, i32, <4 x i32>, i32 immarg)
-define amdgpu_gs void @test_ds_bvh_stack(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
+define amdgpu_gs void @test_ds_bvh_stack(i32 %addr, i32 %data0, <4 x i32> %data1, ptr addrspace(1) %out) {
; CHECK-LABEL: test_ds_bvh_stack:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5]
@@ -17,11 +17,11 @@
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
%res = add i32 %vdst, %newaddr
- store i32 %res, i32 addrspace(1)* %out, align 4
+ store i32 %res, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_gs void @test_ds_bvh_stack_1(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
+define amdgpu_gs void @test_ds_bvh_stack_1(i32 %addr, i32 %data0, <4 x i32> %data1, ptr addrspace(1) %out) {
; CHECK-LABEL: test_ds_bvh_stack_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5] offset:1
@@ -34,6 +34,6 @@
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
%res = add i32 %vdst, %newaddr
- store i32 %res, i32 addrspace(1)* %out, align 4
+ store i32 %res, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
index 59c6549..a0d2f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
@@ -13,9 +13,9 @@
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_consume_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_consume_lds(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %lds, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -25,10 +25,10 @@
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_consume_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
- %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_consume_lds_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
+ %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -44,11 +44,11 @@
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_consume_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
- %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
- %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_consume_no_fold_offset_si(ptr addrspace(4) %lds.ptr, ptr addrspace(1) %out) #0 {
+ %lds = load ptr addrspace(3), ptr addrspace(4) %lds.ptr, align 4
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 4
+ %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -63,10 +63,10 @@
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_consume_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
- %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_consume_lds_over_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16384
+ %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -77,9 +77,9 @@
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define void @ds_consume_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define void @ds_consume_lds_vgpr_addr(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %lds, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -89,9 +89,9 @@
; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_consume_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
- %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gds, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_consume_gds(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
+ %val = call i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) %gds, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -101,19 +101,19 @@
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
-define amdgpu_kernel void @ds_consume_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
- %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_consume_gds_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16383
+ %val = call i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
; GCN-NOT: buffer_wbinvl1
-define amdgpu_kernel void @ds_consume_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
- %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_consume_gds_over_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16384
+ %val = call i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) %gep, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -126,10 +126,10 @@
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
-define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %val0 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
- store i32 %val0, i32 addrspace(1)* %out
- %val1 = load volatile i32, i32 addrspace(3)* %lds
+define amdgpu_kernel void @ds_consume_lds_m0_restore(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %val0 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %lds, i1 false)
+ store i32 %val0, ptr addrspace(1) %out
+ %val1 = load volatile i32, ptr addrspace(3) %lds
ret void
}
@@ -138,14 +138,14 @@
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
-define amdgpu_kernel void @ds_consume_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
- %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
- %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
+define amdgpu_kernel void @ds_consume_lds_no_use(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
+ %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
+ %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
ret void
}
-declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1
-declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1
+declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #1
+declare i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) nocapture, i1 immarg) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index d7a4ba9..557683b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -144,9 +144,9 @@
; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
- store i32 1, i32 addrspace(3)* @lds
+ store i32 1, ptr addrspace(3) @lds
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
- store i32 2, i32 addrspace(3)* @lds
+ store i32 2, ptr addrspace(3) @lds
ret void
}
@@ -165,8 +165,8 @@
; GCN-LABEL: {{^}}gws_barrier_wait_before:
; NOLOOP: s_waitcnt
; NOLOOP-NOT: s_waitcnt{{$}}
-define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
- store i32 0, i32 addrspace(1)* %ptr
+define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, ptr addrspace(1) %ptr) #0 {
+ store i32 0, ptr addrspace(1) %ptr
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
ret void
}
@@ -176,9 +176,9 @@
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP: load_{{dword|b32}}
-define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, ptr addrspace(1) %ptr) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
- %load = load volatile i32, i32 addrspace(1)* %ptr
+ %load = load volatile i32, ptr addrspace(1) %ptr
ret void
}
@@ -189,8 +189,8 @@
; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0)
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
- store i32 0, i32 addrspace(1)* %ptr
+define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, ptr addrspace(1) %ptr) #0 {
+ store i32 0, ptr addrspace(1) %ptr
fence release
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
ret void
@@ -204,10 +204,10 @@
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: load_{{dword|b32}}
-define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, ptr addrspace(1) %ptr) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
fence release
- %load = load volatile i32, i32 addrspace(1)* %ptr
+ %load = load volatile i32, ptr addrspace(1) %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
index f87a3ea..f658ab3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
@@ -137,9 +137,9 @@
; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
- store volatile i32 1, i32 addrspace(3)* @lds
+ store volatile i32 1, ptr addrspace(3) @lds
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
- store i32 2, i32 addrspace(3)* @lds
+ store i32 2, ptr addrspace(3) @lds
ret void
}
@@ -159,8 +159,8 @@
; NOLOOP-NOT: s_waitcnt
; NOLOOP: ds_gws_init
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
- store i32 0, i32 addrspace(1)* %ptr
+define amdgpu_kernel void @gws_init_wait_before(i32 %val, ptr addrspace(1) %ptr) #0 {
+ store i32 0, ptr addrspace(1) %ptr
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll
index 6a9d10f..fd501ef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll
@@ -7,9 +7,9 @@
; GCN-DAG: v_{{(dual_)?}}mov_b32{{(_e32)?}} v[[INCR:[0-9]+]], 31
; GCN-DAG: s_mov_b32 m0,
; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
-define amdgpu_kernel void @ds_ordered_add(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_ordered_add(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -17,10 +17,10 @@
; GCN-DAG: v_{{(dual_)?}}mov_b32{{(_e32)?}} v[[INCR:[0-9]+]], 31
; GCN-DAG: s_mov_b32 m0,
; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:49924 gds
-define amdgpu_kernel void @ds_ordered_add_4dw(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 67108865, i1 true, i1 true)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_ordered_add_4dw(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 67108865, i1 true, i1 true)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
-declare i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i1, i1)
+declare i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) nocapture, i32, i32, i32, i1, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
index 6d5cc7a..a9c2c27 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
@@ -5,9 +5,9 @@
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
; GCN-DAG: s_mov_b32 m0,
; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
-define amdgpu_kernel void @ds_ordered_add(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_ordered_add(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -16,8 +16,8 @@
; GCN: s_mov_b32 m0, s0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_cs float @ds_ordered_add_cs(i32 addrspace(2)* inreg %gds) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
+define amdgpu_cs float @ds_ordered_add_cs(ptr addrspace(2) inreg %gds) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -27,8 +27,8 @@
; GCN: s_mov_b32 m0, s0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_ps float @ds_ordered_add_ps(i32 addrspace(2)* inreg %gds) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
+define amdgpu_ps float @ds_ordered_add_ps(ptr addrspace(2) inreg %gds) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -38,8 +38,8 @@
; GCN: s_mov_b32 m0, s0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_vs float @ds_ordered_add_vs(i32 addrspace(2)* inreg %gds) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
+define amdgpu_vs float @ds_ordered_add_vs(ptr addrspace(2) inreg %gds) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -49,10 +49,10 @@
; GCN: s_mov_b32 m0, s0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_gs float @ds_ordered_add_gs(i32 addrspace(2)* inreg %gds) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
+define amdgpu_gs float @ds_ordered_add_gs(ptr addrspace(2) inreg %gds) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
-declare i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i1, i1)
+declare i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) nocapture, i32, i32, i32, i1, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
index 76bd227..aa83d8e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
@@ -11,9 +11,9 @@
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
; GCN-DAG: s_mov_b32 m0,
; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
-define amdgpu_kernel void @ds_ordered_add(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_ordered_add(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -23,9 +23,9 @@
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
; GCN-DAG: s_mov_b32 m0,
; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:776 gds
-define amdgpu_kernel void @ds_ordered_add_counter2(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 2, i1 true, i1 true)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_ordered_add_counter2(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 2, i1 true, i1 true)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -33,9 +33,9 @@
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
; GCN-DAG: s_mov_b32 m0,
; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:260 gds
-define amdgpu_kernel void @ds_ordered_add_nodone(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_ordered_add_nodone(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -43,9 +43,9 @@
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
; GCN-DAG: s_mov_b32 m0,
; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:4 gds
-define amdgpu_kernel void @ds_ordered_add_norelease(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
- %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 false, i1 false)
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @ds_ordered_add_norelease(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+ %val = call i32@llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 false, i1 false)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -55,8 +55,8 @@
; VIGFX9-NEXT: s_nop 0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_cs float @ds_ordered_add_cs(i32 addrspace(2)* inreg %gds) {
- %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+define amdgpu_cs float @ds_ordered_add_cs(ptr addrspace(2) inreg %gds) {
+ %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -68,7 +68,7 @@
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
define float @ds_ordered_add_default_cc() {
- %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+ %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -80,7 +80,7 @@
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
define fastcc float @ds_ordered_add_fastcc() {
- %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+ %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -92,7 +92,7 @@
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
define float @ds_ordered_add_func() {
- %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+ %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -103,8 +103,8 @@
; VIGFX9-NEXT: s_nop 0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:1796 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_ps float @ds_ordered_add_ps(i32 addrspace(2)* inreg %gds) {
- %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+define amdgpu_ps float @ds_ordered_add_ps(ptr addrspace(2) inreg %gds) {
+ %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -115,8 +115,8 @@
; VIGFX9-NEXT: s_nop 0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:2820 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_vs float @ds_ordered_add_vs(i32 addrspace(2)* inreg %gds) {
- %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+define amdgpu_vs float @ds_ordered_add_vs(ptr addrspace(2) inreg %gds) {
+ %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -127,10 +127,10 @@
; VIGFX9-NEXT: s_nop 0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:3844 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_gs float @ds_ordered_add_gs(i32 addrspace(2)* inreg %gds) {
- %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+define amdgpu_gs float @ds_ordered_add_gs(ptr addrspace(2) inreg %gds) {
+ %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
-declare i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i1, i1)
+declare i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) nocapture, i32, i32, i32, i1, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
index 7626691..87bc6e4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
@@ -12,8 +12,8 @@
; VIGFX9-NEXT: s_nop 0
; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v0 offset:4868 gds
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-define amdgpu_cs float @ds_ordered_swap(i32 addrspace(2)* inreg %gds, i32 %value) {
- %val = call i32 @llvm.amdgcn.ds.ordered.swap(i32 addrspace(2)* %gds, i32 %value, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+define amdgpu_cs float @ds_ordered_swap(ptr addrspace(2) inreg %gds, i32 %value) {
+ %val = call i32 @llvm.amdgcn.ds.ordered.swap(ptr addrspace(2) %gds, i32 %value, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
%r = bitcast i32 %val to float
ret float %r
}
@@ -31,13 +31,13 @@
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: s_or_b64 exec, exec, s[[SAVED]]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-define amdgpu_cs float @ds_ordered_swap_conditional(i32 addrspace(2)* inreg %gds, i32 %value) {
+define amdgpu_cs float @ds_ordered_swap_conditional(ptr addrspace(2) inreg %gds, i32 %value) {
entry:
%c = icmp ne i32 %value, 0
br i1 %c, label %if-true, label %endif
if-true:
- %val = call i32 @llvm.amdgcn.ds.ordered.swap(i32 addrspace(2)* %gds, i32 %value, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+ %val = call i32 @llvm.amdgcn.ds.ordered.swap(ptr addrspace(2) %gds, i32 %value, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
br label %endif
endif:
@@ -46,4 +46,4 @@
ret float %r
}
-declare i32 @llvm.amdgcn.ds.ordered.swap(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i1, i1)
+declare i32 @llvm.amdgcn.ds.ordered.swap(ptr addrspace(2) nocapture, i32, i32, i32, i1, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
index 63618c3..6581e25 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
@@ -4,18 +4,18 @@
; CHECK-LABEL: {{^}}ds_permute:
; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute(ptr addrspace(1) %out, i32 %index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
- store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}ds_permute_imm_offset:
; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
-define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute_imm_offset(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
- store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
index 8d99b08..7f0f2c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
@@ -20,7 +20,7 @@
ret void
}
-define amdgpu_gs void @test_sub_32_use(i32 %arg, i32 addrspace(1)* %out) {
+define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_sub_32_use:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -34,7 +34,7 @@
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
- store i32 %res, i32 addrspace(1)* %out, align 4
+ store i32 %res, ptr addrspace(1) %out, align 4
ret void
}
@@ -53,7 +53,7 @@
ret void
}
-define amdgpu_gs void @test_sub_64_use(i32 %arg, i64 addrspace(1)* %out) {
+define amdgpu_gs void @test_sub_64_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_sub_64_use:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -67,7 +67,7 @@
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%res = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
- store i64 %res, i64 addrspace(1)* %out, align 4
+ store i64 %res, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
index 26e8222..038ba91 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -5,9 +5,9 @@
; CHECK-LABEL: {{^}}ds_swizzle:
; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:swizzle(BITMASK_PERM,"00p11")
-define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) nounwind {
%swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
- store i32 %swizzle, i32 addrspace(1)* %out, align 4
+ store i32 %swizzle, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 014e690..b5172a9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -629,12 +629,10 @@
%data0 = alloca <4 x float>, align 8, addrspace(5)
%data1 = alloca <4 x float>, align 8, addrspace(5)
%cmp = icmp eq i32 %idx, 1
- %data = select i1 %cmp, <4 x float> addrspace(5)* %data0, <4 x float> addrspace(5)* %data1
- %sptr = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %data, i32 0, i32 0
- store float %v, float addrspace(5)* %sptr, align 8
+ %data = select i1 %cmp, ptr addrspace(5) %data0, ptr addrspace(5) %data1
+ store float %v, ptr addrspace(5) %data, align 8
call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 true, i1 false)
- %ptr0 = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %data0, i32 0, i32 0
- %load0 = load float, float addrspace(5)* %ptr0, align 8
+ %load0 = load float, ptr addrspace(5) %data0, align 8
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index cbfbe77..7fafbff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -12,7 +12,7 @@
declare i32 @llvm.amdgcn.fcmp.f16(half, half, i32) #0
declare half @llvm.fabs.f16(half) #0
-define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i32 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -57,11 +57,11 @@
; GISEL-GFX10-NEXT: s_endpgm
%temp = call float @llvm.fabs.f32(float %a)
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i32 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -107,11 +107,11 @@
%temp = call float @llvm.fabs.f32(float %a)
%src_input = call float @llvm.fabs.f32(float %src)
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -137,11 +137,11 @@
; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_oeq(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -193,11 +193,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_one(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -249,11 +249,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ogt(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -305,11 +305,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_oge(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -361,11 +361,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_olt(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -417,11 +417,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ole(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -473,12 +473,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ueq(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -530,11 +530,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_une(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -586,11 +586,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ugt(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -642,11 +642,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_uge(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -698,11 +698,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ult(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -754,11 +754,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ule(i32 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -810,11 +810,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_oeq(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -866,11 +866,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_one(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -922,11 +922,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ogt(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -978,11 +978,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_oge(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1034,11 +1034,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_olt(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1090,11 +1090,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ole(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1146,11 +1146,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ueq(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1202,11 +1202,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_une(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1258,11 +1258,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ugt(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1314,11 +1314,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_uge(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1370,11 +1370,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ult(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1426,11 +1426,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ule(i32 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1482,12 +1482,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(i32 addrspace(1)* %out, half %src, half %a) {
+define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half %src, half %a) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1544,12 +1544,12 @@
; GISEL-GFX10-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half %temp, i32 1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(i32 addrspace(1)* %out, half %src, half %a) {
+define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(1) %out, half %src, half %a) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1607,11 +1607,11 @@
%temp = call half @llvm.fabs.f16(half %a)
%src_input = call half @llvm.fabs.f16(half %src)
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src_input, half %temp, i32 1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -1637,12 +1637,12 @@
; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oeq(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1694,12 +1694,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_one(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1751,12 +1751,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ogt(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1808,12 +1808,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oge(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1865,12 +1865,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_olt(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1922,12 +1922,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ole(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1979,12 +1979,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ueq(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -2036,12 +2036,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_une(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -2093,12 +2093,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ugt(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -2150,12 +2150,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_uge(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -2207,12 +2207,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ult(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -2264,12 +2264,12 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ule(i32 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -2321,7 +2321,7 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index d809391..9b7fff3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -14,7 +14,7 @@
declare i64 @llvm.amdgcn.fcmp.f16(half, half, i32) #0
declare half @llvm.fabs.f16(half) #0
-define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -66,11 +66,11 @@
; SDAG-VI-NEXT: s_endpgm
%temp = call float @llvm.fabs.f32(float %a)
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) {
; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -123,11 +123,11 @@
%temp = call float @llvm.fabs.f32(float %a)
%src_input = call float @llvm.fabs.f32(float %src)
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
; SDAG-GFX-LABEL: v_fcmp_f32:
; SDAG-GFX: ; %bb.0:
; SDAG-GFX-NEXT: s_endpgm
@@ -177,11 +177,11 @@
; GISEL-GFX-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
; GISEL-GFX-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -237,11 +237,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -297,11 +297,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -357,11 +357,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -417,11 +417,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -477,11 +477,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -537,12 +537,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -598,11 +598,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -658,11 +658,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -718,11 +718,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -778,11 +778,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -838,11 +838,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -898,11 +898,11 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -973,11 +973,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1048,11 +1048,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1123,11 +1123,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1198,11 +1198,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1273,11 +1273,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1348,11 +1348,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1423,11 +1423,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1498,11 +1498,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1573,11 +1573,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1648,11 +1648,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1723,11 +1723,11 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1798,12 +1798,12 @@
; SDAG-GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(i64 addrspace(1)* %out, half %src, half %a) {
+define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half %src, half %a) {
; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1865,12 +1865,12 @@
; SDAG-VI-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half %temp, i32 1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, half %src, half %a) {
+define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(1) %out, half %src, half %a) {
; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1933,11 +1933,11 @@
%temp = call half @llvm.fabs.f16(half %a)
%src_input = call half @llvm.fabs.f16(half %src)
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src_input, half %temp, i32 1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
; SDAG-GFX-LABEL: v_fcmp_f16:
; SDAG-GFX: ; %bb.0:
; SDAG-GFX-NEXT: s_endpgm
@@ -1987,12 +1987,12 @@
; GISEL-GFX-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
; GISEL-GFX-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oeq(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2048,12 +2048,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_one(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2109,12 +2109,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ogt(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2170,12 +2170,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_oge(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2231,12 +2231,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_olt(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2292,12 +2292,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ole(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2353,12 +2353,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ueq(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2414,12 +2414,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_une(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2475,12 +2475,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ugt(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2536,12 +2536,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_uge(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2597,12 +2597,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ult(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2658,12 +2658,12 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_fcmp_f16_ule(i64 addrspace(1)* %out, half %src) {
+define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2719,7 +2719,7 @@
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
index 248ee99..d3ec46f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
@@ -8,9 +8,9 @@
; CHECK: v_rcp_f32_e32
; CHECK: v_mul_f32_e32
; CHECK: v_mul_f32_e32
-define amdgpu_kernel void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @test_fdiv_fast(ptr addrspace(1) %out, float %a, float %b) #1 {
%fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
- store float %fdiv, float addrspace(1)* %out
+ store float %fdiv, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 295684a..b5d5be5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -18,16 +18,16 @@
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- i16 addrspace(1)* %r,
- <2 x i16> addrspace(1)* %a,
- <2 x i16> addrspace(1)* %b,
- i16 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
- %c.val = load i16, i16 addrspace(1)* %c
+ %a.val = load <2 x i16>, ptr addrspace(1) %a
+ %b.val = load <2 x i16>, ptr addrspace(1) %b
+ %c.val = load i16, ptr addrspace(1) %c
%r.val = call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a.val, <2 x i16> %b.val, i16 %c.val)
- store i16 %r.val, i16 addrspace(1)* %r
+ store i16 %r.val, ptr addrspace(1) %r
ret void
}
@@ -57,19 +57,19 @@
; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
- i16 addrspace(5)* %r,
- <2 x i16> addrspace(5)* %a,
- <2 x i16> addrspace(5)* %b,
- i16 addrspace(5)* %c) {
+ ptr addrspace(5) %r,
+ ptr addrspace(5) %a,
+ ptr addrspace(5) %b,
+ ptr addrspace(5) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(5)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(5)* %b
- %c.val = load i16, i16 addrspace(5)* %c
+ %a.val = load <2 x i16>, ptr addrspace(5) %a
+ %b.val = load <2 x i16>, ptr addrspace(5) %b
+ %c.val = load i16, ptr addrspace(5) %c
%a.val.i32 = bitcast <2 x i16> %a.val to i32
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a.val.i32, i32 %a.val.i32, i32 1, i32 15, i32 15, i1 1)
%a.val.dpp.v2i16 = bitcast i32 %dpp to <2 x i16>
%r.val = call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a.val.dpp.v2i16, <2 x i16> %b.val, i16 %c.val)
- store i16 %r.val, i16 addrspace(5)* %r
+ store i16 %r.val, ptr addrspace(5) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 1be7a0f..dde7df0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -18,16 +18,16 @@
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- half addrspace(1)* %r,
- <2 x half> addrspace(1)* %a,
- <2 x half> addrspace(1)* %b,
- half addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
- %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
- %c.val = load half, half addrspace(1)* %c
+ %a.val = load <2 x half>, ptr addrspace(1) %a
+ %b.val = load <2 x half>, ptr addrspace(1) %b
+ %c.val = load half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a.val, <2 x half> %b.val, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -57,19 +57,19 @@
; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
- half addrspace(5)* %r,
- <2 x half> addrspace(5)* %a,
- <2 x half> addrspace(5)* %b,
- half addrspace(5)* %c) {
+ ptr addrspace(5) %r,
+ ptr addrspace(5) %a,
+ ptr addrspace(5) %b,
+ ptr addrspace(5) %c) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(5)* %a
- %b.val = load <2 x half>, <2 x half> addrspace(5)* %b
- %c.val = load half, half addrspace(5)* %c
+ %a.val = load <2 x half>, ptr addrspace(5) %a
+ %b.val = load <2 x half>, ptr addrspace(5) %b
+ %c.val = load half, ptr addrspace(5) %c
%a.val.i32 = bitcast <2 x half> %a.val to i32
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a.val.i32, i32 %a.val.i32, i32 1, i32 15, i32 15, i1 1)
%a.val.dpp.v2half = bitcast i32 %dpp to <2 x half>
%r.val = call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a.val.dpp.v2half, <2 x half> %b.val, half %c.val)
- store half %r.val, half addrspace(5)* %r
+ store half %r.val, ptr addrspace(5) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 516e868..8276df2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -19,16 +19,16 @@
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- float addrspace(1)* %r,
- <2 x i16> addrspace(1)* %a,
- <2 x i16> addrspace(1)* %b,
- float addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ %a.val = load <2 x i16>, ptr addrspace(1) %a
+ %b.val = load <2 x i16>, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 1)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
@@ -48,15 +48,15 @@
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- float addrspace(1)* %r,
- <2 x i16> addrspace(1)* %a,
- <2 x i16> addrspace(1)* %b,
- float addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ %a.val = load <2 x i16>, ptr addrspace(1) %a
+ %b.val = load <2 x i16>, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 0)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 2425fbb..3ced376 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -10,16 +10,16 @@
; GFX9: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp(
- float addrspace(1)* %r,
- <2 x half> addrspace(1)* %a,
- <2 x half> addrspace(1)* %b,
- float addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
- %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ %a.val = load <2 x half>, ptr addrspace(1) %a
+ %b.val = load <2 x half>, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
@@ -28,16 +28,16 @@
; GFX940: v_dot2c_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: {{v_dot2c_f32_f16_e32|v_dot2acc_f32_f16}} v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp(
- float addrspace(1)* %r,
- <2 x half> addrspace(1)* %a,
- <2 x half> addrspace(1)* %b,
- float addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
- %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ %a.val = load <2 x half>, ptr addrspace(1) %a
+ %b.val = load <2 x half>, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
index cd9c47a..226670a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll
@@ -7,54 +7,54 @@
; GCN-LABEL: {{^}}mad_f16:
; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @mad_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %b,
- half addrspace(1)* %c) {
- %a.val = load half, half addrspace(1)* %a
- %b.val = load half, half addrspace(1)* %b
- %c.val = load half, half addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
+ %c.val = load half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.fmad.ftz.f16(half %a.val, half %b.val, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
; GCN-LABEL: {{^}}mad_f16_imm_a:
; GCN: v_madmk_f16 {{v[0-9]+}}, {{v[0-9]+}}, 0x4800, {{v[0-9]+}}
define amdgpu_kernel void @mad_f16_imm_a(
- half addrspace(1)* %r,
- half addrspace(1)* %b,
- half addrspace(1)* %c) {
- %b.val = load half, half addrspace(1)* %b
- %c.val = load half, half addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %b.val = load half, ptr addrspace(1) %b
+ %c.val = load half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.fmad.ftz.f16(half 8.0, half %b.val, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
; GCN-LABEL: {{^}}mad_f16_imm_b:
; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
define amdgpu_kernel void @mad_f16_imm_b(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %c) {
- %a.val = load half, half addrspace(1)* %a
- %c.val = load half, half addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %c) {
+ %a.val = load half, ptr addrspace(1) %a
+ %c.val = load half, ptr addrspace(1) %c
%r.val = call half @llvm.amdgcn.fmad.ftz.f16(half %a.val, half 8.0, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
; GCN-LABEL: {{^}}mad_f16_imm_c:
; GCN: v_madak_f16 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4800{{$}}
define amdgpu_kernel void @mad_f16_imm_c(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %b) {
- %a.val = load half, half addrspace(1)* %a
- %b.val = load half, half addrspace(1)* %b
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b) {
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
%r.val = call half @llvm.amdgcn.fmad.ftz.f16(half %a.val, half %b.val, half 8.0)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -62,16 +62,16 @@
; GFX8: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_mad_legacy_f16 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mad_f16_neg_b(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %b,
- half addrspace(1)* %c) {
- %a.val = load half, half addrspace(1)* %a
- %b.val = load half, half addrspace(1)* %b
- %c.val = load half, half addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
+ %c.val = load half, ptr addrspace(1) %c
%neg.b = fsub half -0.0, %b.val
%r.val = call half @llvm.amdgcn.fmad.ftz.f16(half %a.val, half %neg.b, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -79,16 +79,16 @@
; GFX8: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
; GFX9: v_mad_legacy_f16 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
define amdgpu_kernel void @mad_f16_abs_b(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %b,
- half addrspace(1)* %c) {
- %a.val = load half, half addrspace(1)* %a
- %b.val = load half, half addrspace(1)* %b
- %c.val = load half, half addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
+ %c.val = load half, ptr addrspace(1) %c
%abs.b = call half @llvm.fabs.f16(half %b.val)
%r.val = call half @llvm.amdgcn.fmad.ftz.f16(half %a.val, half %abs.b, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -96,17 +96,17 @@
; GFX8: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}}
; GFX9: v_mad_legacy_f16 v{{[0-9]+}}, v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}}
define amdgpu_kernel void @mad_f16_neg_abs_b(
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- half addrspace(1)* %b,
- half addrspace(1)* %c) {
- %a.val = load half, half addrspace(1)* %a
- %b.val = load half, half addrspace(1)* %b
- %c.val = load half, half addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load half, ptr addrspace(1) %b
+ %c.val = load half, ptr addrspace(1) %c
%abs.b = call half @llvm.fabs.f16(half %b.val)
%neg.abs.b = fsub half -0.0, %abs.b
%r.val = call half @llvm.amdgcn.fmad.ftz.f16(half %a.val, half %neg.abs.b, half %c.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
index 7c4608b..53f12c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
@@ -8,28 +8,28 @@
; GCN-LABEL: {{^}}mad_f32:
; GCN: v_ma{{[dc]}}_f32
define amdgpu_kernel void @mad_f32(
- float addrspace(1)* %r,
- float addrspace(1)* %a,
- float addrspace(1)* %b,
- float addrspace(1)* %c) {
- %a.val = load float, float addrspace(1)* %a
- %b.val = load float, float addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load float, ptr addrspace(1) %a
+ %b.val = load float, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%r.val = call float @llvm.amdgcn.fmad.ftz.f32(float %a.val, float %b.val, float %c.val)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
; GCN-LABEL: {{^}}mad_f32_imm_a:
; GCN: v_madmk_f32 {{v[0-9]+}}, {{v[0-9]+}}, 0x41000000,
define amdgpu_kernel void @mad_f32_imm_a(
- float addrspace(1)* %r,
- float addrspace(1)* %b,
- float addrspace(1)* %c) {
- %b.val = load float, float addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %b.val = load float, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%r.val = call float @llvm.amdgcn.fmad.ftz.f32(float 8.0, float %b.val, float %c.val)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
@@ -37,13 +37,13 @@
; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000
; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]]
define amdgpu_kernel void @mad_f32_imm_b(
- float addrspace(1)* %r,
- float addrspace(1)* %a,
- float addrspace(1)* %c) {
- %a.val = load float, float addrspace(1)* %a
- %c.val = load float, float addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %c) {
+ %a.val = load float, ptr addrspace(1) %a
+ %c.val = load float, ptr addrspace(1) %c
%r.val = call float @llvm.amdgcn.fmad.ftz.f32(float %a.val, float 8.0, float %c.val)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
@@ -54,62 +54,62 @@
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
define amdgpu_kernel void @mad_f32_imm_c(
- float addrspace(1)* %r,
- float addrspace(1)* %a,
- float addrspace(1)* %b) {
- %a.val = load float, float addrspace(1)* %a
- %b.val = load float, float addrspace(1)* %b
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b) {
+ %a.val = load float, ptr addrspace(1) %a
+ %b.val = load float, ptr addrspace(1) %b
%r.val = call float @llvm.amdgcn.fmad.ftz.f32(float %a.val, float %b.val, float 8.0)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
; GCN-LABEL: {{^}}mad_f32_neg_b:
; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mad_f32_neg_b(
- float addrspace(1)* %r,
- float addrspace(1)* %a,
- float addrspace(1)* %b,
- float addrspace(1)* %c) {
- %a.val = load float, float addrspace(1)* %a
- %b.val = load float, float addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load float, ptr addrspace(1) %a
+ %b.val = load float, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%neg.b = fneg float %b.val
%r.val = call float @llvm.amdgcn.fmad.ftz.f32(float %a.val, float %neg.b, float %c.val)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
; GCN-LABEL: {{^}}mad_f32_abs_b:
; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
define amdgpu_kernel void @mad_f32_abs_b(
- float addrspace(1)* %r,
- float addrspace(1)* %a,
- float addrspace(1)* %b,
- float addrspace(1)* %c) {
- %a.val = load float, float addrspace(1)* %a
- %b.val = load float, float addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load float, ptr addrspace(1) %a
+ %b.val = load float, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%abs.b = call float @llvm.fabs.f32(float %b.val)
%r.val = call float @llvm.amdgcn.fmad.ftz.f32(float %a.val, float %abs.b, float %c.val)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
; GCN-LABEL: {{^}}mad_f32_neg_abs_b:
; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}}
define amdgpu_kernel void @mad_f32_neg_abs_b(
- float addrspace(1)* %r,
- float addrspace(1)* %a,
- float addrspace(1)* %b,
- float addrspace(1)* %c) {
- %a.val = load float, float addrspace(1)* %a
- %b.val = load float, float addrspace(1)* %b
- %c.val = load float, float addrspace(1)* %c
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.val = load float, ptr addrspace(1) %a
+ %b.val = load float, ptr addrspace(1) %b
+ %c.val = load float, ptr addrspace(1) %c
%abs.b = call float @llvm.fabs.f32(float %b.val)
%neg.abs.b = fneg float %abs.b
%r.val = call float @llvm.amdgcn.fmad.ftz.f32(float %a.val, float %neg.abs.b, float %c.val)
- store float %r.val, float addrspace(1)* %r
+ store float %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
index 91d1857..81e4814 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
@@ -2,7 +2,7 @@
; GCN-LABEL: {{^}}test_fmed3_f16:
; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+define amdgpu_kernel void @test_fmed3_f16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
%src0.f16 = trunc i32 %src0.arg to i16
%src0 = bitcast i16 %src0.f16 to half
%src1.f16 = trunc i32 %src1.arg to i16
@@ -10,13 +10,13 @@
%src2.f16 = trunc i32 %src2.arg to i16
%src2 = bitcast i16 %src2.f16 to half
%mad = call half @llvm.amdgcn.fmed3.f16(half %src0, half %src1, half %src2)
- store half %mad, half addrspace(1)* %out
+ store half %mad, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}test_fmed3_srcmods_f16:
; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
-define amdgpu_kernel void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+define amdgpu_kernel void @test_fmed3_srcmods_f16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
%src0.f16 = trunc i32 %src0.arg to i16
%src0 = bitcast i16 %src0.f16 to half
%src1.f16 = trunc i32 %src1.arg to i16
@@ -28,7 +28,7 @@
%src2.fabs = call half @llvm.fabs.f16(half %src2)
%src2.fneg.fabs = fsub half -0.0, %src2.fabs
%mad = call half @llvm.amdgcn.fmed3.f16(half %src0.fneg, half %src1.fabs, half %src2.fneg.fabs)
- store half %mad, half addrspace(1)* %out
+ store half %mad, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
index 05b074b..015017c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
@@ -3,62 +3,62 @@
; GCN-LABEL: {{^}}test_fmed3:
; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+define amdgpu_kernel void @test_fmed3(ptr addrspace(1) %out, float %src0, float %src1, float %src2) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
- store float %med3, float addrspace(1)* %out
+ store float %med3, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}test_fmed3_srcmods:
; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
-define amdgpu_kernel void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+define amdgpu_kernel void @test_fmed3_srcmods(ptr addrspace(1) %out, float %src0, float %src1, float %src2) #1 {
%src0.fneg = fsub float -0.0, %src0
%src1.fabs = call float @llvm.fabs.f32(float %src1)
%src2.fabs = call float @llvm.fabs.f32(float %src2)
%src2.fneg.fabs = fsub float -0.0, %src2.fabs
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0.fneg, float %src1.fabs, float %src2.fneg.fabs)
- store float %med3, float addrspace(1)* %out
+ store float %med3, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}test_fneg_fmed3:
; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
-define amdgpu_kernel void @test_fneg_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+define amdgpu_kernel void @test_fneg_fmed3(ptr addrspace(1) %out, float %src0, float %src1, float %src2) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
%neg.med3 = fsub float -0.0, %med3
- store float %neg.med3, float addrspace(1)* %out
+ store float %neg.med3, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}test_fneg_fmed3_multi_use:
; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, -4.0, [[MED3]]
-define amdgpu_kernel void @test_fneg_fmed3_multi_use(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+define amdgpu_kernel void @test_fneg_fmed3_multi_use(ptr addrspace(1) %out, float %src0, float %src1, float %src2) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
%neg.med3 = fsub float -0.0, %med3
%med3.user = fmul float %med3, 4.0
- store volatile float %med3.user, float addrspace(1)* %out
- store volatile float %neg.med3, float addrspace(1)* %out
+ store volatile float %med3.user, ptr addrspace(1) %out
+ store volatile float %neg.med3, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}test_fabs_fmed3:
; GCN: v_med3_f32 [[MED3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff, [[MED3]]
-define amdgpu_kernel void @test_fabs_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+define amdgpu_kernel void @test_fabs_fmed3(ptr addrspace(1) %out, float %src0, float %src1, float %src2) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
%fabs.med3 = call float @llvm.fabs.f32(float %med3)
- store float %fabs.med3, float addrspace(1)* %out
+ store float %fabs.med3, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}test_fneg_fmed3_rr_0:
; GCN: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1
; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]]
-define amdgpu_kernel void @test_fneg_fmed3_rr_0(float addrspace(1)* %out, float %src0, float %src1) #1 {
+define amdgpu_kernel void @test_fneg_fmed3_rr_0(ptr addrspace(1) %out, float %src0, float %src1) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float 0.0)
%neg.med3 = fsub float -0.0, %med3
- store float %neg.med3, float addrspace(1)* %out
+ store float %neg.med3, ptr addrspace(1) %out
ret void
}
@@ -67,11 +67,11 @@
; GCN: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1
; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]]
; GCN: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[MED3]]
-define amdgpu_kernel void @test_fneg_fmed3_rr_0_foldable_user(float addrspace(1)* %out, float %src0, float %src1, float %mul.arg) #1 {
+define amdgpu_kernel void @test_fneg_fmed3_rr_0_foldable_user(ptr addrspace(1) %out, float %src0, float %src1, float %mul.arg) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float 0.0)
%neg.med3 = fsub float -0.0, %med3
%mul = fmul float %neg.med3, %mul.arg
- store float %mul, float addrspace(1)* %out
+ store float %mul, ptr addrspace(1) %out
ret void
}
@@ -79,10 +79,10 @@
; GCN-DAG: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1
; GCN-DAG: v_mov_b32_e32 [[NEG_INV:v[0-9]+]], 0xbe22f983
; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, [[NEG_INV]], [[NEG0]]
-define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0(float addrspace(1)* %out, float %src0) #1 {
+define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0(ptr addrspace(1) %out, float %src0) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float 0x3FC45F3060000000, float 0.0)
%neg.med3 = fsub float -0.0, %med3
- store float %neg.med3, float addrspace(1)* %out
+ store float %neg.med3, ptr addrspace(1) %out
ret void
}
@@ -91,11 +91,11 @@
; GCN-DAG: v_mov_b32_e32 [[NEG_INV:v[0-9]+]], 0xbe22f983
; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, [[NEG_INV]], [[NEG0]]
; GCN: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[MED3]]
-define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0_foldable_user(float addrspace(1)* %out, float %src0, float %mul.arg) #1 {
+define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0_foldable_user(ptr addrspace(1) %out, float %src0, float %mul.arg) #1 {
%med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float 0x3FC45F3060000000, float 0.0)
%neg.med3 = fsub float -0.0, %med3
%mul = fmul float %neg.med3, %mul.arg
- store float %mul, float addrspace(1)* %out
+ store float %mul, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
index 3763702..e824f89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
@@ -9,38 +9,38 @@
; GCN-LABEL: {{^}}test_mul_legacy_f32:
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32:
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef0_f32(ptr addrspace(1) %out, float %a) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a)
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32:
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef1_f32(ptr addrspace(1) %out, float %a) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef)
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32:
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, |s{{[0-9]+}}|, |{{[sv][0-9]+}}|
; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, |s{{[0-9]+}}|
-define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_fabs_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a)
%b.fabs = call float @llvm.fabs.f32(float %b)
%result = call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs)
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -50,10 +50,10 @@
; GCN: v_add_f32_e{{(32|64)}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX11: v_dual_mov_b32 v{{[0-9]+}}, 0 :: v_dual_add_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_add_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
+define amdgpu_kernel void @test_add_mul_legacy_f32(ptr addrspace(1) %out, float %a, float %b, float %c) #0 {
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
%add = fadd float %mul, %c
- store float %add, float addrspace(1)* %out, align 4
+ store float %add, ptr addrspace(1) %out, align 4
ret void
}
@@ -66,10 +66,10 @@
; GFX103: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX11: v_dual_mov_b32 v{{[0-9]+}}, 0 :: v_dual_add_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #2 {
+define amdgpu_kernel void @test_mad_legacy_f32(ptr addrspace(1) %out, float %a, float %b, float %c) #2 {
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
%add = fadd float %mul, %c
- store float %add, float addrspace(1)* %out, align 4
+ store float %add, ptr addrspace(1) %out, align 4
ret void
}
@@ -80,10 +80,10 @@
; GFX101: v_mad_legacy_f32 v{{[0-9]+}}, 0x41200000, s{{[0-9]+}}
; GFX103: v_mul_legacy_f32_e64 v{{[0-9]+}}, 0x41200000, s{{[0-9]+}}
; GFX103: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_mad_legacy_f32_imm(float addrspace(1)* %out, float %a, float %c) #2 {
+define amdgpu_kernel void @test_mad_legacy_f32_imm(ptr addrspace(1) %out, float %a, float %c) #2 {
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float 10.0)
%add = fadd float %mul, %c
- store float %add, float addrspace(1)* %out, align 4
+ store float %add, ptr addrspace(1) %out, align 4
ret void
}
@@ -93,12 +93,12 @@
; NOMADMACF32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
; GFX11: v_dual_mov_b32 v{{[0-9]+}}, 0 :: v_dual_add_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_mad_legacy_fneg_f32(float addrspace(1)* %out, float %a, float %b, float %c) #2 {
+define amdgpu_kernel void @test_mad_legacy_fneg_f32(ptr addrspace(1) %out, float %a, float %b, float %c) #2 {
%a.fneg = fneg float %a
%b.fneg = fneg float %b
%mul = call float @llvm.amdgcn.fmul.legacy(float %a.fneg, float %b.fneg)
%add = fadd float %mul, %c
- store float %add, float addrspace(1)* %out, align 4
+ store float %add, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
index 026f690..61bcf4c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
@@ -8,11 +8,11 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fract_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.fract.f16(half %a.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
index 94dc2f4..092010b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -6,26 +6,26 @@
; GCN-LABEL: {{^}}v_fract_f32:
; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_fract_f32(ptr addrspace(1) %out, float %src) #1 {
%fract = call float @llvm.amdgcn.fract.f32(float %src)
- store float %fract, float addrspace(1)* %out
+ store float %fract, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_fract_f64:
; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @v_fract_f64(ptr addrspace(1) %out, double %src) #1 {
%fract = call double @llvm.amdgcn.fract.f64(double %src)
- store double %fract, double addrspace(1)* %out
+ store double %fract, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_fract_undef_f32:
; GCN-NOT: v_fract_f32
; GCN-NOT: store_dword
-define amdgpu_kernel void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_fract_undef_f32(ptr addrspace(1) %out) #1 {
%fract = call float @llvm.amdgcn.fract.f32(float undef)
- store float %fract, float addrspace(1)* %out
+ store float %fract, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
index ee07678..bef7314 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
@@ -7,12 +7,12 @@
; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_I16]]
define amdgpu_kernel void @frexp_exp_f16(
- i16 addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a.val)
- store i16 %r.val, i16 addrspace(1)* %r
+ store i16 %r.val, ptr addrspace(1) %r
ret void
}
@@ -22,13 +22,13 @@
; VI: v_bfe_i32 v[[R_I32:[0-9]+]], v[[R_I16]], 0, 16{{$}}
; GCN: buffer_store_dword v[[R_I32]]
define amdgpu_kernel void @frexp_exp_f16_sext(
- i32 addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a.val)
%r.val.sext = sext i16 %r.val to i32
- store i32 %r.val.sext, i32 addrspace(1)* %r
+ store i32 %r.val.sext, ptr addrspace(1) %r
ret void
}
@@ -37,12 +37,12 @@
; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_dword v[[R_I16]]
define amdgpu_kernel void @frexp_exp_f16_zext(
- i32 addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a.val)
%r.val.zext = zext i16 %r.val to i32
- store i32 %r.val.zext, i32 addrspace(1)* %r
+ store i32 %r.val.zext, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
index 0d68614..d821def 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -8,55 +8,55 @@
; GCN-LABEL: {{^}}s_test_frexp_exp_f32:
; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f32(ptr addrspace(1) %out, float %src) #1 {
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src)
- store i32 %frexp.exp, i32 addrspace(1)* %out
+ store i32 %frexp.exp, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32:
; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(ptr addrspace(1) %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fabs.src)
- store i32 %frexp.exp, i32 addrspace(1)* %out
+ store i32 %frexp.exp, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32:
; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(ptr addrspace(1) %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%fneg.fabs.src = fsub float -0.0, %fabs.src
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fneg.fabs.src)
- store i32 %frexp.exp, i32 addrspace(1)* %out
+ store i32 %frexp.exp, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_frexp_exp_f64:
; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f64(ptr addrspace(1) %out, double %src) #1 {
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %src)
- store i32 %frexp.exp, i32 addrspace(1)* %out
+ store i32 %frexp.exp, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64:
; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(ptr addrspace(1) %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fabs.src)
- store i32 %frexp.exp, i32 addrspace(1)* %out
+ store i32 %frexp.exp, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64:
; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(ptr addrspace(1) %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%fneg.fabs.src = fsub double -0.0, %fabs.src
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fneg.fabs.src)
- store i32 %frexp.exp, i32 addrspace(1)* %out
+ store i32 %frexp.exp, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
index 722cd44..ba5f20a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
@@ -8,11 +8,11 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @frexp_mant_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.frexp.mant.f16(half %a.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
index 605dc3d..0bc50b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
@@ -8,55 +8,55 @@
; GCN-LABEL: {{^}}s_test_frexp_mant_f32:
; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f32(ptr addrspace(1) %out, float %src) #1 {
%frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src)
- store float %frexp.mant, float addrspace(1)* %out
+ store float %frexp.mant, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32:
; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(ptr addrspace(1) %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src)
- store float %frexp.mant, float addrspace(1)* %out
+ store float %frexp.mant, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32:
; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(ptr addrspace(1) %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%fneg.fabs.src = fsub float -0.0, %fabs.src
%frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src)
- store float %frexp.mant, float addrspace(1)* %out
+ store float %frexp.mant, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_frexp_mant_f64:
; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f64(ptr addrspace(1) %out, double %src) #1 {
%frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src)
- store double %frexp.mant, double addrspace(1)* %out
+ store double %frexp.mant, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64:
; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(ptr addrspace(1) %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src)
- store double %frexp.mant, double addrspace(1)* %out
+ store double %frexp.mant, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64:
; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(ptr addrspace(1) %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%fneg.fabs.src = fsub double -0.0, %fabs.src
%frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src)
- store double %frexp.mant, double addrspace(1)* %out
+ store double %frexp.mant, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
index c0bb6f6..d7f122d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
@@ -5,9 +5,9 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX900-GISEL
-declare void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
-define amdgpu_ps void @global_load_lds_dword_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+define amdgpu_ps void @global_load_lds_dword_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX900-LABEL: global_load_lds_dword_vaddr:
; GFX900: ; %bb.0: ; %main_body
; GFX900-NEXT: v_readfirstlane_b32 s0, v2
@@ -46,11 +46,11 @@
; GFX900-GISEL-NEXT: global_load_dword v[0:1], off offset:16 glc lds
; GFX900-GISEL-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 16, i32 1)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 16, i32 1)
ret void
}
-define amdgpu_ps void @global_load_lds_dword_saddr(i8 addrspace(1)* nocapture inreg %gptr, i8 addrspace(3)* nocapture %lptr) {
+define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX900-LABEL: global_load_lds_dword_saddr:
; GFX900: ; %bb.0: ; %main_body
; GFX900-NEXT: v_readfirstlane_b32 s2, v0
@@ -94,11 +94,11 @@
; GFX900-GISEL-NEXT: global_load_dword v0, s[0:1] offset:32 slc lds
; GFX900-GISEL-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 32, i32 2)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 32, i32 2)
ret void
}
-define amdgpu_ps void @global_load_lds_dword_saddr_and_vaddr(i8 addrspace(1)* nocapture inreg %gptr, i8 addrspace(3)* nocapture %lptr, i32 %voffset) {
+define amdgpu_ps void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
; GFX900-LABEL: global_load_lds_dword_saddr_and_vaddr:
; GFX900: ; %bb.0: ; %main_body
; GFX900-NEXT: v_readfirstlane_b32 s2, v0
@@ -138,12 +138,12 @@
; GFX900-GISEL-NEXT: s_endpgm
main_body:
%voffset.64 = zext i32 %voffset to i64
- %gep = getelementptr i8, i8 addrspace(1)* %gptr, i64 %voffset.64
- call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gep, i8 addrspace(3)* %lptr, i32 4, i32 48, i32 16)
+ %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 4, i32 48, i32 16)
ret void
}
-define amdgpu_ps void @global_load_lds_ushort_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+define amdgpu_ps void @global_load_lds_ushort_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX900-LABEL: global_load_lds_ushort_vaddr:
; GFX900: ; %bb.0: ; %main_body
; GFX900-NEXT: v_readfirstlane_b32 s0, v2
@@ -182,11 +182,11 @@
; GFX900-GISEL-NEXT: global_load_ushort v[0:1], off lds
; GFX900-GISEL-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 2, i32 0, i32 4)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 2, i32 0, i32 4)
ret void
}
-define amdgpu_ps void @global_load_lds_ubyte_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+define amdgpu_ps void @global_load_lds_ubyte_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX900-LABEL: global_load_lds_ubyte_vaddr:
; GFX900: ; %bb.0: ; %main_body
; GFX900-NEXT: v_readfirstlane_b32 s0, v2
@@ -225,6 +225,6 @@
; GFX900-GISEL-NEXT: global_load_ubyte v[0:1], off lds
; GFX900-GISEL-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 1, i32 0, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 1, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
index db4032e..63c2a11 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -14,14 +14,14 @@
; CHECK-LABEL: {{^}}groupstaticsize_test0:
; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
+define amdgpu_kernel void @groupstaticsize_test0(ptr addrspace(1) %out, ptr addrspace(1) %lds_size) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
- store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
- %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
- %val0 = load float, float addrspace(3)* %arrayidx0, align 4
- store float %val0, float addrspace(1)* %out, align 4
+ store i32 %static_lds_size, ptr addrspace(1) %lds_size, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
+ %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
+ store float %val0, ptr addrspace(1) %out, align 4
ret void
}
@@ -29,25 +29,25 @@
; CHECK-LABEL: {{^}}groupstaticsize_test1:
; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
-define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
+define amdgpu_kernel void @groupstaticsize_test1(ptr addrspace(1) %out, i32 %cond, ptr addrspace(1) %lds_size) {
entry:
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
- store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
+ store i32 %static_lds_size, ptr addrspace(1) %lds_size, align 4
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64
%tmp = icmp eq i32 %cond, 0
br i1 %tmp, label %if, label %else
if: ; preds = %entry
- %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
- %val0 = load float, float addrspace(3)* %arrayidx0, align 4
- store float %val0, float addrspace(1)* %out, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
+ %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
+ store float %val0, ptr addrspace(1) %out, align 4
br label %endif
else: ; preds = %entry
- %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
- %val1 = load float, float addrspace(3)* %arrayidx1, align 4
- store float %val1, float addrspace(1)* %out, align 4
+ %arrayidx1 = getelementptr inbounds [256 x float], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
+ %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
+ store float %val1, ptr addrspace(1) %out, align 4
br label %endif
endif: ; preds = %else, %if
@@ -58,11 +58,11 @@
; CHECK-LABEL: {{^}}large_groupstaticsize:
; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
-define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
- %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
- store volatile i32 0, i32 addrspace(3)* %gep
+define amdgpu_kernel void @large_groupstaticsize(ptr addrspace(1) %size, i32 %idx) #0 {
+ %gep = getelementptr inbounds [4096 x i32], ptr addrspace(3) @large, i32 0, i32 %idx
+ store volatile i32 0, ptr addrspace(3) %gep
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
- store i32 %static_lds_size, i32 addrspace(1)* %size
+ store i32 %static_lds_size, ptr addrspace(1) %size
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index 33681c3..22bdfd0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -18,7 +18,7 @@
declare i32 @llvm.amdgcn.icmp.i16(i16, i16, i32) #0
declare i32 @llvm.amdgcn.icmp.i1(i1, i1, i32) #0
-define amdgpu_kernel void @v_icmp_i32_eq(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -70,11 +70,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -100,11 +100,11 @@
; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ne(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -156,11 +156,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ugt(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -212,11 +212,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_uge(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -268,11 +268,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ult(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -324,11 +324,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ule(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -380,11 +380,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_sgt(i32 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i32_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -436,11 +436,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_sge(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -492,11 +492,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_slt(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -548,11 +548,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_sle(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -604,11 +604,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_eq(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -656,11 +656,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_ne(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -708,11 +708,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_ugt(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -760,11 +760,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_uge(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -812,11 +812,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_ult(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -864,11 +864,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_ule(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -916,11 +916,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_sgt(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -968,11 +968,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_sge(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1020,11 +1020,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_slt(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1072,11 +1072,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_sle(i32 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1124,11 +1124,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_eq(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1180,11 +1180,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -1210,11 +1210,11 @@
; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ne(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1266,11 +1266,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ugt(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1322,11 +1322,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_uge(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1378,11 +1378,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ult(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1434,11 +1434,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ule(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1490,11 +1490,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_sgt(i32 addrspace(1)* %out, i16 %src) #1 {
+define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i16_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1546,11 +1546,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_sge(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1602,11 +1602,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_slt(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1658,11 +1658,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_sle(i32 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
@@ -1714,11 +1714,11 @@
; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i1_ne0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GFX11-LABEL: v_icmp_i1_ne0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1751,11 +1751,11 @@
%c1 = icmp ugt i32 %b, 2
%src = and i1 %c0, %c1
%result = call i32 @llvm.amdgcn.icmp.i1(i1 %src, i1 false, i32 33)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: test_intr_icmp_i32_invalid_cc:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -1775,7 +1775,7 @@
; GISEL-GFX10-NEXT: global_store_dword v[0:1], v0, off
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 9999)
- store i32 %result, i32 addrspace(1)* %out
+ store i32 %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index f76973f7..1f4754c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -21,7 +21,7 @@
declare i64 @llvm.amdgcn.icmp.i16(i16, i16, i32) #0
declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32) #0
-define amdgpu_kernel void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -78,11 +78,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -121,11 +121,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -182,11 +182,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ugt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -243,11 +243,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_uge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -304,11 +304,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ult(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -365,11 +365,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_ule(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -426,11 +426,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GFX11-LABEL: v_icmp_i32_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -487,11 +487,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -548,11 +548,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -609,11 +609,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -670,11 +670,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -744,11 +744,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -818,11 +818,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -892,11 +892,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -966,11 +966,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1040,11 +1040,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1114,11 +1114,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1188,11 +1188,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1262,11 +1262,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1336,11 +1336,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -1410,11 +1410,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_eq(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1471,11 +1471,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -1514,11 +1514,11 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ne(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1575,11 +1575,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ugt(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1636,11 +1636,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_uge(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1697,11 +1697,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ult(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1758,11 +1758,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_ule(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1819,11 +1819,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_sgt(i64 addrspace(1)* %out, i16 %src) #1 {
+define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GFX11-LABEL: v_icmp_i16_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1880,11 +1880,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_sge(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -1941,11 +1941,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_slt(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2002,11 +2002,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i16_sle(i64 addrspace(1)* %out, i16 %src) {
+define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2063,11 +2063,11 @@
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_icmp_i1_ne0(i64 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GFX11-LABEL: v_icmp_i1_ne0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -2119,11 +2119,11 @@
%c1 = icmp ugt i32 %b, 2
%src = and i1 %c0, %c1
%result = call i64 @llvm.amdgcn.icmp.i1(i1 %src, i1 false, i32 33)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: test_intr_icmp_i32_invalid_cc:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_endpgm
@@ -2152,7 +2152,7 @@
; GISEL-GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 9999)
- store i64 %result, i64 addrspace(1)* %out
+ store i64 %result, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 6248bef..1d18e05 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -11,7 +11,7 @@
ret void
}
-define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* noalias %in, <32 x float> addrspace(3)* noalias %out) #0 {
+define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_iglp_opt_mfma_gemm:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -122,31 +122,31 @@
entry:
call void @llvm.amdgcn.iglp.opt(i32 0)
%idx = call i32 @llvm.amdgcn.workitem.id.x()
- %load.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %in, i32 %idx
- %load.0 = load <32 x float>, <32 x float> addrspace(3)* %load.0.addr
- %load.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.0.addr, i32 64
- %load.1 = load <32 x float>, <32 x float> addrspace(3)* %load.1.addr
- %load.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.1.addr, i32 128
- %load.2 = load <32 x float>, <32 x float> addrspace(3)* %load.2.addr
- %load.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.2.addr, i32 192
- %load.3 = load <32 x float>, <32 x float> addrspace(3)* %load.3.addr
- %load.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.3.addr, i32 256
- %load.4 = load <32 x float>, <32 x float> addrspace(3)* %load.4.addr
+ %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
+ %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
+ %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr
+ %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr
+ %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
+ %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr
+ %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
+ %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0)
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
%mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
%mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
- %store.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 %idx
- store <32 x float> %mai.0, <32 x float> addrspace(3)* %store.0.addr
- %store.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 64
- store <32 x float> %mai.1, <32 x float> addrspace(3)* %store.1.addr
- %store.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 128
- store <32 x float> %mai.2, <32 x float> addrspace(3)* %store.2.addr
- %store.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 192
- store <32 x float> %mai.3, <32 x float> addrspace(3)* %store.3.addr
- %store.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 256
- store <32 x float> %mai.4, <32 x float> addrspace(3)* %store.4.addr
+ %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+ store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr
+ %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
+ store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr
+ %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
+ store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr
+ %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
+ store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr
+ %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
+ store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index 15269a8..5865f86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -11,12 +11,12 @@
; GCN-LABEL: {{^}}load_1d_lwe:
; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
-define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -46,12 +46,12 @@
; GCN-LABEL: {{^}}load_cube_lwe:
; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
-define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -73,12 +73,12 @@
; GCN-LABEL: {{^}}load_2darray_lwe:
; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
-define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -269,13 +269,13 @@
; GCN-LABEL: image_load_mmo
; GCN: image_load v1, v[{{[0-9:]+}}], s[0:7] dmask:0x1 unorm
-define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)* %lds, <2 x i32> %c) #0 {
- store float 0.000000e+00, float addrspace(3)* %lds
+define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %lds, <2 x i32> %c) #0 {
+ store float 0.000000e+00, ptr addrspace(3) %lds
%c0 = extractelement <2 x i32> %c, i32 0
%c1 = extractelement <2 x i32> %c, i32 1
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
- %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
- store float 0.000000e+00, float addrspace(3)* %tmp2
+ %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
+ store float 0.000000e+00, ptr addrspace(3) %tmp2
ret float %tex
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index 060c115..51d41c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -41,7 +41,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
; VERDE-LABEL: load_1d_tfe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -141,11 +141,11 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
-define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
; VERDE-LABEL: load_1d_lwe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -245,7 +245,7 @@
%v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -284,7 +284,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
+define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t) {
; VERDE-LABEL: load_2d_tfe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -389,7 +389,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -428,7 +428,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
+define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %r) {
; VERDE-LABEL: load_3d_tfe_lwe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -537,7 +537,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -576,7 +576,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
; VERDE-LABEL: load_cube_lwe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -685,7 +685,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -724,7 +724,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
+define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %slice) {
; VERDE-LABEL: load_1darray_tfe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -829,7 +829,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -868,7 +868,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
; VERDE-LABEL: load_2darray_lwe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -977,7 +977,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -1016,7 +1016,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
; VERDE-LABEL: load_2dmsaa_both:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -1125,7 +1125,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -1164,7 +1164,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
; VERDE-LABEL: load_2darraymsaa_tfe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -1278,7 +1278,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -1317,7 +1317,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %mip) {
; VERDE-LABEL: load_mip_1d_lwe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -1422,7 +1422,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -1461,7 +1461,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %mip) {
; VERDE-LABEL: load_mip_2d_tfe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v5, v0
@@ -1570,7 +1570,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -1880,7 +1880,7 @@
ret float %vv
}
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
; VERDE-LABEL: load_1d_tfe_V4_dmask3:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v4, v0
@@ -1972,11 +1972,11 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
; VERDE-LABEL: load_1d_tfe_V4_dmask2:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v3, v0
@@ -2061,11 +2061,11 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
; VERDE-LABEL: load_1d_tfe_V4_dmask1:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v2, v0
@@ -2142,11 +2142,11 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
-define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
; VERDE-LABEL: load_1d_tfe_V2_dmask1:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v2, v0
@@ -2223,7 +2223,7 @@
%v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<2 x float>, i32} %v, 0
%v.err = extractvalue {<2 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <2 x float> %v.vec
}
@@ -3779,7 +3779,7 @@
ret void
}
-define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)* %lds, <2 x i32> %c) #0 {
+define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %lds, <2 x i32> %c) #0 {
; VERDE-LABEL: image_load_mmo:
; VERDE: ; %bb.0:
; VERDE-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm
@@ -3843,12 +3843,12 @@
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
- store float 0.000000e+00, float addrspace(3)* %lds
+ store float 0.000000e+00, ptr addrspace(3) %lds
%c0 = extractelement <2 x i32> %c, i32 0
%c1 = extractelement <2 x i32> %c, i32 1
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
- %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
- store float 0.000000e+00, float addrspace(3)* %tmp2
+ %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
+ store float 0.000000e+00, ptr addrspace(3) %tmp2
ret float %tex
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
index a8f395f..ffef278 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
@@ -10,12 +10,12 @@
; GCN-LABEL: {{^}}load_2dmsaa_both:
; GFX11: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ;
-define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 2, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -29,12 +29,12 @@
; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
; GFX11: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ;
-define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -72,12 +72,12 @@
; GCN-LABEL: {{^}}load_2dmsaa_tfe_d16:
; GFX11: image_msaa_load v[0:2], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ;
-define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x half>, i32} %v, 0
%v.err = extractvalue {<4 x half>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x half> %v.vec
}
@@ -91,12 +91,12 @@
; GCN-LABEL: {{^}}load_2darraymsaa_tfe_d16:
; GFX11: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ;
-define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
main_body:
%v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x half>, i32} %v, 0
%v.err = extractvalue {<4 x half>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x half> %v.vec
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll
index a90fc629..934dc96 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.x.ll
@@ -10,12 +10,12 @@
; GCN-LABEL: {{^}}load_2dmsaa_both:
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ;
-define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.x.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -29,56 +29,56 @@
; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ;
-define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.x.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2dmsaa_tfe_V4_dmask3:
; GFX10: image_msaa_load v[0:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe ;
-define amdgpu_ps <4 x float> @load_2dmsaa_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.x.2dmsaa.v4f32i32.i32(i32 7, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2dmsaa_tfe_V4_dmask2:
; GFX10: image_msaa_load v[0:2], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe ;
-define amdgpu_ps <4 x float> @load_2dmsaa_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.x.2dmsaa.v4f32i32.i32(i32 6, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2dmsaa_tfe_V4_dmask1:
; GFX10: image_msaa_load v[0:1], [v4, v3, v2], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe ;
-define amdgpu_ps <4 x float> @load_2dmsaa_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa_tfe_V4_dmask1(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.x.2dmsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2dmsaa_tfe_V2_dmask1:
; GFX10: image_msaa_load v[0:1], [v4, v3, v2], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe ;
-define amdgpu_ps <2 x float> @load_2dmsaa_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <2 x float> @load_2dmsaa_tfe_V2_dmask1(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<2 x float>,i32} @llvm.amdgcn.image.msaa.load.x.2dmsaa.v2f32i32.i32(i32 8, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<2 x float>, i32} %v, 0
%v.err = extractvalue {<2 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <2 x float> %v.vec
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index d61fb52..35e2f01 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -46,7 +46,7 @@
ret half %tex
}
-define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
+define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, ptr addrspace(1) inreg %out) {
; TONGA-LABEL: image_sample_2d_f16_tfe:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: s_mov_b64 s[14:15], exec
@@ -129,7 +129,7 @@
%tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {half, i32} %tex, 0
%tex.err = extractvalue {half, i32} %tex, 1
- store i32 %tex.err, i32 addrspace(1)* %out, align 4
+ store i32 %tex.err, ptr addrspace(1) %out, align 4
ret half %tex.vec
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
index e712a18..28d859b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll
@@ -13,12 +13,12 @@
; GFX90A-LABEL: {{^}}sample_1d_lwe:
; GFX90A-NOT: s_wqm_b64
; GFX90A: image_sample v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf lwe
-define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, ptr addrspace(1) inreg %out, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 34fa7e9..90c357b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -36,7 +36,7 @@
ret <4 x float> %v
}
-define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, ptr addrspace(1) inreg %out, float %s) {
; VERDE-LABEL: sample_1d_tfe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: s_mov_b64 s[14:15], exec
@@ -122,11 +122,11 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
-define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, ptr addrspace(1) inreg %out, float %s) {
; VERDE-LABEL: sample_1d_tfe_adjust_writemask_1:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: s_mov_b64 s[12:13], exec
@@ -499,7 +499,7 @@
ret <4 x float> %res
}
-define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, ptr addrspace(1) inreg %out, float %s) {
; VERDE-LABEL: sample_1d_lwe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: s_mov_b64 s[14:15], exec
@@ -585,7 +585,7 @@
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret <4 x float> %v.vec
}
@@ -1588,7 +1588,7 @@
ret float %v
}
-define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) {
+define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, ptr addrspace(1) inreg %out) {
; VERDE-LABEL: sample_c_d_o_2darray_V1_tfe:
; VERDE: ; %bb.0: ; %main_body
; VERDE-NEXT: v_mov_b32_e32 v9, 0
@@ -1643,7 +1643,7 @@
%v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {float, i32} %v, 0
%v.err = extractvalue {float, i32} %v, 1
- store i32 %v.err, i32 addrspace(1)* %out, align 4
+ store i32 %v.err, ptr addrspace(1) %out, align 4
ret float %v.vec
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
index a084fa0..77f57b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
@@ -1,24 +1,22 @@
; RUN: not llc -mtriple=amdgcn-amd-amdhsa < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target
-define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 {
- %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
- %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
- %value = load i32, i32 addrspace(4)* %header_ptr
- store i32 %value, i32 addrspace(1)* %out
+define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out) #1 {
+ %implicit_buffer_ptr = call ptr addrspace(4) @llvm.amdgcn.implicit.buffer.ptr()
+ %value = load i32, ptr addrspace(4) %implicit_buffer_ptr
+ store i32 %value, ptr addrspace(1) %out
ret void
}
; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target
-define void @test_func(i32 addrspace(1)* %out) #1 {
- %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
- %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
- %value = load i32, i32 addrspace(4)* %header_ptr
- store i32 %value, i32 addrspace(1)* %out
+define void @test_func(ptr addrspace(1) %out) #1 {
+ %implicit_buffer_ptr = call ptr addrspace(4) @llvm.amdgcn.implicit.buffer.ptr()
+ %value = load i32, ptr addrspace(4) %implicit_buffer_ptr
+ store i32 %value, ptr addrspace(1) %out
ret void
}
-declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
+declare ptr addrspace(4) @llvm.amdgcn.implicit.buffer.ptr() #0
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
index 6b22d92..e9d9b66 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -9,10 +9,9 @@
; GCN-NEXT: ; return
define amdgpu_ps i32 @test_ps() #1 {
%alloca = alloca i32, addrspace(5)
- store volatile i32 0, i32 addrspace(5)* %alloca
- %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
- %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
- %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
+ store volatile i32 0, ptr addrspace(5) %alloca
+ %implicit_buffer_ptr = call ptr addrspace(4) @llvm.amdgcn.implicit.buffer.ptr()
+ %value = load volatile i32, ptr addrspace(4) %implicit_buffer_ptr
ret i32 %value
}
@@ -22,14 +21,13 @@
; GCN: s_load_dword s0, s[0:1], 0x0
define amdgpu_cs i32 @test_cs() #1 {
%alloca = alloca i32, addrspace(5)
- store volatile i32 0, i32 addrspace(5)* %alloca
- %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
- %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
- %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
+ store volatile i32 0, ptr addrspace(5) %alloca
+ %implicit_buffer_ptr = call ptr addrspace(4) @llvm.amdgcn.implicit.buffer.ptr()
+ %value = load volatile i32, ptr addrspace(4) %implicit_buffer_ptr
ret i32 %value
}
-declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
+declare ptr addrspace(4) @llvm.amdgcn.implicit.buffer.ptr() #0
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index 92261d7..c085a2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -15,9 +15,8 @@
; COV5: .amdhsa_kernarg_size 256
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -37,9 +36,8 @@
; COV5: .amdhsa_kernarg_size 0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty_0implicit() #3 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -57,9 +55,8 @@
; COV5: .amdhsa_kernarg_size 48
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -77,9 +74,8 @@
; COV5: .amdhsa_kernarg_size 368
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -97,9 +93,8 @@
; COV5: .amdhsa_kernarg_size 160
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -109,9 +104,8 @@
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -121,9 +115,8 @@
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -249,12 +242,10 @@
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
- %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
- %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
- %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
+ %kernarg.segment.ptr = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load0 = load volatile i32, ptr addrspace(4) %kernarg.segment.ptr
+ %load1 = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -265,12 +256,10 @@
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
- %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
- %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
- %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
+ %kernarg.segment.ptr = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load0 = load volatile i32, ptr addrspace(4) %kernarg.segment.ptr
+ %load1 = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -291,9 +280,8 @@
; COV5: .amdhsa_kernarg_size 120
define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
- %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %load = load volatile i32, i32 addrspace(4)* %cast
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load volatile i32, ptr addrspace(4) %implicitarg.ptr
ret void
}
@@ -396,8 +384,8 @@
; COV5-NEXT: .kernarg_segment_size: 120
; COV5-LABEL: .name: kernel_implicitarg_no_struct_align_padding
-declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
-declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
+declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2
+declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2
attributes #0 = { nounwind noinline }
attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
index f7f1f96..3920356 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -99,20 +99,20 @@
%array1 = alloca [20 x i32], align 16, addrspace(5)
call void @llvm.amdgcn.init.exec(i64 -1)
- %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
- store i32 %a, i32 addrspace(5)* %ptr0, align 4
+ %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
+ store i32 %a, ptr addrspace(5) %ptr0, align 4
- %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
- store i32 %a, i32 addrspace(5)* %ptr1, align 4
+ %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
+ store i32 %a, ptr addrspace(5) %ptr1, align 4
- %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
- store i32 %b, i32 addrspace(5)* %ptr2, align 4
+ %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
+ store i32 %b, ptr addrspace(5) %ptr2, align 4
- %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
- %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+ %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
+ %v3 = load i32, ptr addrspace(5) %ptr3, align 4
- %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
- %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+ %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
+ %v4 = load i32, ptr addrspace(5) %ptr4, align 4
%v5 = add i32 %v3, %v4
%v = bitcast i32 %v5 to float
@@ -133,20 +133,20 @@
%array1 = alloca [20 x i32], align 16, addrspace(5)
call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
- %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
- store i32 %a, i32 addrspace(5)* %ptr0, align 4
+ %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
+ store i32 %a, ptr addrspace(5) %ptr0, align 4
- %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
- store i32 %a, i32 addrspace(5)* %ptr1, align 4
+ %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
+ store i32 %a, ptr addrspace(5) %ptr1, align 4
- %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
- store i32 %b, i32 addrspace(5)* %ptr2, align 4
+ %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
+ store i32 %b, ptr addrspace(5) %ptr2, align 4
- %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
- %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+ %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
+ %v3 = load i32, ptr addrspace(5) %ptr3, align 4
- %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
- %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+ %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
+ %v4 = load i32, ptr addrspace(5) %ptr4, align 4
%v5 = add i32 %v3, %v4
%v = bitcast i32 %v5 to float
@@ -178,20 +178,20 @@
endif:
call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
- %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
- store i32 %a, i32 addrspace(5)* %ptr0, align 4
+ %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
+ store i32 %a, ptr addrspace(5) %ptr0, align 4
- %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
- store i32 %a, i32 addrspace(5)* %ptr1, align 4
+ %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
+ store i32 %a, ptr addrspace(5) %ptr1, align 4
- %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
- store i32 %b, i32 addrspace(5)* %ptr2, align 4
+ %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
+ store i32 %b, ptr addrspace(5) %ptr2, align 4
- %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
- %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+ %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
+ %v3 = load i32, ptr addrspace(5) %ptr3, align 4
- %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
- %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+ %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
+ %v4 = load i32, ptr addrspace(5) %ptr4, align 4
%v5 = add i32 %v3, %v4
%v6 = add i32 %v5, %count
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index cdd6139..bd9d543 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -73,7 +73,7 @@
ret void
}
-define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg %m0) #0 {
+define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f32_many_vm:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
@@ -99,10 +99,10 @@
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
; GCN-NEXT: s_endpgm
main_body:
- %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1
- %i = load float, float addrspace(1)* %i.ptr, align 4
- %j.ptr = getelementptr float, float addrspace(1)* %ptr, i32 2
- %j = load float, float addrspace(1)* %j.ptr, align 4
+ %i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
+ %i = load float, ptr addrspace(1) %i.ptr, align 4
+ %j.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 2
+ %j = load float, ptr addrspace(1) %j.ptr, align 4
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
%p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
index 2ba23b7..fd27d7e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -10,7 +10,7 @@
; GCN-DAG: v_interp_p1_f32{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
; GCN-DAG: v_interp_p2_f32{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
; GCN-DAG: v_interp_mov_f32{{(_e32)*}} v{{[0-9]+}}, p0, attr0.x{{$}}
-define amdgpu_ps void @v_interp(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x float> %arg4) #0 {
+define amdgpu_ps void @v_interp(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x float> %arg4) #0 {
main_body:
%i = extractelement <2 x float> %arg4, i32 0
%j = extractelement <2 x float> %arg4, i32 1
@@ -54,18 +54,18 @@
%p0_10 = call float @llvm.amdgcn.interp.p1(float %i, i32 3, i32 64, i32 256)
%p0_11 = call float @llvm.amdgcn.interp.p1(float %i, i32 4, i32 64, i32 256)
- store volatile float %p0_0, float addrspace(1)* undef
- store volatile float %p0_1, float addrspace(1)* undef
- store volatile float %p0_2, float addrspace(1)* undef
- store volatile float %p0_3, float addrspace(1)* undef
- store volatile float %p0_4, float addrspace(1)* undef
- store volatile float %p0_5, float addrspace(1)* undef
- store volatile float %p0_6, float addrspace(1)* undef
- store volatile float %p0_7, float addrspace(1)* undef
- store volatile float %p0_8, float addrspace(1)* undef
- store volatile float %p0_9, float addrspace(1)* undef
- store volatile float %p0_10, float addrspace(1)* undef
- store volatile float %p0_11, float addrspace(1)* undef
+ store volatile float %p0_0, ptr addrspace(1) undef
+ store volatile float %p0_1, ptr addrspace(1) undef
+ store volatile float %p0_2, ptr addrspace(1) undef
+ store volatile float %p0_3, ptr addrspace(1) undef
+ store volatile float %p0_4, ptr addrspace(1) undef
+ store volatile float %p0_5, ptr addrspace(1) undef
+ store volatile float %p0_6, ptr addrspace(1) undef
+ store volatile float %p0_7, ptr addrspace(1) undef
+ store volatile float %p0_8, ptr addrspace(1) undef
+ store volatile float %p0_9, ptr addrspace(1) undef
+ store volatile float %p0_10, ptr addrspace(1) undef
+ store volatile float %p0_11, ptr addrspace(1) undef
ret void
}
@@ -93,15 +93,15 @@
%p2_7 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 0, i32 64, i32 256)
%p2_8 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 4, i32 64, i32 256)
- store volatile float %p2_0, float addrspace(1)* undef
- store volatile float %p2_1, float addrspace(1)* undef
- store volatile float %p2_2, float addrspace(1)* undef
- store volatile float %p2_3, float addrspace(1)* undef
- store volatile float %p2_4, float addrspace(1)* undef
- store volatile float %p2_5, float addrspace(1)* undef
- store volatile float %p2_6, float addrspace(1)* undef
- store volatile float %p2_7, float addrspace(1)* undef
- store volatile float %p2_8, float addrspace(1)* undef
+ store volatile float %p2_0, ptr addrspace(1) undef
+ store volatile float %p2_1, ptr addrspace(1) undef
+ store volatile float %p2_2, ptr addrspace(1) undef
+ store volatile float %p2_3, ptr addrspace(1) undef
+ store volatile float %p2_4, ptr addrspace(1) undef
+ store volatile float %p2_5, ptr addrspace(1) undef
+ store volatile float %p2_6, ptr addrspace(1) undef
+ store volatile float %p2_7, ptr addrspace(1) undef
+ store volatile float %p2_8, ptr addrspace(1) undef
ret void
}
@@ -140,21 +140,21 @@
%mov_11 = call float @llvm.amdgcn.interp.mov(i32 3, i32 1, i32 64, i32 256)
%mov_12 = call float @llvm.amdgcn.interp.mov(i32 10, i32 4, i32 64, i32 256)
- store volatile float %mov_0, float addrspace(1)* undef
- store volatile float %mov_1, float addrspace(1)* undef
- store volatile float %mov_2, float addrspace(1)* undef
- store volatile float %mov_3, float addrspace(1)* undef
+ store volatile float %mov_0, ptr addrspace(1) undef
+ store volatile float %mov_1, ptr addrspace(1) undef
+ store volatile float %mov_2, ptr addrspace(1) undef
+ store volatile float %mov_3, ptr addrspace(1) undef
- store volatile float %mov_4, float addrspace(1)* undef
- store volatile float %mov_5, float addrspace(1)* undef
- store volatile float %mov_6, float addrspace(1)* undef
- store volatile float %mov_7, float addrspace(1)* undef
- store volatile float %mov_8, float addrspace(1)* undef
+ store volatile float %mov_4, ptr addrspace(1) undef
+ store volatile float %mov_5, ptr addrspace(1) undef
+ store volatile float %mov_6, ptr addrspace(1) undef
+ store volatile float %mov_7, ptr addrspace(1) undef
+ store volatile float %mov_8, ptr addrspace(1) undef
- store volatile float %mov_9, float addrspace(1)* undef
- store volatile float %mov_10, float addrspace(1)* undef
- store volatile float %mov_11, float addrspace(1)* undef
- store volatile float %mov_12, float addrspace(1)* undef
+ store volatile float %mov_9, ptr addrspace(1) undef
+ store volatile float %mov_10, ptr addrspace(1) undef
+ store volatile float %mov_11, ptr addrspace(1) undef
+ store volatile float %mov_12, ptr addrspace(1) undef
ret void
}
@@ -170,12 +170,12 @@
; TODO-VI-DAG: v_interp_mov_f32_e32 v{{[0-9]+}}, p0, attr0.x{{$}}
; TODO-VI: s_mov_b32 m0, -1{{$}}
; TODO-VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-;define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
+;define amdgpu_ps void @v_interp_readnone(ptr addrspace(3) %lds) #0 {
;bb:
-; store float 0.000000e+00, float addrspace(3)* %lds
+; store float 0.000000e+00, ptr addrspace(3) %lds
; %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
-; %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
-; store float 0.000000e+00, float addrspace(3)* %tmp2
+; %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
+; store float 0.000000e+00, ptr addrspace(3) %tmp2
; call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
; ret void
;}
@@ -185,7 +185,7 @@
; GCN-LABEL: {{^}}v_interp_p1_bank16_bug:
; 16BANK-NOT: v_interp_p1_f32{{(_e32)*}} [[DST:v[0-9]+]], [[DST]]
-define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(4)* inreg %arg, [17 x <16 x i8>] addrspace(4)* inreg %arg13, [17 x <4 x i32>] addrspace(4)* inreg %arg14, [34 x <8 x i32>] addrspace(4)* inreg %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) #0 {
+define amdgpu_ps void @v_interp_p1_bank16_bug(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg13, ptr addrspace(4) inreg %arg14, ptr addrspace(4) inreg %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) #0 {
main_body:
%i.i = extractelement <2 x i32> %arg19, i32 0
%j.i = extractelement <2 x i32> %arg19, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 5cf6a1b..5c31404 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -160,7 +160,7 @@
; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
@@ -240,10 +240,10 @@
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
- %node_ptr = load i32, i32* %gep_node_ptr, align 4
- %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
- %ray_extent = load float, float* %gep_ray, align 4
+ %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid
+ %node_ptr = load i32, ptr %gep_node_ptr, align 4
+ %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
+ %ray_extent = load float, ptr %gep_ray, align 4
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
@@ -254,11 +254,11 @@
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
- store <4 x i32> %v, <4 x i32>* undef
+ store <4 x i32> %v, ptr undef
ret void
}
-define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
@@ -330,10 +330,10 @@
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
- %node_ptr = load i32, i32* %gep_node_ptr, align 4
- %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
- %ray_extent = load float, float* %gep_ray, align 4
+ %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid
+ %node_ptr = load i32, ptr %gep_node_ptr, align 4
+ %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
+ %ray_extent = load float, ptr %gep_ray, align 4
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
@@ -344,11 +344,11 @@
%ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
- store <4 x i32> %v, <4 x i32>* undef
+ store <4 x i32> %v, ptr undef
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -426,8 +426,8 @@
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
- %ray_extent = load float, float* %gep_ray, align 4
+ %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
+ %ray_extent = load float, ptr %gep_ray, align 4
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
@@ -438,11 +438,11 @@
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
- store <4 x i32> %v, <4 x i32>* undef
+ store <4 x i32> %v, ptr undef
ret void
}
-define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
@@ -512,8 +512,8 @@
; GFX11-NEXT: s_endpgm
main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
- %ray_extent = load float, float* %gep_ray, align 4
+ %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid
+ %ray_extent = load float, ptr %gep_ray, align 4
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
@@ -524,7 +524,7 @@
%ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
- store <4 x i32> %v, <4 x i32>* undef
+ store <4 x i32> %v, ptr undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index f5c137a..7760c41 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -8,13 +8,13 @@
; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16
; GCN: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
-define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
+define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds i8*, i8* addrspace(1)* %ptr.ptr, i32 %id
- %ptr = load volatile i8*, i8* addrspace(1)* %gep
- %val = call i1 @llvm.amdgcn.is.private(i8* %ptr)
+ %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id
+ %ptr = load volatile ptr, ptr addrspace(1) %gep
+ %val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
%ext = zext i1 %val to i32
- store i32 %ext, i32 addrspace(1)* undef
+ store i32 %ext, ptr addrspace(1) undef
ret void
}
@@ -31,12 +31,12 @@
; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
- %val = call i1 @llvm.amdgcn.is.private(i8* %ptr)
+define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
+ %val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
br i1 %val, label %bb0, label %bb1
bb0:
- store volatile i32 0, i32 addrspace(1)* undef
+ store volatile i32 0, ptr addrspace(1) undef
br label %bb1
bb1:
@@ -44,6 +44,6 @@
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare i1 @llvm.amdgcn.is.private(i8* nocapture) #0
+declare i1 @llvm.amdgcn.is.private(ptr nocapture) #0
attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index f98676b..69e4906 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -9,13 +9,13 @@
; GCN: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
-define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
+define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds i8*, i8* addrspace(1)* %ptr.ptr, i32 %id
- %ptr = load volatile i8*, i8* addrspace(1)* %gep
- %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
+ %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id
+ %ptr = load volatile ptr, ptr addrspace(1) %gep
+ %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
%ext = zext i1 %val to i32
- store i32 %ext, i32 addrspace(1)* undef
+ store i32 %ext, ptr addrspace(1) undef
ret void
}
@@ -32,12 +32,12 @@
; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
- %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
+define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
+ %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
bb0:
- store volatile i32 0, i32 addrspace(1)* undef
+ store volatile i32 0, ptr addrspace(1) undef
br label %bb1
bb1:
@@ -45,6 +45,6 @@
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare i1 @llvm.amdgcn.is.shared(i8* nocapture) #0
+declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #0
attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 3ae0f77..0dadc39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -10,12 +10,11 @@
; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
- %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
- %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
- %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
- %value = load i32, i32 addrspace(4)* %gep
- store i32 %value, i32 addrspace(1)* %out
+define amdgpu_kernel void @test(ptr addrspace(1) %out) #1 {
+ %kernarg.segment.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %gep = getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10
+ %value = load i32, ptr addrspace(4) %gep
+ store i32 %value, ptr addrspace(1) %out
ret void
}
@@ -26,12 +25,11 @@
; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
-define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
- %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
- %value = load i32, i32 addrspace(4)* %gep
- store i32 %value, i32 addrspace(1)* %out
+define amdgpu_kernel void @test_implicit(ptr addrspace(1) %out) #1 {
+ %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i32, ptr addrspace(4) %implicitarg.ptr, i64 10
+ %value = load i32, ptr addrspace(4) %gep
+ store i32 %value, ptr addrspace(1) %out
ret void
}
@@ -47,11 +45,10 @@
; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
; MESA: buffer_store_dword [[V_VAL]]
; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
-define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
- %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %val = load i32, i32 addrspace(4)* %arg.ptr
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @test_implicit_alignment(ptr addrspace(1) %out, <2 x i8> %in) #1 {
+ %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %val = load i32, ptr addrspace(4) %implicitarg.ptr
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -67,11 +64,10 @@
; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
; MESA: buffer_store_dword [[V_VAL]]
; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
-define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #2 {
- %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %val = load i32, i32 addrspace(4)* %arg.ptr
- store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @opencl_test_implicit_alignment(ptr addrspace(1) %out, <2 x i8> %in) #2 {
+ %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %val = load i32, ptr addrspace(4) %implicitarg.ptr
+ store i32 %val, ptr addrspace(1) %out
ret void
}
@@ -84,11 +80,10 @@
; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; HSA: s_load_dword s{{[0-9]+}}, [[NULL]], 0xa{{$}}
define amdgpu_kernel void @test_no_kernargs() #1 {
- %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
- %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
- %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
- %value = load i32, i32 addrspace(4)* %gep
- store volatile i32 %value, i32 addrspace(1)* undef
+ %kernarg.segment.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %gep = getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10
+ %value = load i32, ptr addrspace(4) %gep
+ store volatile i32 %value, ptr addrspace(1) undef
ret void
}
@@ -97,10 +92,9 @@
; OS-MESA3d: kernarg_segment_byte_size = 16
; CO-V2: kernarg_segment_alignment = 4
define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 {
- %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %val = load volatile i32, i32 addrspace(4)* %arg.ptr
- store volatile i32 %val, i32 addrspace(1)* null
+ %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %val = load volatile i32, ptr addrspace(4) %implicitarg.ptr
+ store volatile i32 %val, ptr addrspace(1) null
ret void
}
@@ -109,15 +103,14 @@
; OS-MESA3D: kernarg_segment_byte_size = 16
; CO-V2: kernarg_segment_alignment = 4
define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_round_up() #3 {
- %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
- %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
- %val = load volatile i32, i32 addrspace(4)* %arg.ptr
- store volatile i32 %val, i32 addrspace(1)* null
+ %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %val = load volatile i32, ptr addrspace(4) %implicitarg.ptr
+ store volatile i32 %val, ptr addrspace(1) null
ret void
}
-declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
-declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0
+declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index 0d17279..b9934f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -74,13 +74,13 @@
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- half addrspace(1)* %r,
- half addrspace(1)* %a,
- i32 addrspace(1)* %b) {
- %a.val = load half, half addrspace(1)* %a
- %b.val = load i32, i32 addrspace(1)* %b
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b) {
+ %a.val = load half, ptr addrspace(1) %a
+ %b.val = load i32, ptr addrspace(1) %b
%r.val = call half @llvm.amdgcn.ldexp.f16(half %a.val, i32 %b.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -139,11 +139,11 @@
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- half addrspace(1)* %r,
- i32 addrspace(1)* %b) {
- %b.val = load i32, i32 addrspace(1)* %b
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %b) {
+ %b.val = load i32, ptr addrspace(1) %b
%r.val = call half @llvm.amdgcn.ldexp.f16(half 2.0, i32 %b.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -202,10 +202,10 @@
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
- %a.val = load half, half addrspace(1)* %a
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.ldexp.f16(half %a.val, i32 2)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
index 1ab4e8b..d428206 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
@@ -7,25 +7,25 @@
; SI-LABEL: {{^}}test_ldexp_f32:
; SI: v_ldexp_f32
; SI: s_endpgm
-define amdgpu_kernel void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f32(ptr addrspace(1) %out, float %a, i32 %b) nounwind {
%result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_ldexp_f64:
; SI: v_ldexp_f64
; SI: s_endpgm
-define amdgpu_kernel void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f64(ptr addrspace(1) %out, double %a, i32 %b) nounwind {
%result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone
- store double %result, double addrspace(1)* %out, align 8
+ store double %result, ptr addrspace(1) %out, align 8
ret void
}
; SI-LABEL: {{^}}test_ldexp_undef_f32:
; SI-NOT: v_ldexp_f32
-define amdgpu_kernel void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_undef_f32(ptr addrspace(1) %out, i32 %b) nounwind {
%result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
+ store float %result, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 4e8cc7b..61818da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -4,7 +4,7 @@
declare i32 @llvm.amdgcn.lds.kernel.id()
declare i32 @llvm.amdgcn.workgroup.id.x()
-define void @function_lds_id(i32 addrspace(1)* %out) {
+define void @function_lds_id(ptr addrspace(1) %out) {
; GCN-LABEL: function_lds_id:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16,11 +16,11 @@
%tmp0 = call i32 @llvm.amdgcn.lds.kernel.id()
%help = call i32 @llvm.amdgcn.workgroup.id.x()
%both = add i32 %tmp0, %help
- store i32 %both, i32 addrspace(1)* %out
+ store i32 %both, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @kernel_lds_id(i32 addrspace(1)* %out) !llvm.amdgcn.lds.kernel.id !0 {
+define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: kernel_lds_id:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -34,11 +34,11 @@
%tmp0 = call i32 @llvm.amdgcn.lds.kernel.id()
%help = call i32 @llvm.amdgcn.workgroup.id.x()
%both = add i32 %tmp0, %help
- store i32 %both, i32 addrspace(1)* %out
+ store i32 %both, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @indirect_lds_id(i32 addrspace(1)* %out) !llvm.amdgcn.lds.kernel.id !1 {
+define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !1 {
; GCN-LABEL: indirect_lds_id:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s32, 0
@@ -59,11 +59,11 @@
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_endpgm
- call void @function_lds_id(i32 addrspace(1) * %out)
+ call void @function_lds_id(ptr addrspace(1) %out)
ret void
}
-define amdgpu_kernel void @doesnt_use_it(i32 addrspace(1)* %out) !llvm.amdgcn.lds.kernel.id !0 {
+define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: doesnt_use_it:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -73,7 +73,7 @@
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
- store i32 100, i32 addrspace(1)* %out
+ store i32 100, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
index 294d11a..92bdbe5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
@@ -5,9 +5,9 @@
; GCN-LABEL: {{^}}v_lerp:
; GCN: v_lerp_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @v_lerp(ptr addrspace(1) %out, i32 %src) nounwind {
%result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
index a96127d..7ed8b81 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
@@ -7,9 +7,9 @@
; GCN-LABEL: {{^}}v_log_clamp_f32:
; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_log_clamp_f32(ptr addrspace(1) %out, float %src) #1 {
%log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0
- store float %log.clamp, float addrspace(1)* %out
+ store float %log.clamp, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index 88d6bea..45c3a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -5,7 +5,7 @@
; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0
; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]]
-define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3) {
+define amdgpu_ps void @mbcnt_intrinsics(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3) {
main_body:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 95adef6..c9cee8b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -52,13 +52,13 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+ %in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
%b = bitcast i32 2 to <2 x i16>
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+ store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -73,13 +73,13 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
%b = bitcast i32 2 to <2 x i16>
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -94,13 +94,13 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
%b = bitcast i32 2 to <2 x i16>
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -115,13 +115,13 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
%b = bitcast i32 2 to <2 x i16>
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -136,13 +136,13 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
%b = bitcast i32 2 to <2 x i16>
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 72a4327..e28fba2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -20,13 +20,13 @@
; GFX940: v_mfma_f32_32x32x4_2b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+ %in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
%b = bitcast i64 2 to <4 x i16>
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+ store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -39,13 +39,13 @@
; GFX940: v_mfma_f32_16x16x4_4b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
%b = bitcast i64 2 to <4 x i16>
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -58,13 +58,13 @@
; GFX940: v_mfma_f32_4x4x4_16b_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
%b = bitcast i64 2 to <4 x i16>
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -77,13 +77,13 @@
; GFX940: v_mfma_f32_32x32x8_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
%b = bitcast i64 2 to <4 x i16>
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -96,13 +96,13 @@
; GFX940: v_mfma_f32_16x16x16_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
%b = bitcast i64 2 to <4 x i16>
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -112,11 +112,11 @@
; GFX940: v_mfma_f64_4x4x4_4b_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
; GFX940: v_mfma_f64_4x4x4_4b_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
; GCN: global_store_dwordx2
-define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) #0 {
+define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
%mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
%mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
- store double %mai.2, double addrspace(1)* %arg
+ store double %mai.2, ptr addrspace(1) %arg
ret void
}
@@ -126,11 +126,11 @@
; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 neg:[1,1,0]
; GCN: global_store_dwordx4
; GCN: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
- %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg
+ %in.1 = load <4 x double>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
- store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+ store <4 x double> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -141,11 +141,11 @@
; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
; GCN: global_store_dwordx4
; GCN: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
- store <4 x double> %mai.2, <4 x double> addrspace(1)* %arg
+ store <4 x double> %mai.2, ptr addrspace(1) %arg
ret void
}
@@ -154,10 +154,10 @@
; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GCN: global_store_dwordx4
; GCN: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
- store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+ store <4 x double> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -168,10 +168,10 @@
; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GCN: global_store_dwordx4
; GCN: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
- store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+ store <4 x double> %mai.1, ptr addrspace(1) %arg
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
index bec1fb3..d5b6d44 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll
@@ -40,11 +40,11 @@
; GISEL: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_i32_16x16x32i8(<4 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
- store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -58,11 +58,11 @@
; GISEL: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_i32_32x32x16i8(<16 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+ %in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 1, i32 2, i32 3)
- store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+ store <16 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -76,11 +76,11 @@
; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -94,11 +94,11 @@
; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -112,11 +112,11 @@
; GISEL: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -130,11 +130,11 @@
; GISEL: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -148,11 +148,11 @@
; GISEL: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -166,11 +166,11 @@
; GISEL: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -184,11 +184,11 @@
; GISEL: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -202,11 +202,11 @@
; GISEL: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -220,11 +220,11 @@
; GISEL: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -238,11 +238,11 @@
; GISEL: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT: v_accvgpr_read_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -254,11 +254,11 @@
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
; GCN: v_smfmac_f32_16x16x32_f16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
-define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(<4 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -273,11 +273,11 @@
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
-define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(<16 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -289,11 +289,11 @@
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
; GCN: v_smfmac_f32_16x16x32_bf16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
-define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(<4 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -308,11 +308,11 @@
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
-define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(<16 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -324,11 +324,11 @@
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
; GCN: v_smfmac_i32_16x16x64_i8 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(<4 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2)
- store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -343,11 +343,11 @@
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(<16 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+ %in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2)
- store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+ store <16 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -359,11 +359,11 @@
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
; GCN: v_smfmac_f32_16x16x64_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -375,11 +375,11 @@
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
; GCN: v_smfmac_f32_16x16x64_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -391,11 +391,11 @@
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
; GCN: v_smfmac_f32_16x16x64_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -407,11 +407,11 @@
; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
; GCN: v_smfmac_f32_16x16x64_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -426,11 +426,11 @@
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -445,11 +445,11 @@
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -464,11 +464,11 @@
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -483,11 +483,11 @@
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index ee041ca..b5beaa5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -16,11 +16,11 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+ %in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
- store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+ store <16 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -35,11 +35,11 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]]
-define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
- store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 5dabf3d..d3294c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -68,11 +68,11 @@
; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+ %in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+ store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -88,11 +88,11 @@
; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -108,11 +108,11 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]]
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -128,11 +128,11 @@
; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -148,11 +148,11 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -167,14 +167,14 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
bb:
- %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
- %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
- %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
- %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
+ %in.1 = load <32 x float>, ptr addrspace(1) %arg
+ %c.1 = load <4 x half>, ptr addrspace(1) %c
+ %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
+ %c.2 = load <4 x half>, ptr addrspace(1) %c2p
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+ store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -188,14 +188,14 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
- %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
- %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
- %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
+ %c.1 = load <4 x half>, ptr addrspace(1) %c
+ %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
+ %c.2 = load <4 x half>, ptr addrspace(1) %c2p
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -210,14 +210,14 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
- %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
- %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
- %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
+ %c.1 = load <4 x half>, ptr addrspace(1) %c
+ %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
+ %c.2 = load <4 x half>, ptr addrspace(1) %c2p
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -233,14 +233,14 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
- %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
- %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
- %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
+ %c.1 = load <4 x half>, ptr addrspace(1) %c
+ %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
+ %c.2 = load <4 x half>, ptr addrspace(1) %c2p
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -255,14 +255,14 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
- %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
- %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
- %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
+ %c.1 = load <4 x half>, ptr addrspace(1) %c
+ %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
+ %c.2 = load <4 x half>, ptr addrspace(1) %c2p
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -310,11 +310,11 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+ %in.1 = load <32 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
- store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+ store <32 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -330,11 +330,11 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+ %in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
- store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+ store <16 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -350,11 +350,11 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
- store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -363,12 +363,12 @@
; GFX908_A-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
; GFX940: v_mfma_f32_32x32x1_2b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
; GFX940-NEXT: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+ %in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
- store <32 x float> %mai.2, <32 x float> addrspace(1)* %arg
+ store <32 x float> %mai.2, ptr addrspace(1) %arg
ret void
}
@@ -377,12 +377,12 @@
; GFX908_A-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
; GFX940: v_mfma_f32_16x16x1_4b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
; GFX940-NEXT: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
-define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+ %in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
- store <16 x float> %mai.2, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.2, ptr addrspace(1) %arg
ret void
}
@@ -392,12 +392,12 @@
; GFX940: v_mfma_f32_4x4x1_16b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
bb:
- %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+ %in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
- store <4 x float> %mai.2, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.2, ptr addrspace(1) %arg
ret void
}
@@ -416,10 +416,10 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -435,10 +435,10 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -454,10 +454,10 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %arg) #0 {
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -473,10 +473,10 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+ store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -493,10 +493,10 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 {
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -511,10 +511,10 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #0 {
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
- store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -587,10 +587,10 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #0 {
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+ store <32 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -610,13 +610,13 @@
; GFX908: global_store_dwordx4
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]]
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %arg, i64 %idx) #0 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+ %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
- ;store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep
+ ;store <4 x float> %mai.1, ptr addrspace(1) %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %gep
ret void
}
@@ -634,13 +634,13 @@
; GFX908-COUNT-4: v_accvgpr_read_b32
; GFX908: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GFX90A_40: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspace(1) %arg) #0 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+ %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
- store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, ptr addrspace(1) %arg
ret void
}
@@ -659,13 +659,13 @@
; GFX908-COUNT-8: global_store_dwordx4
; GFX90A_40-NOT: v_accvgpr_read_b32
; GFX90A_40-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg) #0 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
- %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+ %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+ %in.1 = load <32 x float>, ptr addrspace(1) %gep
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
+ store <32 x float> %mai.1, ptr addrspace(1) %gep
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
index 481161e..acb6398 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -11,9 +11,9 @@
; PREGFX10: s_nop 1
; VI-OPT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
; VI-NOOPT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]
-define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
- store i32 %tmp0, i32 addrspace(1)* %out
+ store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -26,10 +26,10 @@
; PREGFX10: s_nop 1
; VI-OPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
; VI-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
-define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_wait_states(ptr addrspace(1) %out, i32 %in) {
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
%tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
- store i32 %tmp1, i32 addrspace(1)* %out
+ store i32 %tmp1, ptr addrspace(1) %out
ret void
}
@@ -43,17 +43,17 @@
; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
; PREGFX10: s_nop 1
; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
-define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+define amdgpu_kernel void @dpp_first_in_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cond, float %a, float %b) {
%cmp = fcmp oeq float %cond, 0.0
br i1 %cmp, label %if, label %else
if:
- %out_val = load float, float addrspace(1)* %out
+ %out_val = load float, ptr addrspace(1) %out
%if_val = fadd float %a, %out_val
br label %endif
else:
- %in_val = load float, float addrspace(1)* %in
+ %in_val = load float, ptr addrspace(1) %in
%else_val = fadd float %b, %in_val
br label %endif
@@ -64,16 +64,16 @@
%tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
%tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
%tmp_float = bitcast i32 %tmp2 to float
- store float %tmp_float, float addrspace(1)* %out
+ store float %tmp_float, ptr addrspace(1) %out
ret void
}
; VI-LABEL: {{^}}mov_dpp64_test:
; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
+define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 0) #0
- store i64 %tmp0, i64 addrspace(1)* %out
+ store i64 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -85,9 +85,9 @@
; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; VI-NOOPT-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-define amdgpu_kernel void @mov_dpp64_imm_test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @mov_dpp64_imm_test(ptr addrspace(1) %out) {
%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
- store i64 %tmp0, i64 addrspace(1)* %out
+ store i64 %tmp0, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index b49d188..fc94f95 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -6,9 +6,9 @@
; GFX10PLUS-LABEL: {{^}}dpp8_test:
; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX10PLUS: v_mov_b32_dpp [[SRC]], [[SRC]] dpp8:[1,0,0,0,0,0,0,0]{{$}}
-define amdgpu_kernel void @dpp8_test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp8_test(ptr addrspace(1) %out, i32 %in) {
%tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0
- store i32 %tmp0, i32 addrspace(1)* %out
+ store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -17,10 +17,10 @@
; GFX10PLUS: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
; GFX10PLUS: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] dpp8:[1,0,0,0,0,0,0,0]{{$}}
; GFX10PLUS: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] dpp8:[5,0,0,0,0,0,0,0]{{$}}
-define amdgpu_kernel void @dpp8_wait_states(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
%tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0
%tmp1 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %tmp0, i32 5) #0
- store i32 %tmp1, i32 addrspace(1)* %out
+ store i32 %tmp1, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 83bc8b2..5bf6c93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -7,11 +7,11 @@
; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-DAG: v_mov_b32_e32 v5, v1
; GCN-DAG: v_mov_b32_e32 v4, v0
-define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8(ptr addrspace(1) %out, i64 %src) {
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[4:5]},v"(i64 %src) #0
%tmp1 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
%tmp2 = call i64 asm ";; force constraint", "=v,{v[4:5]}"(i64 %tmp1) #0
- store i64 %tmp2, i64 addrspace(1)* %out, align 4
+ store i64 %tmp2, ptr addrspace(1) %out, align 4
ret void
}
@@ -19,13 +19,13 @@
; GCN: v_mqsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
; GCN-DAG: v_mov_b32_e32 v3, v1
; GCN-DAG: v_mov_b32_e32 v2, v0
-define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(ptr addrspace(1) %out, i64 %src, i32 %a, i64 %b) {
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0
%tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={v[6:7]},v"(i64 %b) #0
%tmp3 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
%tmp4 = call i64 asm ";; force constraint", "=v,{v[2:3]}"(i64 %tmp3) #0
- store i64 %tmp4, i64 addrspace(1)* %out, align 4
+ store i64 %tmp4, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
index 685b5e0..488866e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@@ -7,12 +7,12 @@
; GCN-DAG: v_mov_b32_e32 v0, v2
; GCN-DAG: v_mov_b32_e32 v1, v3
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(ptr addrspace(1) %out, i64 %src, i32 %a) {
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0
- store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
+ store <4 x i32> %tmp3, ptr addrspace(1) %out, align 4
ret void
}
@@ -20,12 +20,12 @@
; GCN-DAG: v_mov_b32_e32 v0, v2
; GCN-DAG: v_mov_b32_e32 v1, v3
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
+define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(ptr addrspace(1) %out, i64 %src, i32 %a, <4 x i32> %b) {
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %b) #0
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0
- store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
+ store <4 x i32> %tmp3, ptr addrspace(1) %out, align 4
ret void
}
@@ -33,12 +33,12 @@
; GCN-DAG: v_mov_b32_e32 v0, v2
; GCN-DAG: v_mov_b32_e32 v1, v3
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(ptr addrspace(1) %out, i64 %src, i32 %a) {
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0
- store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
+ store <4 x i32> %tmp3, ptr addrspace(1) %out, align 4
ret void
}
@@ -46,13 +46,13 @@
; GCN-DAG: v_mov_b32_e32 v0, v2
; GCN-DAG: v_mov_b32_e32 v1, v3
; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
- %in = load <4 x i32>, <4 x i32> addrspace(1) * %input
+define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(ptr addrspace(1) %out, i64 %src, i32 %a, ptr addrspace(1) %input) {
+ %in = load <4 x i32>, ptr addrspace(1) %input
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0
%tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %in) #0
%tmp3 = call <4 x i32> asm ";; force constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0
- store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4
+ store <4 x i32> %tmp3, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
index 2eb3179..ee9d866 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
@@ -5,17 +5,17 @@
; GCN-LABEL: {{^}}v_msad_u8:
; GCN: v_msad_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_msad_u8(ptr addrspace(1) %out, i32 %src) {
%result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}v_msad_u8_non_immediate:
; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_msad_u8_non_immediate(ptr addrspace(1) %out, i32 %src, i32 %a, i32 %b) {
%result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 %a, i32 %b) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
index a1dbe9a..b58e571 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll
@@ -2,9 +2,9 @@
; GCN-LABEL: {{^}}test_mul_i24:
; GCN: v_mul_i32_i24
-define amdgpu_kernel void @test_mul_i24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @test_mul_i24(ptr addrspace(1) %out, i32 %src1, i32 %src2) #1 {
%val = call i32 @llvm.amdgcn.mul.i24(i32 %src1, i32 %src2) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
index 810b503..58efab8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll
@@ -2,9 +2,9 @@
; GCN-LABEL: {{^}}test_mul_u24:
; GCN: v_mul_u32_u24
-define amdgpu_kernel void @test_mul_u24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @test_mul_u24(ptr addrspace(1) %out, i32 %src1, i32 %src2) #1 {
%val = call i32 @llvm.amdgcn.mul.u24(i32 %src1, i32 %src2) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
index 4d9ba39..bb3bf2b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.ll
@@ -5,41 +5,41 @@
; GCN-LABEL: {{^}}v_perm_b32_v_v_v:
; GCN: v_perm_b32 v{{[0-9]+}}, v0, v1, v2
-define amdgpu_ps void @v_perm_b32_v_v_v(i32 %src1, i32 %src2, i32 %src3, i32 addrspace(1)* %out) #1 {
+define amdgpu_ps void @v_perm_b32_v_v_v(i32 %src1, i32 %src2, i32 %src3, ptr addrspace(1) %out) #1 {
%val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 %src3) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_perm_b32_v_v_c:
; GCN: v_perm_b32 v{{[0-9]+}}, v0, v1, {{[vs][0-9]+}}
-define amdgpu_ps void @v_perm_b32_v_v_c(i32 %src1, i32 %src2, i32 addrspace(1)* %out) #1 {
+define amdgpu_ps void @v_perm_b32_v_v_c(i32 %src1, i32 %src2, ptr addrspace(1) %out) #1 {
%val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 12345) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_perm_b32_s_v_c:
; GCN: v_perm_b32 v{{[0-9]+}}, s0, v0, v{{[0-9]+}}
-define amdgpu_ps void @v_perm_b32_s_v_c(i32 inreg %src1, i32 %src2, i32 addrspace(1)* %out) #1 {
+define amdgpu_ps void @v_perm_b32_s_v_c(i32 inreg %src1, i32 %src2, ptr addrspace(1) %out) #1 {
%val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 12345) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_perm_b32_s_s_c:
; GCN: v_perm_b32 v{{[0-9]+}}, s0, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_ps void @v_perm_b32_s_s_c(i32 inreg %src1, i32 inreg %src2, i32 addrspace(1)* %out) #1 {
+define amdgpu_ps void @v_perm_b32_s_s_c(i32 inreg %src1, i32 inreg %src2, ptr addrspace(1) %out) #1 {
%val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 12345) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_perm_b32_v_s_i:
; GCN: v_perm_b32 v{{[0-9]+}}, v0, s0, 1
-define amdgpu_ps void @v_perm_b32_v_s_i(i32 %src1, i32 inreg %src2, i32 addrspace(1)* %out) #1 {
+define amdgpu_ps void @v_perm_b32_v_s_i(i32 %src1, i32 inreg %src2, ptr addrspace(1) %out) #1 {
%val = call i32 @llvm.amdgcn.perm(i32 %src1, i32 %src2, i32 1) #0
- store i32 %val, i32 addrspace(1)* %out
+ store i32 %val, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index ca48ce8..c3c0d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -11,18 +11,18 @@
; GCN-LABEL: {{^}}v_permlane16_b32_vss:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_vii:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0) #1 {
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -31,9 +31,9 @@
; GFX10PLUS-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], 0xc1d1{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0) #1 {
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -45,11 +45,11 @@
; GFX11-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], [[VSRC1]]
; GFX11-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], [[VSRC2]]
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -58,10 +58,10 @@
; GFX10PLUS: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -70,55 +70,55 @@
; GFX10PLUS: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v{{[0-9]+}}
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,0|1,0,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_vss_bc:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{0,1|0,1,0,0}}]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi_bc:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,1|1,1,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_vss:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_vii:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src0) #1 {
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -127,9 +127,9 @@
; GFX10PLUS-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], 0xc1d1{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src0) #1 {
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -141,11 +141,11 @@
; GFX11-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], [[VSRC1]]
; GFX11-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], [[VSRC2]]
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src0) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -154,10 +154,10 @@
; GFX10PLUS: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
@@ -166,153 +166,153 @@
; GFX10PLUS: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v{{[0-9]+}}
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,0|1,0,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_vss_bc:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{0,1|0,1,0,0}}]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi_bc:
; GFX10PLUS-NOT: v_readfirstlane_b32
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,1|1,1,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_tid_tid:
; GFX10PLUS: v_permlane16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlane16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_undef_tid:
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlane16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid:
; GFX10PLUS: v_{{(dual_)?}}mov_b32{{(_e32)?}} [[OLD:v[0-9]+]], 0x3039
; GFX10PLUS: v_permlane16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlane16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi:
; GFX10PLUS-NOT: 0x3039
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,0|1,0,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_bc:
; GFX10PLUS-NOT: 0x3039
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{0,1|0,1,0,0}}]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi_bc:
; GFX10PLUS-NOT: 0x3039
; GFX10PLUS: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,1|1,1,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_tid_tid:
; GFX10PLUS: v_permlanex16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_undef_tid:
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid:
; GFX10PLUS: v_{{(dual_)?}}mov_b32{{(_e32)?}} [[OLD:v[0-9]+]], 0x3039
; GFX10PLUS: v_permlanex16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi:
; GFX10PLUS-NOT: 0x3039
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,0|1,0,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_bc:
; GFX10PLUS-NOT: 0x3039
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{0,1|0,1,0,0}}]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi_bc:
; GFX10PLUS-NOT: 0x3039
; GFX10PLUS: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[{{1,1|1,1,0,1}}]{{$}}
-define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 {
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 05b535a..3e80947 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -5,7 +5,7 @@
declare i32 @llvm.amdgcn.permlane64(i32)
declare i32 @llvm.amdgcn.workitem.id.x()
-define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
+define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
; GFX11-LABEL: test_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -19,11 +19,11 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
; GFX11-LABEL: test_i:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -35,11 +35,11 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 99)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_v(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-SDAG-LABEL: test_v:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -61,6 +61,6 @@
; GFX11-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index 1f46613..634af67 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -7,11 +7,11 @@
; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-DAG: v_mov_b32_e32 v5, v1
; GCN-DAG: v_mov_b32_e32 v4, v0
-define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8(ptr addrspace(1) %out, i64 %src) {
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[4:5]},v"(i64 %src) #0
%tmp1 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0
%tmp2 = call i64 asm ";; force constraint", "=v,{v[4:5]}"(i64 %tmp1) #0
- store i64 %tmp2, i64 addrspace(1)* %out, align 4
+ store i64 %tmp2, ptr addrspace(1) %out, align 4
ret void
}
@@ -19,13 +19,13 @@
; GCN: v_qsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7]
; GCN-DAG: v_mov_b32_e32 v3, v1
; GCN-DAG: v_mov_b32_e32 v2, v0
-define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(ptr addrspace(1) %out, i64 %src, i32 %a, i64 %b) {
%tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0
%tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0
%tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={v[6:7]},v"(i64 %b) #0
%tmp3 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0
%tmp4 = call i64 asm ";; force constraint", "=v,{v[2:3]}"(i64 %tmp3) #0
- store i64 %tmp4, i64 addrspace(1)* %out, align 4
+ store i64 %tmp4, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index c7d1bbb..312ac21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -6,14 +6,13 @@
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_queue_ptr = 1
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
- %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
- %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
- %value = load i32, i32 addrspace(4)* %header_ptr
- store i32 %value, i32 addrspace(1)* %out
+define amdgpu_kernel void @test(ptr addrspace(1) %out) {
+ %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
+ %value = load i32, ptr addrspace(4) %queue_ptr
+ store i32 %value, ptr addrspace(1) %out
ret void
}
-declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
index 6dfec8c..48ec7cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
@@ -2,9 +2,9 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
-declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
-define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GCN-LABEL: buffer_load_lds_dword:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -18,15 +18,14 @@
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
- %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
- %res = load float, float addrspace(3)* %ptr
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
+ %res = load float, ptr addrspace(3) %lds
ret float %res
}
-define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GCN-LABEL: buffer_load_lds_dword_imm_voffset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v0, 0x800
@@ -35,11 +34,11 @@
; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 2048, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 2048, i32 0, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset) {
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) {
; GCN-LABEL: buffer_load_lds_dword_v_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -47,11 +46,11 @@
; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 inreg %soffset) {
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_s_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -59,11 +58,11 @@
; GCN-NEXT: buffer_load_dword off, s[0:3], s5 lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -71,11 +70,11 @@
; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -83,11 +82,11 @@
; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 offen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GCN-LABEL: buffer_load_lds_ushort:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v0, 0x800
@@ -96,11 +95,11 @@
; GCN-NEXT: buffer_load_ushort v0, s[0:3], 0 offen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 2048, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 2, i32 2048, i32 0, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; GCN-LABEL: buffer_load_lds_ubyte:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -108,6 +107,6 @@
; GCN-NEXT: buffer_load_ubyte off, s[0:3], 0 offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 51816da..50b894c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -110,12 +110,12 @@
; CHECK-LABEL: buffer_load_mmo:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
entry:
- store float 0.0, float addrspace(3)* %lds
+ store float 0.0, ptr addrspace(3) %lds
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
- %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
- store float 0.0, float addrspace(3)* %tmp2
+ %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
+ store float 0.0, ptr addrspace(3) %tmp2
ret float %val
}
@@ -350,10 +350,10 @@
;CHECK-NEXT: buffer_load_{{ushort|u16}} [[VAL:v[0-9]+]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b16 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
main_body:
%val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
- store half %val, half addrspace(3)* %ptr
+ store half %val, ptr addrspace(3) %ptr
ret void
}
@@ -362,10 +362,10 @@
;CHECK-NEXT: buffer_load_{{dword|b32}} [[VAL:v[0-9]+]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b32 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
main_body:
%val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
- store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+ store <2 x half> %val, ptr addrspace(3) %ptr
ret void
}
@@ -374,10 +374,10 @@
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b64 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
main_body:
%val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
- store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+ store <4 x half> %val, ptr addrspace(3) %ptr
ret void
}
@@ -386,10 +386,10 @@
;CHECK-NEXT: buffer_load_{{dword|b32}} [[VAL:v[0-9]+]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b32 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
main_body:
%val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
- store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+ store <2 x i16> %val, ptr addrspace(3) %ptr
ret void
}
@@ -398,10 +398,10 @@
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b64 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr) {
main_body:
%val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
- store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+ store <4 x i16> %val, ptr addrspace(3) %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index 0f1fa15..f49c503 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -8,11 +8,11 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @rcp_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.rcp.f16(half %a.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
index 71db76d..f175540 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
@@ -1,40 +1,40 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: not llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; ERROR: error: <unknown>:0:0: in function rcp_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
+; ERROR: error: <unknown>:0:0: in function rcp_legacy_f32 void (ptr addrspace(1), float): intrinsic not supported on subtarget
declare float @llvm.amdgcn.rcp.legacy(float) #0
; GCN-LABEL: {{^}}rcp_legacy_f32:
; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rcp_legacy_f32(ptr addrspace(1) %out, float %src) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float %src) #0
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; TODO: Really these should be constant folded
; GCN-LABEL: {{^}}rcp_legacy_f32_constant_4.0
; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(ptr addrspace(1) %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float 4.0) #0
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}rcp_legacy_f32_constant_100.0
; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(ptr addrspace(1) %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float 100.0) #0
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}rcp_legacy_undef_f32:
; GCN-NOT: v_rcp_legacy_f32
-define amdgpu_kernel void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_undef_f32(ptr addrspace(1) %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float undef)
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 929f935..74d3404 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -10,27 +10,27 @@
; SI: v_mov_b32_e32 [[NAN:v[0-9]+]], 0x7fc00000
; SI-NOT: [[NAN]]
; SI: buffer_store_dword [[NAN]]
-define amdgpu_kernel void @rcp_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_undef_f32(ptr addrspace(1) %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.f32(float undef)
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}rcp_2_f32:
; SI-NOT: v_rcp_f32
; SI: v_mov_b32_e32 v{{[0-9]+}}, 0.5
-define amdgpu_kernel void @rcp_2_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_2_f32(ptr addrspace(1) %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.f32(float 2.0)
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}rcp_10_f32:
; SI-NOT: v_rcp_f32
; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x3dcccccd
-define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_10_f32(ptr addrspace(1) %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.f32(float 10.0)
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
@@ -38,9 +38,9 @@
; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(ptr addrspace(1) %out, float %src) #1 {
%rcp = fdiv float 1.0, %src, !fpmath !0
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
@@ -48,35 +48,35 @@
; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
+define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(ptr addrspace(1) %out, float %src) #4 {
%rcp = fdiv float 1.0, %src, !fpmath !0
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32:
; SI: v_div_scale_f32
-define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
+define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(ptr addrspace(1) %out, float %src) #3 {
%rcp = fdiv float 1.0, %src
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
; SI: v_rsq_f32_e32
-define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #1 {
%sqrt = call float @llvm.sqrt.f32(float %src)
%rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32:
; SI: v_rsq_f32_e32
-define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #2 {
%sqrt = call float @llvm.sqrt.f32(float %src)
%rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
- store float %rcp, float addrspace(1)* %out, align 4
+ store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
@@ -84,9 +84,9 @@
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_f64(ptr addrspace(1) %out, double %src) #1 {
%rcp = call double @llvm.amdgcn.rcp.f64(double %src)
- store double %rcp, double addrspace(1)* %out, align 8
+ store double %rcp, ptr addrspace(1) %out, align 8
ret void
}
@@ -94,17 +94,17 @@
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define amdgpu_kernel void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_f64(ptr addrspace(1) %out, double %src) #2 {
%rcp = call double @llvm.amdgcn.rcp.f64(double %src)
- store double %rcp, double addrspace(1)* %out, align 8
+ store double %rcp, ptr addrspace(1) %out, align 8
ret void
}
; FUNC-LABEL: {{^}}rcp_pat_f64:
; SI: v_div_scale_f64
-define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
%rcp = fdiv double 1.0, %src
- store double %rcp, double addrspace(1)* %out, align 8
+ store double %rcp, ptr addrspace(1) %out, align 8
ret void
}
@@ -116,9 +116,9 @@
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
-define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
%rcp = fdiv double 1.0, %src
- store double %rcp, double addrspace(1)* %out, align 8
+ store double %rcp, ptr addrspace(1) %out, align 8
ret void
}
@@ -126,10 +126,10 @@
; SI-NOT: v_rsq_f64_e32
; SI: v_sqrt_f64
; SI: v_rcp_f64
-define amdgpu_kernel void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
%sqrt = call double @llvm.sqrt.f64(double %src)
%rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
- store double %rcp, double addrspace(1)* %out, align 8
+ store double %rcp, ptr addrspace(1) %out, align 8
ret void
}
@@ -137,10 +137,10 @@
; SI: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SQRT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
%sqrt = call double @llvm.sqrt.f64(double %src)
%rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
- store double %rcp, double addrspace(1)* %out, align 8
+ store double %rcp, ptr addrspace(1) %out, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 991afdb..e789db1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -4,9 +4,9 @@
; CHECK-LABEL: {{^}}test_readfirstlane:
; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
+define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
- store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
+ store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
@@ -14,7 +14,7 @@
; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32
; CHECK-NOT: [[SGPR_VAL]]
; CHECK: ; use [[SGPR_VAL]]
-define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 {
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
@@ -24,9 +24,9 @@
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
; CHECK-NOT: [[VVAL]]
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 {
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
- store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
+ store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
@@ -34,10 +34,10 @@
; CHECK: s_mov_b32 m0, -1
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
- store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
+ store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
@@ -49,19 +49,19 @@
; CHECK-NOT: readfirstlane
; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
- store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
+ store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
; Make sure this doesn't crash.
; CHECK-LABEL: {{^}}test_readfirstlane_fi:
; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
-define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
%alloca = alloca i32, addrspace(5)
- %int = ptrtoint i32 addrspace(5)* %alloca to i32
+ %int = ptrtoint ptr addrspace(5) %alloca to i32
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 230d7af..51465f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -21,23 +21,23 @@
; CHECK-LABEL: {{^}}test_readlane_imm_sreg:
; CHECK-NOT: v_readlane_b32
-define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
%readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
- store i32 %readlane, i32 addrspace(1)* %out, align 4
+ store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}test_readlane_vregs:
; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
-define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_readlane_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
- %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
+ %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x i32>, ptr addrspace(1) %gep.in
%value = extractelement <2 x i32> %args, i32 0
%lane = extractelement <2 x i32> %args, i32 1
%readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
- store i32 %readlane, i32 addrspace(1)* %out, align 4
+ store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
@@ -46,19 +46,19 @@
; CHECK: s_mov_b32 m0, -1
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
-define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
- store i32 %readlane, i32 addrspace(1)* %out, align 4
+ store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}test_readlane_vgpr_imm:
; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-define amdgpu_kernel void @test_readlane_vgpr_imm(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readlane_vgpr_imm(ptr addrspace(1) %out) #1 {
%vgpr = call i32 asm sideeffect "; def $0", "=v"()
%readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 32) #0
- store i32 %readlane, i32 addrspace(1)* %out, align 4
+ store i32 %readlane, ptr addrspace(1) %out, align 4
ret void
}
@@ -70,10 +70,10 @@
; CHECK-NOT: readlane
; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
-define amdgpu_kernel void @test_readlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
%readfirstlane = call i32 @llvm.amdgcn.readlane(i32 %sgpr, i32 7)
- store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
+ store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 53c8a8c..9b62d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -12,9 +12,9 @@
; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]]
; VI: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f32(ptr addrspace(1) %out, float %src) #0 {
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
- store float %rsq_clamp, float addrspace(1)* %out
+ store float %rsq_clamp, ptr addrspace(1) %out
ret void
}
@@ -30,17 +30,17 @@
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s[[[LOW1]]:[[HIGH1]]]
; VI-DAG: v_max_f64 v[0:1], v[0:1], s[[[LOW1]]:[[HIGH2]]]
-define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f64(ptr addrspace(1) %out, double %src) #0 {
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
- store double %rsq_clamp, double addrspace(1)* %out
+ store double %rsq_clamp, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}rsq_clamp_undef_f32:
; SI-NOT: v_rsq_clamp_f32
-define amdgpu_kernel void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @rsq_clamp_undef_f32(ptr addrspace(1) %out) #0 {
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
- store float %rsq_clamp, float addrspace(1)* %out
+ store float %rsq_clamp, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index fd48021..4c34446 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -8,11 +8,11 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @rsq_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.rsq.f16(half %a.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
index 7f4c2cb..d987746 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
@@ -4,34 +4,34 @@
; FUNC-LABEL: {{^}}rsq_legacy_f32:
; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(ptr addrspace(1) %out, float %src) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
; TODO: Really these should be constant folded
; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0
; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(ptr addrspace(1) %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0
; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(ptr addrspace(1) %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}rsq_legacy_undef_f32:
; SI-NOT: v_rsq_legacy_f32
-define amdgpu_kernel void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_undef_f32(ptr addrspace(1) %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float undef)
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index 0ce26d0..2be94f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -6,43 +6,43 @@
; FUNC-LABEL: {{^}}rsq_f32:
; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %out, float %src) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
; TODO: Really these should be constant folded
; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
-define amdgpu_kernel void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_4.0(ptr addrspace(1) %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
-define amdgpu_kernel void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_100.0(ptr addrspace(1) %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}rsq_f64:
; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rsq_f64(ptr addrspace(1) %out, double %src) #1 {
%rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0
- store double %rsq, double addrspace(1)* %out, align 4
+ store double %rsq, ptr addrspace(1) %out, align 4
ret void
}
; TODO: Really these should be constant folded
; FUNC-LABEL: {{^}}rsq_f64_constant_4.0
; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define amdgpu_kernel void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_4.0(ptr addrspace(1) %out) #1 {
%rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0
- store double %rsq, double addrspace(1)* %out, align 4
+ store double %rsq, ptr addrspace(1) %out, align 4
ret void
}
@@ -50,17 +50,17 @@
; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000
; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_100.0(ptr addrspace(1) %out) #1 {
%rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0
- store double %rsq, double addrspace(1)* %out, align 4
+ store double %rsq, ptr addrspace(1) %out, align 4
ret void
}
; FUNC-LABEL: {{^}}rsq_undef_f32:
; SI-NOT: v_rsq_f32
-define amdgpu_kernel void @rsq_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_undef_f32(ptr addrspace(1) %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float undef)
- store float %rsq, float addrspace(1)* %out, align 4
+ store float %rsq, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 4a3c1a5..48c4e02 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -4,7 +4,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT2 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT3 %s
-define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
+define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT0-LABEL: test_barrier:
; VARIANT0: ; %bb.0: ; %entry
; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -87,14 +87,14 @@
; VARIANT3-NEXT: s_endpgm
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
- store i32 %tmp, i32 addrspace(1)* %tmp1
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 %tmp, ptr addrspace(1) %tmp1
call void @llvm.amdgcn.s.barrier()
%tmp3 = sub i32 %size, 1
%tmp4 = sub i32 %tmp3, %tmp
- %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
- %tmp6 = load i32, i32 addrspace(1)* %tmp5
- store i32 %tmp6, i32 addrspace(1)* %tmp1
+ %tmp5 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp4
+ %tmp6 = load i32, ptr addrspace(1) %tmp5
+ store i32 %tmp6, ptr addrspace(1) %tmp1
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 49bb769..1042290 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -214,7 +214,7 @@
define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) {
main_body:
%tmp = shl i32 %index, 4
- store i32 %tmp, i32 addrspace(1)* @gv
+ store i32 %tmp, ptr addrspace(1) @gv
br label %bb1
bb1: ; preds = %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
index b7fb96a..d899aeb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -24,7 +24,7 @@
br label %end
end:
- store volatile i32 3, i32 addrspace(1)* undef
+ store volatile i32 3, ptr addrspace(1) undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
index 41f398f..fc8d61f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -24,7 +24,7 @@
br label %end
end:
- store volatile i32 3, i32 addrspace(1)* undef
+ store volatile i32 3, ptr addrspace(1) undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
index 9026d48..43f2f61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -22,7 +22,7 @@
br label %end
end:
- store volatile i32 3, i32 addrspace(1)* undef
+ store volatile i32 3, ptr addrspace(1) undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
index 7f6e115..4ba28cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -22,7 +22,7 @@
br label %end
end:
- store volatile i32 3, i32 addrspace(1)* undef
+ store volatile i32 3, ptr addrspace(1) undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
index 111fd35..46953d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll
@@ -8,11 +8,11 @@
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10: v_mov_b32_e32 [[VDEST:v[0-9]+]], [[DEST]]
; GFX10: global_store_dword v{{[0-9]+}}, [[VDEST]], s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @test_s_get_waveid_in_workgroup(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_s_get_waveid_in_workgroup(ptr addrspace(1) %out) {
; Make sure %out is loaded and associated wait count already inserted
- store i32 0, i32 addrspace(1)* %out
+ store i32 0, ptr addrspace(1) %out
%v = call i32 @llvm.amdgcn.s.get.waveid.in.workgroup()
- store i32 %v, i32 addrspace(1)* %out
+ store i32 %v, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
index 22e15e2..962b6c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
@@ -6,9 +6,9 @@
; GCN: s_load_dwordx2
; GCN-DAG: s_getpc_b64 s{{\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @test_s_getpc(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_getpc(ptr addrspace(1) %out) #0 {
%tmp = call i64 @llvm.amdgcn.s.getpc() #1
- store volatile i64 %tmp, i64 addrspace(1)* %out, align 8
+ store volatile i64 %tmp, ptr addrspace(1) %out, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
index 69cbf62..def4394 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -9,20 +9,20 @@
; GCN-LABEL: {{^}}s_getreg_test:
; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @s_getreg_test(ptr addrspace(1) %out) { ; simm16=45574 for lds size.
%lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574)
%lds_size_bytes = shl i32 %lds_size_64dwords, 8
- store i32 %lds_size_bytes, i32 addrspace(1)* %out
+ store i32 %lds_size_bytes, ptr addrspace(1) %out
ret void
}
; Call site has additional readnone knowledge.
; GCN-LABEL: {{^}}readnone_s_getreg_test:
; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @readnone_s_getreg_test(ptr addrspace(1) %out) { ; simm16=45574 for lds size.
%lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #1
%lds_size_bytes = shl i32 %lds_size_64dwords, 8
- store i32 %lds_size_bytes, i32 addrspace(1)* %out
+ store i32 %lds_size_bytes, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
index 43d95dc..e4bfb08 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -13,12 +13,12 @@
; GCN-NOT: lgkmcnt
; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
; GCN: _store_dwordx2
-define amdgpu_kernel void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memrealtime(ptr addrspace(1) %out) #0 {
%cycle0 = call i64 @llvm.amdgcn.s.memrealtime()
- store volatile i64 %cycle0, i64 addrspace(1)* %out
+ store volatile i64 %cycle0, ptr addrspace(1) %out
%cycle1 = call i64 @llvm.amdgcn.s.memrealtime()
- store volatile i64 %cycle1, i64 addrspace(1)* %out
+ store volatile i64 %cycle1, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
index cd2eada..df8fb22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -13,12 +13,12 @@
; SIVI-NOT: lgkmcnt
; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
; GCN: {{buffer|global}}_store_dwordx2
-define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memtime(ptr addrspace(1) %out) #0 {
%cycle0 = call i64 @llvm.amdgcn.s.memtime()
- store volatile i64 %cycle0, i64 addrspace(1)* %out
+ store volatile i64 %cycle0, ptr addrspace(1) %out
%cycle1 = call i64 @llvm.amdgcn.s.memtime()
- store volatile i64 %cycle1, i64 addrspace(1)* %out
+ store volatile i64 %cycle1, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
index 560d320..8471626 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
@@ -5,17 +5,17 @@
; GCN-LABEL: {{^}}v_sad_hi_u8:
; GCN: v_sad_hi_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_hi_u8(ptr addrspace(1) %out, i32 %src) {
%result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}v_sad_hi_u8_non_immediate:
; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_hi_u8_non_immediate(ptr addrspace(1) %out, i32 %src, i32 %a, i32 %b) {
%result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 %a, i32 %b) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
index b159b84..b2876f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
@@ -5,17 +5,17 @@
; GCN-LABEL: {{^}}v_sad_u16:
; GCN: v_sad_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u16(ptr addrspace(1) %out, i32 %src) {
%result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}v_sad_u16_non_immediate:
; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u16_non_immediate(ptr addrspace(1) %out, i32 %src, i32 %a, i32 %b) {
%result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 %a, i32 %b) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
index cde7e7c..8c0ca22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
@@ -5,17 +5,17 @@
; GCN-LABEL: {{^}}v_sad_u8:
; GCN: v_sad_u8 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u8(ptr addrspace(1) %out, i32 %src) {
%result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}v_sad_u8_non_immediate:
; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u8_non_immediate(ptr addrspace(1) %out, i32 %src, i32 %a, i32 %b) {
%result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 %a, i32 %b) #0
- store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index 8281a21..d15a60e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -3,60 +3,60 @@
; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
; GCN: v_bfe_i32
-define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_i32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src1)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm:
; GCN: v_bfe_i32
-define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}bfe_i32_arg_imm_arg:
; GCN: v_bfe_i32
-define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg:
; GCN: v_bfe_i32
-define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}v_bfe_print_arg:
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
-define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 {
- %load = load i32, i32 addrspace(1)* %src0, align 4
+define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace(1) %src0) #0 {
+ %load = load i32, ptr addrspace(1) %src0, align 4
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
%bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
%bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
@@ -64,11 +64,11 @@
; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -78,11 +78,11 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -90,11 +90,11 @@
; GCN: buffer_load_dword
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -103,10 +103,10 @@
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -115,10 +115,10 @@
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -127,10 +127,10 @@
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -139,10 +139,10 @@
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -150,22 +150,22 @@
; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = ashr i32 %x, 31
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+ store i32 %bfe, ptr addrspace(1) %out, align 4 ret void
}
; GCN-LABEL: {{^}}bfe_i32_test_14:
; GCN-NOT: lshr
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = lshr i32 %x, 31
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+ store i32 %bfe, ptr addrspace(1) %out, align 4 ret void
}
; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_0:
@@ -173,9 +173,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -184,9 +184,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -195,9 +195,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -206,9 +206,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -217,9 +217,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -228,9 +228,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -239,9 +239,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -250,9 +250,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -261,9 +261,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -272,9 +272,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -283,9 +283,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -294,9 +294,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -305,9 +305,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -316,9 +316,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -327,9 +327,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -338,9 +338,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -349,9 +349,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -360,9 +360,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -371,9 +371,9 @@
; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
; GCN: buffer_store_dword [[VREG]],
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) #0 {
%bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1)
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_i32, ptr addrspace(1) %out, align 4
ret void
}
@@ -383,12 +383,12 @@
; GCN-NOT: v_ashr
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
; GCN: buffer_store_dword [[BFE]],
-define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24)
%shl = shl i32 %bfe, 8
%ashr = ashr i32 %shl, 8
- store i32 %ashr, i32 addrspace(1)* %out, align 4
+ store i32 %ashr, ptr addrspace(1) %out, align 4
ret void
}
@@ -399,21 +399,21 @@
; GCN: v_add_{{[iu]}}32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
; GCN: buffer_store_dword [[TMP2]]
-define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %src = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %src = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16)
%div = sdiv i32 %bfe, 2
- store i32 %div, i32 addrspace(1)* %out, align 4
+ store i32 %div, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}bfe_0_width:
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %load = load i32, ptr addrspace(1) %ptr, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 8, i32 0)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -421,22 +421,22 @@
; GCN: v_bfe_i32
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %load = load i32, ptr addrspace(1) %ptr, align 4
%bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8)
%bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8)
- store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ store i32 %bfe1, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}bfe_8_bfe_16:
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %load = load i32, ptr addrspace(1) %ptr, align 4
%bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8)
%bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 16)
- store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ store i32 %bfe1, ptr addrspace(1) %out, align 4
ret void
}
@@ -445,11 +445,11 @@
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %load = load i32, ptr addrspace(1) %ptr, align 4
%bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16)
%bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8)
- store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ store i32 %bfe1, ptr addrspace(1) %out, align 4
ret void
}
@@ -458,22 +458,22 @@
; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
%c = add i32 %a, %b ; add to prevent folding into extload
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8)
%shl = shl i32 %bfe, 24
%ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
+ store i32 %ashr, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
-define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
%c = add i32 %a, %b ; add to prevent folding into extload
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0)
%shl = shl i32 %bfe, 24
%ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
+ store i32 %ashr, ptr addrspace(1) %out, align 4
ret void
}
@@ -481,13 +481,13 @@
; GCN: buffer_load_sbyte
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
- %load = load i8, i8 addrspace(1)* %ptr, align 1
+define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %load = load i8, ptr addrspace(1) %ptr, align 1
%sext = sext i8 %load to i32
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 0, i32 8)
%shl = shl i32 %bfe, 24
%ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
+ store i32 %ashr, ptr addrspace(1) %out, align 4
ret void
}
@@ -495,13 +495,13 @@
; GCN-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
; GCN-NOT: {{[^@]}}bfe
; GCN: s_endpgm
-define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
- %load = load i8, i8 addrspace(1)* %ptr, align 1
+define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+ %load = load i8, ptr addrspace(1) %ptr, align 1
%sext = sext i8 %load to i32
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 8, i32 0)
%shl = shl i32 %bfe, 24
%ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
+ store i32 %ashr, ptr addrspace(1) %out, align 4
ret void
}
@@ -510,12 +510,12 @@
; GCN-NOT: shl
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
; GCN: s_endpgm
-define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%shr = ashr i32 %shl, 31
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -525,12 +525,12 @@
; GCN-NOT: shr
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
; GCN: s_endpgm
-define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 30
%shr = ashr i32 %shl, 30
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
@@ -541,12 +541,12 @@
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
; GCN: s_endpgm
-define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 30
%shr = ashr i32 %shl, 30
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 2)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 00d1cbb..035903b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -26,7 +26,7 @@
ret void
}
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -161,11 +161,11 @@
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #2
- %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
+ %gep1 = getelementptr <32 x i32>, ptr addrspace(1) %in, i32 %tid
+ %load = load <32 x i32>, ptr addrspace(1) %gep1
%mul = mul <32 x i32> %load, %load
- %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
- store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
+ %gep2 = getelementptr <32 x i32>, ptr addrspace(1) %out, i32 %tid
+ store <32 x i32> %mul, ptr addrspace(1) %gep2
; 8 VMEM read
call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 8, i32 0)
; 30 VALU
@@ -175,7 +175,7 @@
ret void
}
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -336,11 +336,11 @@
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #2
- %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
+ %gep1 = getelementptr <32 x i32>, ptr addrspace(1) %in, i32 %tid
+ %load = load <32 x i32>, ptr addrspace(1) %gep1
%mul = mul <32 x i32> %load, %load
- %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
- store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
+ %gep2 = getelementptr <32 x i32>, ptr addrspace(1) %out, i32 %tid
+ store <32 x i32> %mul, ptr addrspace(1) %gep2
; 1 VMEM read
call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
; 2 VALU
@@ -378,7 +378,7 @@
ret void
}
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -555,11 +555,11 @@
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #2
- %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
+ %gep1 = getelementptr <32 x i32>, ptr addrspace(1) %in, i32 %tid
+ %load = load <32 x i32>, ptr addrspace(1) %gep1
%mul = mul <32 x i32> %load, %load
- %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
- store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
+ %gep2 = getelementptr <32 x i32>, ptr addrspace(1) %out, i32 %tid
+ store <32 x i32> %mul, ptr addrspace(1) %gep2
; 1 VMEM read
call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
; 2 VALU
@@ -611,7 +611,7 @@
ret void
}
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(<32 x float> addrspace(3)* noalias %in, <32 x float> addrspace(3)* noalias %out) #0 {
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -825,31 +825,31 @@
; EXACTCUTOFF-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
- %load.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %in, i32 %idx
- %load.0 = load <32 x float>, <32 x float> addrspace(3)* %load.0.addr
- %load.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.0.addr, i32 64
- %load.1 = load <32 x float>, <32 x float> addrspace(3)* %load.1.addr
- %load.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.1.addr, i32 128
- %load.2 = load <32 x float>, <32 x float> addrspace(3)* %load.2.addr
- %load.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.2.addr, i32 192
- %load.3 = load <32 x float>, <32 x float> addrspace(3)* %load.3.addr
- %load.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.3.addr, i32 256
- %load.4 = load <32 x float>, <32 x float> addrspace(3)* %load.4.addr
+ %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
+ %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
+ %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr
+ %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr
+ %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
+ %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr
+ %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
+ %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0)
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
%mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
%mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
- %store.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 %idx
- store <32 x float> %mai.0, <32 x float> addrspace(3)* %store.0.addr
- %store.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 64
- store <32 x float> %mai.1, <32 x float> addrspace(3)* %store.1.addr
- %store.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 128
- store <32 x float> %mai.2, <32 x float> addrspace(3)* %store.2.addr
- %store.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 192
- store <32 x float> %mai.3, <32 x float> addrspace(3)* %store.3.addr
- %store.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 256
- store <32 x float> %mai.4, <32 x float> addrspace(3)* %store.4.addr
+ %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+ store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr
+ %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
+ store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr
+ %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
+ store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr
+ %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
+ store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr
+ %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
+ store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr
; 40 DS read
call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 40, i32 0)
; 5 MFMA
@@ -859,7 +859,7 @@
ret void
}
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(<32 x float> addrspace(3)* noalias %in, <32 x float> addrspace(3)* noalias %out) #0 {
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1127,31 +1127,31 @@
; EXACTCUTOFF-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
- %load.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %in, i32 %idx
- %load.0 = load <32 x float>, <32 x float> addrspace(3)* %load.0.addr
- %load.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.0.addr, i32 64
- %load.1 = load <32 x float>, <32 x float> addrspace(3)* %load.1.addr
- %load.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.1.addr, i32 128
- %load.2 = load <32 x float>, <32 x float> addrspace(3)* %load.2.addr
- %load.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.2.addr, i32 192
- %load.3 = load <32 x float>, <32 x float> addrspace(3)* %load.3.addr
- %load.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.3.addr, i32 256
- %load.4 = load <32 x float>, <32 x float> addrspace(3)* %load.4.addr
+ %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
+ %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
+ %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr
+ %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr
+ %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
+ %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr
+ %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
+ %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0)
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
%mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
%mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
- %store.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 %idx
- store <32 x float> %mai.0, <32 x float> addrspace(3)* %store.0.addr
- %store.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 64
- store <32 x float> %mai.1, <32 x float> addrspace(3)* %store.1.addr
- %store.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 128
- store <32 x float> %mai.2, <32 x float> addrspace(3)* %store.2.addr
- %store.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 192
- store <32 x float> %mai.3, <32 x float> addrspace(3)* %store.3.addr
- %store.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 256
- store <32 x float> %mai.4, <32 x float> addrspace(3)* %store.4.addr
+ %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+ store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr
+ %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
+ store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr
+ %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
+ store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr
+ %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
+ store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr
+ %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
+ store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr
; 8 DS read
call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 8, i32 0)
; 1 MFMA
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
index 241c0de..2ec4fb7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
@@ -10,16 +10,16 @@
; GFX908: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot2_clamp(
- i32 addrspace(1)* %r,
- <2 x i16> addrspace(1)* %a,
- <2 x i16> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
- %c.val = load i32, i32 addrspace(1)* %c
+ %a.val = load <2 x i16>, ptr addrspace(1) %a
+ %b.val = load <2 x i16>, ptr addrspace(1) %b
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
@@ -28,15 +28,15 @@
; GFX908: v_dot2c_i32_i16_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot2_no_clamp(
- i32 addrspace(1)* %r,
- <2 x i16> addrspace(1)* %a,
- <2 x i16> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
- %c.val = load i32, i32 addrspace(1)* %c
+ %a.val = load <2 x i16>, ptr addrspace(1) %a
+ %b.val = load <2 x i16>, ptr addrspace(1) %b
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
index 4166101..aa20f35 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
@@ -10,18 +10,18 @@
; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(
- i32 addrspace(1)* %r,
- <4 x i8> addrspace(1)* %a,
- <4 x i8> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
- %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+ %a.val = load <4 x i8>, ptr addrspace(1) %a
+ %b.val = load <4 x i8>, ptr addrspace(1) %b
%a.val.cast = bitcast <4 x i8> %a.val to i32
%b.val.cast = bitcast <4 x i8> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
@@ -29,17 +29,17 @@
; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot4c_i32_i8_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp(
- i32 addrspace(1)* %r,
- <4 x i8> addrspace(1)* %a,
- <4 x i8> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
- %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+ %a.val = load <4 x i8>, ptr addrspace(1) %a
+ %b.val = load <4 x i8>, ptr addrspace(1) %b
%a.val.cast = bitcast <4 x i8> %a.val to i32
%b.val.cast = bitcast <4 x i8> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
index 948bc53..a8cadaa8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
@@ -12,18 +12,18 @@
; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
- i32 addrspace(1)* %r,
- <8 x i4> addrspace(1)* %a,
- <8 x i4> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
- %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+ %a.val = load <8 x i4>, ptr addrspace(1) %a
+ %b.val = load <8 x i4>, ptr addrspace(1) %b
%a.val.cast = bitcast <8 x i4> %a.val to i32
%b.val.cast = bitcast <8 x i4> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
@@ -32,17 +32,17 @@
; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(
- i32 addrspace(1)* %r,
- <8 x i4> addrspace(1)* %a,
- <8 x i4> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
- %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+ %a.val = load <8 x i4>, ptr addrspace(1) %a
+ %b.val = load <8 x i4>, ptr addrspace(1) %b
%a.val.cast = bitcast <8 x i4> %a.val to i32
%b.val.cast = bitcast <8 x i4> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index ae42d68..5cc5963 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -2,7 +2,7 @@
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
-define amdgpu_kernel void @test_get_doorbell(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_doorbell:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -23,11 +23,11 @@
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128)
- store i32 %ret, i32 addrspace(1)* %out
+ store i32 %ret, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_get_ddid(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_ddid:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -48,11 +48,11 @@
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 129)
- store i32 %ret, i32 addrspace(1)* %out
+ store i32 %ret, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_get_tma(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tma:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -64,11 +64,11 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 130)
- store i64 %ret, i64 addrspace(1)* %out
+ store i64 %ret, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_get_realtime(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_realtime:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -80,11 +80,11 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 131)
- store i64 %ret, i64 addrspace(1)* %out
+ store i64 %ret, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_savewave(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_savewave:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -105,11 +105,11 @@
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 132)
- store i32 %ret, i32 addrspace(1)* %out
+ store i32 %ret, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_get_tba(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tba:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -121,11 +121,11 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 133)
- store i64 %ret, i64 addrspace(1)* %out
+ store i64 %ret, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_get_0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_0_i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -146,11 +146,11 @@
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
- store i32 %ret, i32 addrspace(1)* %out
+ store i32 %ret, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_get_99999_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_99999_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
@@ -162,7 +162,7 @@
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 99999)
- store i64 %ret, i64 addrspace(1)* %out
+ store i64 %ret, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 21a36c5..6d1ea93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -early-live-intervals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -17,11 +17,11 @@
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
- store i32 %tmp, i32 addrspace(1)* %out
+ store i32 %tmp, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -39,11 +39,11 @@
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
- store i64 %tmp, i64 addrspace(1)* %out
+ store i64 %tmp, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) {
+define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) {
; GCN-LABEL: set_inactive_scc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -82,12 +82,12 @@
br i1 %cmp, label %.zero, label %.one
.zero:
- store i32 %tmp, i32 addrspace(1)* %out
+ store i32 %tmp, ptr addrspace(1) %out
br label %.exit
.one:
%tmp.1 = add i32 %tmp, 1
- store i32 %tmp.1, i32 addrspace(1)* %out
+ store i32 %tmp.1, ptr addrspace(1) %out
br label %.exit
.exit:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index d29ae5b..1eb4675 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -8,9 +8,9 @@
; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]]
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
-define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_flbit(ptr addrspace(1) noalias %out, i32 %val) #0 {
%r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
- store i32 %r, i32 addrspace(1)* %out, align 4
+ store i32 %r, ptr addrspace(1) %out, align 4
ret void
}
@@ -18,10 +18,10 @@
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+define amdgpu_kernel void @v_flbit(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
+ %val = load i32, ptr addrspace(1) %valptr, align 4
%r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
- store i32 %r, i32 addrspace(1)* %out, align 4
+ store i32 %r, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
index 4b930bf..238604f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
@@ -8,11 +8,11 @@
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @sin_f16(
- half addrspace(1)* %r,
- half addrspace(1)* %a) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
entry:
- %a.val = load half, half addrspace(1)* %a
+ %a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.amdgcn.sin.f16(half %a.val)
- store half %r.val, half addrspace(1)* %r
+ store half %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
index 06d2473..a605e31 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -5,9 +5,9 @@
; GCN-LABEL: {{^}}v_sin_f32:
; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_sin_f32(ptr addrspace(1) %out, float %src) #1 {
%sin = call float @llvm.amdgcn.sin.f32(float %src) #0
- store float %sin, float addrspace(1)* %out
+ store float %sin, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
index 4e89591..3446dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
@@ -122,10 +122,10 @@
;CHECK-LABEL: {{^}}buffer_load_v4i32_tfe:
;CHECK: buffer_load_format_xyzw v[2:6], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, <4 x i32> addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x i32>, i32 } %load, 0
- store <4 x i32> %data, <4 x i32> addrspace(1)* %out
+ store <4 x i32> %data, ptr addrspace(1) %out
%status = extractvalue { <4 x i32>, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
@@ -134,10 +134,10 @@
;CHECK-LABEL: {{^}}buffer_load_v4f32_tfe:
;CHECK: buffer_load_format_xyzw v[2:6], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, <4 x float> addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x float>, i32 } %load, 0
- store <4 x float> %data, <4 x float> addrspace(1)* %out
+ store <4 x float> %data, ptr addrspace(1) %out
%status = extractvalue { <4 x float>, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
@@ -146,10 +146,10 @@
;CHECK-LABEL: {{^}}buffer_load_v3i32_tfe:
;CHECK: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, <3 x i32> addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x i32>, i32 } %load, 0
- store <3 x i32> %data, <3 x i32> addrspace(1)* %out
+ store <3 x i32> %data, ptr addrspace(1) %out
%status = extractvalue { <3 x i32>, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
@@ -158,10 +158,10 @@
;CHECK-LABEL: {{^}}buffer_load_v3f32_tfe:
;CHECK: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, <3 x float> addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x float>, i32 } %load, 0
- store <3 x float> %data, <3 x float> addrspace(1)* %out
+ store <3 x float> %data, ptr addrspace(1) %out
%status = extractvalue { <3 x float>, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
@@ -171,10 +171,10 @@
;GFX6: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;GFX8PLUS: buffer_load_format_xy v[2:4], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, <2 x i32> addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x i32>, i32 } %load, 0
- store <2 x i32> %data, <2 x i32> addrspace(1)* %out
+ store <2 x i32> %data, ptr addrspace(1) %out
%status = extractvalue { <2 x i32>, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
@@ -184,10 +184,10 @@
;GFX6: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;GFX8PLUS: buffer_load_format_xy v[2:4], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, <2 x float> addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x float>, i32 } %load, 0
- store <2 x float> %data, <2 x float> addrspace(1)* %out
+ store <2 x float> %data, ptr addrspace(1) %out
%status = extractvalue { <2 x float>, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
@@ -196,10 +196,10 @@
;CHECK-LABEL: {{^}}buffer_load_i32_tfe:
;CHECK: buffer_load_format_x v[2:3], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, i32 addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i32, i32 } %load, 0
- store i32 %data, i32 addrspace(1)* %out
+ store i32 %data, ptr addrspace(1) %out
%status = extractvalue { i32, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
@@ -208,10 +208,10 @@
;CHECK-LABEL: {{^}}buffer_load_f32_tfe:
;CHECK: buffer_load_format_x v[2:3], {{v[0-9]+}}, s[0:3], 0 idxen tfe
;CHECK: s_waitcnt
-define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, float addrspace(1)* %out) {
+define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %out) {
%load = call { float, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { float, i32 } %load, 0
- store float %data, float addrspace(1)* %out
+ store float %data, ptr addrspace(1) %out
%status = extractvalue { float, i32 } %load, 1
%fstatus = bitcast i32 %status to float
ret float %fstatus
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index 3ddff95..1a7fd5e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -94,11 +94,11 @@
%i7 = zext i16 %i4 to i32
%i8 = zext i16 %i6 to i32
%i9 = add nuw nsw i32 0, 7
- %i10 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %i9
- store i32 %i7, i32 addrspace(3)* %i10, align 4
+ %i10 = getelementptr [0 x i32], ptr addrspace(3) @esgs_ring, i32 0, i32 %i9
+ store i32 %i7, ptr addrspace(3) %i10, align 4
%i11 = add nuw nsw i32 0, 8
- %i12 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %i11
- store i32 %i8, i32 addrspace(3)* %i12, align 4
+ %i12 = getelementptr [0 x i32], ptr addrspace(3) @esgs_ring, i32 0, i32 %i11
+ store i32 %i8, ptr addrspace(3) %i12, align 4
unreachable
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
index b94ba83..b0ece85 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
@@ -2,9 +2,9 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL
-declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
-define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
; SDAG-LABEL: buffer_load_lds_dword:
; SDAG: ; %bb.0: ; %main_body
; SDAG-NEXT: v_mov_b32_e32 v0, 8
@@ -32,15 +32,14 @@
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: ; return to shader part epilog
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
- %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
- %res = load float, float addrspace(3)* %ptr
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
+ %res = load float, ptr addrspace(3) %lds
ret float %res
}
-define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_dword_imm_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -48,11 +47,11 @@
; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset) {
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset) {
; GCN-LABEL: buffer_load_lds_dword_v_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -60,11 +59,11 @@
; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], 0 idxen offen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 inreg %soffset) {
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_s_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -72,11 +71,11 @@
; GCN-NEXT: buffer_load_dword v0, s[0:3], s5 idxen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -84,11 +83,11 @@
; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -96,11 +95,11 @@
; GCN-NEXT: buffer_load_dword v[0:1], s[0:3], s5 idxen offen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_ushort:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v1, 0x800
@@ -109,11 +108,11 @@
; GCN-NEXT: buffer_load_ushort v[0:1], s[0:3], 0 idxen offen lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0)
ret void
}
-define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) {
; GCN-LABEL: buffer_load_lds_ubyte:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 m0, s4
@@ -121,6 +120,6 @@
; GCN-NEXT: buffer_load_ubyte v0, s[0:3], 0 idxen offset:2048 lds
; GCN-NEXT: s_endpgm
main_body:
- call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+ call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
index 84e0f28ba..6fc304c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -116,12 +116,12 @@
; CHECK-LABEL: buffer_load_mmo:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
entry:
- store float 0.0, float addrspace(3)* %lds
+ store float 0.0, ptr addrspace(3) %lds
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
- store float 0.0, float addrspace(3)* %tmp2
+ %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
+ store float 0.0, ptr addrspace(3) %tmp2
ret float %val
}
@@ -205,10 +205,10 @@
;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
main_body:
%val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
- store half %val, half addrspace(3)* %ptr
+ store half %val, ptr addrspace(3) %ptr
ret void
}
@@ -217,10 +217,10 @@
;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
main_body:
%val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
- store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+ store <2 x half> %val, ptr addrspace(3) %ptr
ret void
}
@@ -229,10 +229,10 @@
;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
main_body:
%val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
- store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+ store <4 x half> %val, ptr addrspace(3) %ptr
ret void
}
@@ -241,10 +241,10 @@
;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
main_body:
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
- store i16 %val, i16 addrspace(3)* %ptr
+ store i16 %val, ptr addrspace(3) %ptr
ret void
}
@@ -253,10 +253,10 @@
;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
main_body:
%val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
- store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+ store <2 x i16> %val, ptr addrspace(3) %ptr
ret void
}
@@ -265,10 +265,10 @@
;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
main_body:
%val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
- store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+ store <4 x i16> %val, ptr addrspace(3) %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index 8468aa3..d0d5129 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -9,11 +9,11 @@
; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
-define amdgpu_kernel void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load double, double addrspace(1)* %aptr, align 8
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @test_trig_preop_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+ %a = load double, ptr addrspace(1) %aptr, align 8
+ %b = load i32, ptr addrspace(1) %bptr, align 4
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone
- store double %result, double addrspace(1)* %out, align 8
+ store double %result, ptr addrspace(1) %out, align 8
ret void
}
@@ -22,9 +22,9 @@
; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
-define amdgpu_kernel void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
- %a = load double, double addrspace(1)* %aptr, align 8
+define amdgpu_kernel void @test_trig_preop_f64_imm_segment(ptr addrspace(1) %out, ptr addrspace(1) %aptr) nounwind {
+ %a = load double, ptr addrspace(1) %aptr, align 8
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone
- store double %result, double addrspace(1)* %out, align 8
+ store double %result, ptr addrspace(1) %out, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index e0dbfa9..f74446f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -2,7 +2,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
-define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_arg_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -29,11 +29,11 @@
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_arg_imm:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -62,11 +62,11 @@
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_imm_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -95,11 +95,11 @@
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_imm_arg_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -130,11 +130,11 @@
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_0_width_reg_offset:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -155,11 +155,11 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_0_width_imm_offset:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -180,11 +180,11 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zextload_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -218,15 +218,15 @@
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %load = load i8, i8 addrspace(1)* %in
+ %load = load i8, ptr addrspace(1) %in
%ext = zext i8 %load to i32
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
; FIXME: Should be using s_add_i32
-define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -264,15 +264,15 @@
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
%ext = and i32 %add, 255
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -310,15 +310,15 @@
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
%ext = and i32 %add, 65535
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -358,15 +358,15 @@
; VI-NEXT: v_bfe_u32 v0, v0, 1, 8
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
%ext = and i32 %add, 255
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -406,15 +406,15 @@
; VI-NEXT: v_bfe_u32 v0, v0, 3, 8
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
%ext = and i32 %add, 255
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -454,15 +454,15 @@
; VI-NEXT: v_bfe_u32 v0, v0, 7, 8
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
%ext = and i32 %add, 255
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -500,15 +500,15 @@
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
%ext = and i32 %add, 65535
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -544,13 +544,13 @@
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -570,14 +570,14 @@
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -597,14 +597,14 @@
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -624,15 +624,15 @@
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%shr = lshr i32 %shl, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_5:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -668,15 +668,15 @@
; VI-NEXT: v_bfe_i32 v0, v0, 0, 1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%shr = ashr i32 %shl, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_6:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -714,14 +714,14 @@
; VI-NEXT: v_and_b32_e32 v0, 2.0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_7:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -757,14 +757,14 @@
; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -800,14 +800,14 @@
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_9:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -843,13 +843,13 @@
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_10:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -885,13 +885,13 @@
; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_11:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -927,13 +927,13 @@
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_12:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -969,14 +969,14 @@
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
+ store i32 %bfe, ptr addrspace(1) %out, align 4
ret void
}
; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_13:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1012,13 +1012,13 @@
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = ashr i32 %x, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+ store i32 %bfe, ptr addrspace(1) %out, align 4 ret void
}
-define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_14:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1038,14 +1038,14 @@
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
- %x = load i32, i32 addrspace(1)* %in, align 4
+ %x = load i32, ptr addrspace(1) %in, align 4
%shl = lshr i32 %x, 31
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+ store i32 %bfe, ptr addrspace(1) %out, align 4 ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1066,12 +1066,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1092,12 +1092,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1118,12 +1118,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1144,12 +1144,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1170,12 +1170,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_5:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1196,12 +1196,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_6:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1222,12 +1222,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_7:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1248,12 +1248,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1274,12 +1274,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_9:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1300,12 +1300,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_10:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1326,12 +1326,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_11:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1352,12 +1352,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_12:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1378,12 +1378,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_13:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1404,12 +1404,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_14:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1430,12 +1430,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_15:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1456,12 +1456,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1482,12 +1482,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_17:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1508,12 +1508,12 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
; EG-NOT: BFE
-define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 {
; SI-LABEL: bfe_u32_constant_fold_test_18:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1534,7 +1534,7 @@
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out, align 4
ret void
}
@@ -1542,7 +1542,7 @@
; reduced to the bits demanded by the bfe.
; XXX: The operand to v_bfe_u32 could also just directly be the load register.
-define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0,
; SI-LABEL: simplify_bfe_u32_multi_use_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1587,17 +1587,17 @@
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
- i32 addrspace(1)* %out1,
- i32 addrspace(1)* %in) #0 {
- %src = load i32, i32 addrspace(1)* %in, align 4
+ ptr addrspace(1) %out1,
+ ptr addrspace(1) %in) #0 {
+ %src = load i32, ptr addrspace(1) %in, align 4
%and = and i32 %src, 63
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2)
- store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
- store i32 %and, i32 addrspace(1)* %out1, align 4
+ store i32 %bfe_u32, ptr addrspace(1) %out0, align 4
+ store i32 %and, ptr addrspace(1) %out1, align 4
ret void
}
-define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 {
; SI-LABEL: lshr_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
@@ -1623,11 +1623,11 @@
; VI-NEXT: s_endpgm
%b = lshr i32 %a, 6
%c = and i32 %b, 7
- store i32 %c, i32 addrspace(1)* %out, align 8
+ store i32 %c, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; SI-LABEL: v_lshr_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1657,11 +1657,11 @@
; VI-NEXT: s_endpgm
%c = lshr i32 %a, %b
%d = and i32 %c, 7
- store i32 %d, i32 addrspace(1)* %out, align 8
+ store i32 %d, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 {
; SI-LABEL: and_lshr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
@@ -1687,11 +1687,11 @@
; VI-NEXT: s_endpgm
%b = and i32 %a, 448
%c = lshr i32 %b, 6
- store i32 %c, i32 addrspace(1)* %out, align 8
+ store i32 %c, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 {
; SI-LABEL: and_lshr2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
@@ -1717,11 +1717,11 @@
; VI-NEXT: s_endpgm
%b = and i32 %a, 511
%c = lshr i32 %b, 6
- store i32 %c, i32 addrspace(1)* %out, align 8
+ store i32 %c, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 {
; SI-LABEL: shl_lshr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
@@ -1747,7 +1747,7 @@
; VI-NEXT: s_endpgm
%b = shl i32 %a, 9
%c = lshr i32 %b, 11
- store i32 %c, i32 addrspace(1)* %out, align 8
+ store i32 %c, ptr addrspace(1) %out, align 8
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
index f241a23..490ce70 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
@@ -11,16 +11,16 @@
; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp(
- i32 addrspace(1)* %r,
- <2 x i16> addrspace(1)* %a,
- <2 x i16> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
- %c.val = load i32, i32 addrspace(1)* %c
+ %a.val = load <2 x i16>, ptr addrspace(1) %a
+ %b.val = load <2 x i16>, ptr addrspace(1) %b
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
@@ -28,16 +28,16 @@
; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp(
- i32 addrspace(1)* %r,
- <2 x i16> addrspace(1)* %a,
- <2 x i16> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
- %c.val = load i32, i32 addrspace(1)* %c
+ %a.val = load <2 x i16>, ptr addrspace(1) %a
+ %b.val = load <2 x i16>, ptr addrspace(1) %b
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
@@ -46,18 +46,18 @@
; GFX940: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot2_op_sel(
- i32 addrspace(1)* %r,
- <2 x i16> addrspace(1)* %b,
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %b,
i32 %c) {
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i32 %id
- %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+ %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b, i32 %id
+ %b.val = load <2 x i16>, ptr addrspace(1) %b.gep
%b.elt0 = extractelement <2 x i16> %b.val, i32 0
%b.elt1 = extractelement <2 x i16> %b.val, i32 1
%b0 = insertelement <2 x i16> undef, i16 %b.elt1, i32 0
%b1 = insertelement <2 x i16> %b0, i16 %b.elt0, i32 1
%r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 1, i16 1>, <2 x i16> %b1, i32 %c, i1 0)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
index a2fb4fd..7fc79a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
@@ -10,18 +10,18 @@
; GFX9: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot4_clamp(
- i32 addrspace(1)* %r,
- <4 x i8> addrspace(1)* %a,
- <4 x i8> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
- %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+ %a.val = load <4 x i8>, ptr addrspace(1) %a
+ %b.val = load <4 x i8>, ptr addrspace(1) %b
%a.val.cast = bitcast <4 x i8> %a.val to i32
%b.val.cast = bitcast <4 x i8> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
@@ -29,17 +29,17 @@
; GFX9: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot4_no_clamp(
- i32 addrspace(1)* %r,
- <4 x i8> addrspace(1)* %a,
- <4 x i8> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
- %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+ %a.val = load <4 x i8>, ptr addrspace(1) %a
+ %b.val = load <4 x i8>, ptr addrspace(1) %b
%a.val.cast = bitcast <4 x i8> %a.val to i32
%b.val.cast = bitcast <4 x i8> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
index 8bc53c9..4aed5b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
@@ -10,18 +10,18 @@
; GFX9: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot8_clamp(
- i32 addrspace(1)* %r,
- <8 x i4> addrspace(1)* %a,
- <8 x i4> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
- %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+ %a.val = load <8 x i4>, ptr addrspace(1) %a
+ %b.val = load <8 x i4>, ptr addrspace(1) %b
%a.val.cast = bitcast <8 x i4> %a.val to i32
%b.val.cast = bitcast <8 x i4> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
@@ -29,17 +29,17 @@
; GFX9: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_udot8_no_clamp(
- i32 addrspace(1)* %r,
- <8 x i4> addrspace(1)* %a,
- <8 x i4> addrspace(1)* %b,
- i32 addrspace(1)* %c) {
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
entry:
- %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
- %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+ %a.val = load <8 x i4>, ptr addrspace(1) %a
+ %b.val = load <8 x i4>, ptr addrspace(1) %b
%a.val.cast = bitcast <8 x i4> %a.val to i32
%b.val.cast = bitcast <8 x i4> %b.val to i32
- %c.val = load i32, i32 addrspace(1)* %c
+ %c.val = load i32, ptr addrspace(1) %c
%r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
- store i32 %r.val, i32 addrspace(1)* %r
+ store i32 %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 581b73d..8472271 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -10,9 +10,9 @@
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
+define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
- store i32 %tmp0, i32 addrspace(1)* %out
+ store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -23,9 +23,9 @@
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}}
-define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
+define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
- store i32 %tmp0, i32 addrspace(1)* %out
+ store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -38,20 +38,20 @@
; GFX8: s_nop 1
; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
-define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
+define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
- %tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp
- %tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4
+ %tmp2 = getelementptr inbounds [448 x i32], ptr addrspace(3) @0, i32 0, i32 %tmp
+ %tmp3 = load i32, ptr addrspace(3) %tmp2, align 4
fence syncscope("workgroup-one-as") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup-one-as") acquire
%tmp4 = add nsw i32 %tmp3, %tmp3
%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
%tmp6 = add nsw i32 %tmp5, %tmp4
- %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
- store i32 %tmp6, i32* %tmp7, align 4
+ %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
+ store i32 %tmp6, ptr %tmp7, align 4
ret void
}
@@ -59,12 +59,12 @@
; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
+define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
- %load = load i64, i64 addrspace(1)* %gep
+ %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+ %load = load i64, ptr addrspace(1) %gep
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
- store i64 %tmp0, i64 addrspace(1)* %gep
+ store i64 %tmp0, ptr addrspace(1) %gep
ret void
}
@@ -79,12 +79,12 @@
; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) {
+define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
- %load = load i64, i64 addrspace(1)* %gep
+ %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+ %load = load i64, ptr addrspace(1) %gep
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
- store i64 %tmp0, i64 addrspace(1)* %gep
+ store i64 %tmp0, ptr addrspace(1) %gep
ret void
}
@@ -97,9 +97,9 @@
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_imm_src_test(i64 addrspace(1)* %out, i64 %in1) {
+define amdgpu_kernel void @update_dpp64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
- store i64 %tmp0, i64 addrspace(1)* %out
+ store i64 %tmp0, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 224a7bb..1347ef2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -22,16 +22,16 @@
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4
-; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4
+; OPT-W32: store i32 32, ptr addrspace(1) %arg, align 4
+; OPT-W64: store i32 64, ptr addrspace(1) %arg, align 4
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT-WXX: store i32 %tmp, i32 addrspace(1)* %arg, align 4
+; OPT-WXX: store i32 %tmp, ptr addrspace(1) %arg, align 4
; OPT-NEXT: ret void
-define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
- store i32 %tmp, i32 addrspace(1)* %arg, align 4
+ store i32 %tmp, ptr addrspace(1) %arg, align 4
ret void
}
@@ -43,20 +43,20 @@
; GCN-NOT: cndmask
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]
-; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4
-; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4
+; OPT-W32: store i32 1, ptr addrspace(1) %arg, align 4
+; OPT-W64: store i32 2, ptr addrspace(1) %arg, align 4
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
; OPT-WXX: %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT-WXX: store i32 %tmp2, i32 addrspace(1)* %arg
+; OPT-WXX: store i32 %tmp2, ptr addrspace(1) %arg
; OPT-NEXT: ret void
-define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
%tmp2 = select i1 %tmp1, i32 2, i32 1
- store i32 %tmp2, i32 addrspace(1)* %arg
+ store i32 %tmp2, ptr addrspace(1) %arg
ret void
}
@@ -67,17 +67,17 @@
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
; OPT-WXX: bb3:
-; OPT-W64: store i32 1, i32 addrspace(1)* %arg, align 4
+; OPT-W64: store i32 1, ptr addrspace(1) %arg, align 4
; OPT-NEXT: ret void
-define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
br i1 %tmp1, label %bb2, label %bb3
bb2: ; preds = %bb
- store i32 1, i32 addrspace(1)* %arg, align 4
+ store i32 1, ptr addrspace(1) %arg, align 4
br label %bb3
bb3: ; preds = %bb2, %bb
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index c63a634..e6af6e8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -10,7 +10,7 @@
; @llvm.amdgcn.wmma.f32.16x16x16.f16
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_f32_16x16x16_f16:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -21,13 +21,13 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
- store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+ store <8 x float> %res, ptr addrspace(1) %out, align 32
ret void
}
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_f32_16x16x16_bf16:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -38,13 +38,13 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
- store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+ store <8 x float> %res, ptr addrspace(1) %out, align 32
ret void
}
; @llvm.amdgcn.wmma.f16.16x16x16.f16
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -55,11 +55,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
- store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
+ store <16 x half> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -70,13 +70,13 @@
; W32-NEXT: s_endpgm
bb:
%res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
- store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
+ store <16 x half> %res, ptr addrspace(1) %out, align 32
ret void
}
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -87,11 +87,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
- store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
+ store <16 x i16> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -102,13 +102,13 @@
; W32-NEXT: s_endpgm
bb:
%res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
- store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
+ store <16 x i16> %res, ptr addrspace(1) %out, align 32
ret void
}
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15]
@@ -119,11 +119,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
@@ -134,11 +134,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
@@ -149,11 +149,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
@@ -164,11 +164,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] clamp
@@ -179,11 +179,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
@@ -194,11 +194,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
@@ -209,11 +209,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
@@ -224,13 +224,13 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
@@ -241,11 +241,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
@@ -256,11 +256,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
@@ -271,11 +271,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
@@ -286,12 +286,12 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
@@ -302,11 +302,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
@@ -317,11 +317,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
@@ -332,11 +332,11 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
@@ -347,7 +347,7 @@
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
- store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+ store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 6db0205..dd6b1a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -10,7 +10,7 @@
; @llvm.amdgcn.wmma.f32.16x16x16.f16
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f32_16x16x16_f16:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -19,13 +19,13 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
- store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+ store <4 x float> %res, ptr addrspace(1) %out, align 16
ret void
}
; @llvm.amdgcn.wmma.f32.16x16x16.bf16
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -34,13 +34,13 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
- store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+ store <4 x float> %res, ptr addrspace(1) %out, align 16
ret void
}
; @llvm.amdgcn.wmma.f16.16x16x16.f16
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -49,11 +49,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
- store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
+ store <8 x half> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -62,13 +62,13 @@
; W64-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
- store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
+ store <8 x half> %res, ptr addrspace(1) %out, align 16
ret void
}
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -77,11 +77,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
- store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
+ store <8 x i16> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -90,13 +90,13 @@
; W64-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
- store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
+ store <8 x i16> %res, ptr addrspace(1) %out, align 16
ret void
}
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
@@ -105,12 +105,12 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
@@ -119,11 +119,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
@@ -132,11 +132,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
@@ -145,11 +145,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
@@ -158,11 +158,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
@@ -171,11 +171,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
@@ -184,11 +184,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
@@ -197,13 +197,13 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
; @llvm.amdgcn.wmma.i32.16x16x16.iu4
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
@@ -212,11 +212,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
@@ -225,11 +225,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
@@ -238,11 +238,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
@@ -251,11 +251,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
@@ -264,11 +264,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
@@ -277,11 +277,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
@@ -290,11 +290,11 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
@@ -303,7 +303,7 @@
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
- store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+ store <4 x i32> %res, ptr addrspace(1) %out, align 16
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
index 6d7ce55..3eec083 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -34,9 +34,9 @@
; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define amdgpu_kernel void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
%id = call i32 @llvm.amdgcn.workgroup.id.x()
- store i32 %id, i32 addrspace(1)* %out
+ store i32 %id, ptr addrspace(1) %out
ret void
}
@@ -61,9 +61,9 @@
; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define amdgpu_kernel void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
%id = call i32 @llvm.amdgcn.workgroup.id.y()
- store i32 %id, i32 addrspace(1)* %out
+ store i32 %id, ptr addrspace(1) %out
ret void
}
@@ -96,9 +96,9 @@
; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define amdgpu_kernel void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
%id = call i32 @llvm.amdgcn.workgroup.id.z()
- store i32 %id, i32 addrspace(1)* %out
+ store i32 %id, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index f41a184..0d6c5cb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -22,9 +22,9 @@
; ALL: {{buffer|flat|global}}_store_{{dword|b32}} {{.*}}v0
; PACKED-TID: .amdhsa_system_vgpr_workitem_id 0
-define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
- store i32 %id, i32 addrspace(1)* %out
+ store i32 %id, ptr addrspace(1) %out
ret void
}
@@ -40,9 +40,9 @@
; PACKED-TID: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
; PACKED-TID: {{buffer|flat|global}}_store_{{dword|b32}} {{.*}}[[ID]]
; PACKED-TID: .amdhsa_system_vgpr_workitem_id 1
-define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.y()
- store i32 %id, i32 addrspace(1)* %out
+ store i32 %id, ptr addrspace(1) %out
ret void
}
@@ -58,9 +58,9 @@
; PACKED-TID: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
; PACKED-TID: {{buffer|flat|global}}_store_{{dword|b32}} {{.*}}[[ID]]
; PACKED-TID: .amdhsa_system_vgpr_workitem_id 2
-define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_z(ptr addrspace(1) %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.z()
- store i32 %id, i32 addrspace(1)* %out
+ store i32 %id, ptr addrspace(1) %out
ret void
}
@@ -76,13 +76,13 @@
; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 {
+define amdgpu_kernel void @test_reqd_workgroup_size_x_only(ptr %out) !reqd_work_group_size !0 {
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
- store volatile i32 %id.x, i32* %out
- store volatile i32 %id.y, i32* %out
- store volatile i32 %id.z, i32* %out
+ store volatile i32 %id.x, ptr %out
+ store volatile i32 %id.y, ptr %out
+ store volatile i32 %id.z, ptr %out
ret void
}
@@ -98,13 +98,13 @@
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
; ALL: flat_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
-define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 {
+define amdgpu_kernel void @test_reqd_workgroup_size_y_only(ptr %out) !reqd_work_group_size !1 {
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
- store volatile i32 %id.x, i32* %out
- store volatile i32 %id.y, i32* %out
- store volatile i32 %id.z, i32* %out
+ store volatile i32 %id.x, ptr %out
+ store volatile i32 %id.y, ptr %out
+ store volatile i32 %id.z, ptr %out
ret void
}
@@ -119,13 +119,13 @@
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
-define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 {
+define amdgpu_kernel void @test_reqd_workgroup_size_z_only(ptr %out) !reqd_work_group_size !2 {
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
- store volatile i32 %id.x, i32* %out
- store volatile i32 %id.y, i32* %out
- store volatile i32 %id.z, i32* %out
+ store volatile i32 %id.x, ptr %out
+ store volatile i32 %id.y, ptr %out
+ store volatile i32 %id.z, ptr %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index bf75801..3795166 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -8,33 +8,33 @@
; CHECK-LABEL: {{^}}test_writelane_sreg:
; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
- %oldval = load i32, i32 addrspace(1)* %out
+define amdgpu_kernel void @test_writelane_sreg(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
+ %oldval = load i32, ptr addrspace(1) %out
%writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
- store i32 %writelane, i32 addrspace(1)* %out, align 4
+ store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
- %oldval = load i32, i32 addrspace(1)* %out
+define amdgpu_kernel void @test_writelane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+ %oldval = load i32, ptr addrspace(1) %out
%writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
- store i32 %writelane, i32 addrspace(1)* %out, align 4
+ store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
-define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_writelane_vreg_lane(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
- %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
- %oldval = load i32, i32 addrspace(1)* %out
+ %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
+ %args = load <2 x i32>, ptr addrspace(1) %gep.in
+ %oldval = load i32, ptr addrspace(1) %out
%lane = extractelement <2 x i32> %args, i32 1
%writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
- store i32 %writelane, i32 addrspace(1)* %out, align 4
+ store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
@@ -43,20 +43,20 @@
; CIGFX9: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
; GFX10: v_writelane_b32 v{{[0-9]+}}, m0, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
- %oldval = load i32, i32 addrspace(1)* %out
+define amdgpu_kernel void @test_writelane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 {
+ %oldval = load i32, ptr addrspace(1) %out
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
- store i32 %writelane, i32 addrspace(1)* %out, align 4
+ store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
; CHECK-LABEL: {{^}}test_writelane_imm:
; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
-define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
- %oldval = load i32, i32 addrspace(1)* %out
+define amdgpu_kernel void @test_writelane_imm(ptr addrspace(1) %out, i32 %src0) #1 {
+ %oldval = load i32, ptr addrspace(1) %out
%writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
- store i32 %writelane, i32 addrspace(1)* %out, align 4
+ store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
@@ -64,9 +64,9 @@
; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
%writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
- store i32 %writelane, i32 addrspace(1)* %out, align 4
+ store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}
@@ -74,9 +74,9 @@
; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_writelane_imm_oldval(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 {
%writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
- store i32 %writelane, i32 addrspace(1)* %out, align 4
+ store i32 %writelane, ptr addrspace(1) %out, align 4
ret void
}