; llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -mtriple=amdgcn-- -S -passes=separate-const-offset-from-gep,gvn -reassociate-geps-verify-no-dead-code < %s | FileCheck -check-prefix=IR %s

 ; Data layout shared by every function in this file.  Per the string below,
 ; the default address space and address spaces 3 and 5 use 32-bit pointers,
 ; while address spaces 1, 2, 4 and 24 use 64-bit pointers.
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

 ; 4096 x 32 table of floats in address space 4, constant and all zero; each
 ; row is 128 bytes.  Read by @sum_of_array.
 @array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4

 ; Sums @array[x][y], @array[x][y+1], @array[x+1][y] and @array[x+1][y+1] and
 ; stores the result to %output.  The checks expect one variable-index GEP for
 ; [x][y] plus three i8 GEPs off it at constant byte offsets 4 (one float),
 ; 128 (one 32-float row) and 132 (both).  No loads remain in the expected
 ; output and the stored value is the constant 0.0 (presumably because @array
 ; is a constant zeroinitializer -- confirm against GVN's load folding).
 define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
 ; IR-LABEL: define amdgpu_kernel void @sum_of_array(
 ; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) {
 ; IR-NEXT:    [[TMP:%.*]] = sext i32 [[Y]] to i64
 ; IR-NEXT:    [[TMP1:%.*]] = sext i32 [[X]] to i64
 ; IR-NEXT:    [[TMP2:%.*]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 [[TMP1]], i64 [[TMP]]
 ; IR-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 4
 ; IR-NEXT:    [[TMP144:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 128
 ; IR-NEXT:    [[TMP187:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 132
 ; IR-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
 ; IR-NEXT:    ret void
 ;
   ; Element [x][y]: the base address the other three are expected to hang off.
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp1, i64 %tmp
   %tmp4 = load float, ptr addrspace(4) %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00
   ; Element [x][y+1]: the +1 is added in i32 before the sign extension.
   %tmp6 = add i32 %y, 1
   %tmp7 = sext i32 %tmp6 to i64
   %tmp8 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp1, i64 %tmp7
   %tmp10 = load float, ptr addrspace(4) %tmp8, align 4
   %tmp11 = fadd float %tmp5, %tmp10
   ; Element [x+1][y].
   %tmp12 = add i32 %x, 1
   %tmp13 = sext i32 %tmp12 to i64
   %tmp14 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp13, i64 %tmp
   %tmp16 = load float, ptr addrspace(4) %tmp14, align 4
   %tmp17 = fadd float %tmp11, %tmp16
   ; Element [x+1][y+1]: reuses both incremented indices.
   %tmp18 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp13, i64 %tmp7
   %tmp20 = load float, ptr addrspace(4) %tmp18, align 4
   %tmp21 = fadd float %tmp17, %tmp20
   store float %tmp21, ptr addrspace(1) %output, align 4
   ret void
 }

 ; 4096 x 4 table of floats in address space 4, constant and all zero; each row
 ; is 16 bytes.  Read by @sum_of_array_over_max_mubuf_offset.
 @array2 = internal addrspace(4) constant [4096 x [4 x float]] zeroinitializer, align 4

 ; Some of the indices go over the maximum mubuf offset, so don't split them.

 ; Sums @array2[x][y], [x][y+255], [x+256][y] and [x+256][y+255] and stores the
 ; result to %output.  The checks expect only the [x][y+255] address to be
 ; rewritten, as an i8 GEP at byte offset 1020 (255 floats) from the [x][y]
 ; GEP.  The two addresses that use x+256 (4096 bytes of rows) keep their
 ; original inbounds GEPs with the sign-extended add as an index.  The expected
 ; output contains no loads and stores the constant 0.0.
 define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
 ; IR-LABEL: define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(
 ; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) {
 ; IR-NEXT:    [[TMP:%.*]] = sext i32 [[Y]] to i64
 ; IR-NEXT:    [[TMP1:%.*]] = sext i32 [[X]] to i64
 ; IR-NEXT:    [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP1]], i64 [[TMP]]
 ; IR-NEXT:    [[TMP6:%.*]] = add i32 [[Y]], 255
 ; IR-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
 ; IR-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 1020
 ; IR-NEXT:    [[TMP12:%.*]] = add i32 [[X]], 256
 ; IR-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
 ; IR-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP]]
 ; IR-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP7]]
 ; IR-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
 ; IR-NEXT:    ret void
 ;
   ; Element [x][y]: the base address.
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp1, i64 %tmp
   %tmp4 = load float, ptr addrspace(4) %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00
   ; Element [x][y+255]: 1020 bytes past the base; expected to be split.
   %tmp6 = add i32 %y, 255
   %tmp7 = sext i32 %tmp6 to i64
   %tmp8 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp1, i64 %tmp7
   %tmp10 = load float, ptr addrspace(4) %tmp8, align 4
   %tmp11 = fadd float %tmp5, %tmp10
   ; Element [x+256][y]: 4096 bytes past the base; expected to stay unsplit.
   %tmp12 = add i32 %x, 256
   %tmp13 = sext i32 %tmp12 to i64
   %tmp14 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp13, i64 %tmp
   %tmp16 = load float, ptr addrspace(4) %tmp14, align 4
   %tmp17 = fadd float %tmp11, %tmp16
   ; Element [x+256][y+255]: also expected to stay unsplit.
   %tmp18 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp13, i64 %tmp7
   %tmp20 = load float, ptr addrspace(4) %tmp18, align 4
   %tmp21 = fadd float %tmp17, %tmp20
   store float %tmp21, ptr addrspace(1) %output, align 4
   ret void
 }


 ; 4096 x 4 table of floats in address space 3: a non-constant global with an
 ; undef initializer.  Read by @sum_of_lds_array_over_max_mubuf_offset.
 @lds_array = internal addrspace(3) global [4096 x [4 x float]] undef, align 4

 ; DS instructions have a larger immediate offset, so make sure these are OK.
 ; Sums @lds_array[x][y], [x][y+255], [x+4032][y] and [x+4032][y+255] using i32
 ; indices and stores the result to %output.  The checks expect all three
 ; offset addresses to become i8 GEPs from the [x][y] GEP at byte offsets 1020
 ; (255 floats), 64512 (4032 rows of 16 bytes) and 65532 (both), and expect all
 ; four loads and the fadd chain to remain in the output.
 define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
 ; IR-LABEL: define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(
 ; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) {
 ; IR-NEXT:    [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 [[X]], i32 [[Y]]
 ; IR-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4
 ; IR-NEXT:    [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
 ; IR-NEXT:    [[TMP82:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 1020
 ; IR-NEXT:    [[TMP10:%.*]] = load float, ptr addrspace(3) [[TMP82]], align 4
 ; IR-NEXT:    [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]]
 ; IR-NEXT:    [[TMP144:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 64512
 ; IR-NEXT:    [[TMP16:%.*]] = load float, ptr addrspace(3) [[TMP144]], align 4
 ; IR-NEXT:    [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]]
 ; IR-NEXT:    [[TMP187:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 65532
 ; IR-NEXT:    [[TMP20:%.*]] = load float, ptr addrspace(3) [[TMP187]], align 4
 ; IR-NEXT:    [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]]
 ; IR-NEXT:    store float [[TMP21]], ptr addrspace(1) [[OUTPUT]], align 4
 ; IR-NEXT:    ret void
 ;
   ; Element [x][y]: the base address; indices are used as i32 directly.
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %x, i32 %y
   %tmp4 = load float, ptr addrspace(3) %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00
   ; Element [x][y+255]: 1020 bytes past the base.
   %tmp6 = add i32 %y, 255
   %tmp8 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %x, i32 %tmp6
   %tmp10 = load float, ptr addrspace(3) %tmp8, align 4
   %tmp11 = fadd float %tmp5, %tmp10
   ; Element [x+4032][y]: 64512 bytes past the base.
   %tmp12 = add i32 %x, 4032
   %tmp14 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %tmp12, i32 %y
   %tmp16 = load float, ptr addrspace(3) %tmp14, align 4
   %tmp17 = fadd float %tmp11, %tmp16
   ; Element [x+4032][y+255]: 65532 bytes past the base.
   %tmp18 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %tmp12, i32 %tmp6
   %tmp20 = load float, ptr addrspace(3) %tmp18, align 4
   %tmp21 = fadd float %tmp17, %tmp20
   store float %tmp21, ptr addrspace(1) %output, align 4
   ret void
 }

 ; Checks that metadata survives the GEP rewrite.  Both input GEPs carry
 ; !amdgpu.uniform and both loads carry !invariant.load.  The second GEP's
 ; index is "or disjoint (shl %23, 2), 3"; the checks expect it to be split into
 ; a GEP indexed by the sign-extended shl plus an i8 GEP at byte offset 48
 ; (3 elements of <4 x i32>), with !amdgpu.uniform on both new GEPs and
 ; !invariant.load still on both loads.  The i32 indices are expected to be
 ; sign-extended to i64 for these address space 4 pointers.
 define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata(ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
 ; IR-LABEL: define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata(
 ; IR-SAME: ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP0:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP1:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP2:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP3:%.*]], float inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], <2 x i32> [[TMP6:%.*]], <2 x i32> [[TMP7:%.*]], <2 x i32> [[TMP8:%.*]], <3 x i32> [[TMP9:%.*]], <2 x i32> [[TMP10:%.*]], <2 x i32> [[TMP11:%.*]], <2 x i32> [[TMP12:%.*]], float [[TMP13:%.*]], float [[TMP14:%.*]], float [[TMP15:%.*]], float [[TMP16:%.*]], float [[TMP17:%.*]], i32 [[TMP18:%.*]], i32 [[TMP19:%.*]], float [[TMP20:%.*]], i32 [[TMP21:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IR-NEXT:  main_body:
 ; IR-NEXT:    [[TMP22:%.*]] = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 [[TMP5]]) #[[ATTR3:[0-9]+]]
 ; IR-NEXT:    [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
 ; IR-NEXT:    [[TMP24:%.*]] = shl i32 [[TMP23]], 1
 ; IR-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[TMP24]] to i64
 ; IR-NEXT:    [[TMP25:%.*]] = getelementptr [0 x <8 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[IDXPROM1]], !amdgpu.uniform [[META0:![0-9]+]]
 ; IR-NEXT:    [[TMP26:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP25]], align 32, !invariant.load [[META0]]
 ; IR-NEXT:    [[TMP27:%.*]] = shl i32 [[TMP23]], 2
 ; IR-NEXT:    [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
 ; IR-NEXT:    [[TMP29:%.*]] = getelementptr [0 x <4 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[TMP28]], !amdgpu.uniform [[META0]]
 ; IR-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP29]], i64 48, !amdgpu.uniform [[META0]]
 ; IR-NEXT:    [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP30]], align 16, !invariant.load [[META0]]
 ; IR-NEXT:    [[TMP32:%.*]] = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> [[TMP26]], <4 x i32> [[TMP31]], i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #[[ATTR3]]
 ; IR-NEXT:    [[TMP33:%.*]] = extractelement <4 x float> [[TMP32]], i32 0
 ; IR-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP32]], i32 1
 ; IR-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP32]], i32 2
 ; IR-NEXT:    [[TMP36:%.*]] = extractelement <4 x float> [[TMP32]], i32 3
 ; IR-NEXT:    [[TMP37:%.*]] = bitcast float [[TMP4]] to i32
 ; IR-NEXT:    [[TMP38:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 [[TMP37]], 4
 ; IR-NEXT:    [[TMP39:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP38]], float [[TMP33]], 5
 ; IR-NEXT:    [[TMP40:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP39]], float [[TMP34]], 6
 ; IR-NEXT:    [[TMP41:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP40]], float [[TMP35]], 7
 ; IR-NEXT:    [[TMP42:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP41]], float [[TMP36]], 8
 ; IR-NEXT:    [[TMP43:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP42]], float [[TMP20]], 19
 ; IR-NEXT:    ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP43]]
 ;
 main_body:
   %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
   %23 = bitcast float %22 to i32
   ; First descriptor load: index is %23 * 2 into an array of <8 x i32>.
   %24 = shl i32 %23, 1
   %25 = getelementptr [0 x <8 x i32>], ptr addrspace(4) %1, i32 0, i32 %24, !amdgpu.uniform !0
   %26 = load <8 x i32>, ptr addrspace(4) %25, align 32, !invariant.load !0
   ; Second descriptor load: index is (%23 * 4) | 3 into an array of <4 x i32>;
   ; the "disjoint" flag marks the or as having no common set bits, so the 3
   ; acts as an added constant.
   %27 = shl i32 %23, 2
   %28 = or disjoint i32 %27, 3
   %29 = getelementptr [0 x <4 x i32>], ptr addrspace(4) %1, i32 0, i32 %28, !amdgpu.uniform !0
   %30 = load <4 x i32>, ptr addrspace(4) %29, align 16, !invariant.load !0
   %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %30, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
   %32 = extractelement <4 x float> %31, i32 0
   %33 = extractelement <4 x float> %31, i32 1
   %34 = extractelement <4 x float> %31, i32 2
   %35 = extractelement <4 x float> %31, i32 3
   ; Pack the result struct: argument %4 (as bits) in field 4, the four sampled
   ; components in fields 5-8, and argument %20 in field 19.
   %36 = bitcast float %4 to i32
   %37 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %36, 4
   %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %37, float %32, 5
   %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 6
   %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 7
   %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 8
   %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %20, 19
   ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
 }

 ; Intrinsics called by @keep_metadata.
 ; Function Attrs: nounwind readnone speculatable
 declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6

 ; Function Attrs: nounwind readonly
 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7


 ; Empty metadata node used as the operand of every !amdgpu.uniform and
 ; !invariant.load attachment in @keep_metadata.
 !0 = !{}

 ; Attribute groups: #5 is on the @keep_metadata definition, #6 and #7 are on
 ; the two intrinsic declarations, and #8 is on the two intrinsic call sites.
 attributes #5 = { "InitialPSInputAddr"="45175" }
 attributes #6 = { nounwind readnone speculatable }
 attributes #7 = { nounwind readonly }
 attributes #8 = { nounwind readnone }
 ;.
 ; IR: [[META0]] = !{}
 ;.
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
	; RUN: opt -mtriple=amdgcn-- -S -passes=separate-const-offset-from-gep,gvn -reassociate-geps-verify-no-dead-code < %s | FileCheck -check-prefix=IR %s

	; NOTE(review): the definitions from here on also appear earlier in this
	; file; confirm whether this second copy is intentional, since one module
	; cannot define the same global or function twice.
	;
	; Data layout shared by every function below.  Per the string, the default
	; address space and address spaces 3 and 5 use 32-bit pointers, while address
	; spaces 1, 2, 4 and 24 use 64-bit pointers.
	target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

	; 4096 x 32 table of floats in address space 4, constant and all zero; each
	; row is 128 bytes.  Read by @sum_of_array.
	@array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4

	; Sums @array[x][y], @array[x][y+1], @array[x+1][y] and @array[x+1][y+1] and
	; stores the result to %output.  The checks expect one variable-index GEP for
	; [x][y] plus three i8 GEPs off it at constant byte offsets 4 (one float),
	; 128 (one 32-float row) and 132 (both).  No loads remain in the expected
	; output and the stored value is the constant 0.0 (presumably because @array
	; is a constant zeroinitializer -- confirm against GVN's load folding).
	define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
	; IR-LABEL: define amdgpu_kernel void @sum_of_array(
	; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) {
	; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64
	; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
	; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 [[TMP1]], i64 [[TMP]]
	; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 4
	; IR-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 128
	; IR-NEXT: [[TMP187:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 132
	; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
	; IR-NEXT: ret void
	;
	; Element [x][y]: the base address the other three are expected to hang off.
	%tmp = sext i32 %y to i64
	%tmp1 = sext i32 %x to i64
	%tmp2 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp1, i64 %tmp
	%tmp4 = load float, ptr addrspace(4) %tmp2, align 4
	%tmp5 = fadd float %tmp4, 0.000000e+00
	; Element [x][y+1]: the +1 is added in i32 before the sign extension.
	%tmp6 = add i32 %y, 1
	%tmp7 = sext i32 %tmp6 to i64
	%tmp8 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp1, i64 %tmp7
	%tmp10 = load float, ptr addrspace(4) %tmp8, align 4
	%tmp11 = fadd float %tmp5, %tmp10
	; Element [x+1][y].
	%tmp12 = add i32 %x, 1
	%tmp13 = sext i32 %tmp12 to i64
	%tmp14 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp13, i64 %tmp
	%tmp16 = load float, ptr addrspace(4) %tmp14, align 4
	%tmp17 = fadd float %tmp11, %tmp16
	; Element [x+1][y+1]: reuses both incremented indices.
	%tmp18 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp13, i64 %tmp7
	%tmp20 = load float, ptr addrspace(4) %tmp18, align 4
	%tmp21 = fadd float %tmp17, %tmp20
	store float %tmp21, ptr addrspace(1) %output, align 4
	ret void
	}

	; 4096 x 4 table of floats in address space 4, constant and all zero; each row
	; is 16 bytes.  Read by @sum_of_array_over_max_mubuf_offset.
	@array2 = internal addrspace(4) constant [4096 x [4 x float]] zeroinitializer, align 4

	; Some of the indices go over the maximum mubuf offset, so don't split them.

	; Sums @array2[x][y], [x][y+255], [x+256][y] and [x+256][y+255] and stores the
	; result to %output.  The checks expect only the [x][y+255] address to be
	; rewritten, as an i8 GEP at byte offset 1020 (255 floats) from the [x][y]
	; GEP.  The two addresses that use x+256 (4096 bytes of rows) keep their
	; original inbounds GEPs with the sign-extended add as an index.  The expected
	; output contains no loads and stores the constant 0.0.
	define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
	; IR-LABEL: define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(
	; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) {
	; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64
	; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
	; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP1]], i64 [[TMP]]
	; IR-NEXT: [[TMP6:%.*]] = add i32 [[Y]], 255
	; IR-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
	; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 1020
	; IR-NEXT: [[TMP12:%.*]] = add i32 [[X]], 256
	; IR-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
	; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP]]
	; IR-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP7]]
	; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
	; IR-NEXT: ret void
	;
	; Element [x][y]: the base address.
	%tmp = sext i32 %y to i64
	%tmp1 = sext i32 %x to i64
	%tmp2 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp1, i64 %tmp
	%tmp4 = load float, ptr addrspace(4) %tmp2, align 4
	%tmp5 = fadd float %tmp4, 0.000000e+00
	; Element [x][y+255]: 1020 bytes past the base; expected to be split.
	%tmp6 = add i32 %y, 255
	%tmp7 = sext i32 %tmp6 to i64
	%tmp8 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp1, i64 %tmp7
	%tmp10 = load float, ptr addrspace(4) %tmp8, align 4
	%tmp11 = fadd float %tmp5, %tmp10
	; Element [x+256][y]: 4096 bytes past the base; expected to stay unsplit.
	%tmp12 = add i32 %x, 256
	%tmp13 = sext i32 %tmp12 to i64
	%tmp14 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp13, i64 %tmp
	%tmp16 = load float, ptr addrspace(4) %tmp14, align 4
	%tmp17 = fadd float %tmp11, %tmp16
	; Element [x+256][y+255]: also expected to stay unsplit.
	%tmp18 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp13, i64 %tmp7
	%tmp20 = load float, ptr addrspace(4) %tmp18, align 4
	%tmp21 = fadd float %tmp17, %tmp20
	store float %tmp21, ptr addrspace(1) %output, align 4
	ret void
	}


	; 4096 x 4 table of floats in address space 3: a non-constant global with an
	; undef initializer.  Read by @sum_of_lds_array_over_max_mubuf_offset.
	@lds_array = internal addrspace(3) global [4096 x [4 x float]] undef, align 4

	; DS instructions have a larger immediate offset, so make sure these are OK.
	; Sums @lds_array[x][y], [x][y+255], [x+4032][y] and [x+4032][y+255] using i32
	; indices and stores the result to %output.  The checks expect all three
	; offset addresses to become i8 GEPs from the [x][y] GEP at byte offsets 1020
	; (255 floats), 64512 (4032 rows of 16 bytes) and 65532 (both), and expect all
	; four loads and the fadd chain to remain in the output.
	define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
	; IR-LABEL: define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(
	; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) captures(none) [[OUTPUT:%.*]]) {
	; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 [[X]], i32 [[Y]]
	; IR-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4
	; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
	; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 1020
	; IR-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(3) [[TMP82]], align 4
	; IR-NEXT: [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]]
	; IR-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 64512
	; IR-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(3) [[TMP144]], align 4
	; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]]
	; IR-NEXT: [[TMP187:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 65532
	; IR-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(3) [[TMP187]], align 4
	; IR-NEXT: [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]]
	; IR-NEXT: store float [[TMP21]], ptr addrspace(1) [[OUTPUT]], align 4
	; IR-NEXT: ret void
	;
	; Element [x][y]: the base address; indices are used as i32 directly.
	%tmp2 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %x, i32 %y
	%tmp4 = load float, ptr addrspace(3) %tmp2, align 4
	%tmp5 = fadd float %tmp4, 0.000000e+00
	; Element [x][y+255]: 1020 bytes past the base.
	%tmp6 = add i32 %y, 255
	%tmp8 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %x, i32 %tmp6
	%tmp10 = load float, ptr addrspace(3) %tmp8, align 4
	%tmp11 = fadd float %tmp5, %tmp10
	; Element [x+4032][y]: 64512 bytes past the base.
	%tmp12 = add i32 %x, 4032
	%tmp14 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %tmp12, i32 %y
	%tmp16 = load float, ptr addrspace(3) %tmp14, align 4
	%tmp17 = fadd float %tmp11, %tmp16
	; Element [x+4032][y+255]: 65532 bytes past the base.
	%tmp18 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %tmp12, i32 %tmp6
	%tmp20 = load float, ptr addrspace(3) %tmp18, align 4
	%tmp21 = fadd float %tmp17, %tmp20
	store float %tmp21, ptr addrspace(1) %output, align 4
	ret void
	}

	; Checks that metadata survives the GEP rewrite.  Both input GEPs carry
	; !amdgpu.uniform and both loads carry !invariant.load.  The second GEP's
	; index is "or disjoint (shl %23, 2), 3"; the checks expect it to be split into
	; a GEP indexed by the sign-extended shl plus an i8 GEP at byte offset 48
	; (3 elements of <4 x i32>), with !amdgpu.uniform on both new GEPs and
	; !invariant.load still on both loads.  The i32 indices are expected to be
	; sign-extended to i64 for these address space 4 pointers.
	define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata(ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
	; IR-LABEL: define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata(
	; IR-SAME: ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP0:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP1:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP2:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP3:%.*]], float inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], <2 x i32> [[TMP6:%.*]], <2 x i32> [[TMP7:%.*]], <2 x i32> [[TMP8:%.*]], <3 x i32> [[TMP9:%.*]], <2 x i32> [[TMP10:%.*]], <2 x i32> [[TMP11:%.*]], <2 x i32> [[TMP12:%.*]], float [[TMP13:%.*]], float [[TMP14:%.*]], float [[TMP15:%.*]], float [[TMP16:%.*]], float [[TMP17:%.*]], i32 [[TMP18:%.*]], i32 [[TMP19:%.*]], float [[TMP20:%.*]], i32 [[TMP21:%.*]]) #[[ATTR0:[0-9]+]] {
	; IR-NEXT: main_body:
	; IR-NEXT: [[TMP22:%.*]] = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 [[TMP5]]) #[[ATTR3:[0-9]+]]
	; IR-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
	; IR-NEXT: [[TMP24:%.*]] = shl i32 [[TMP23]], 1
	; IR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP24]] to i64
	; IR-NEXT: [[TMP25:%.*]] = getelementptr [0 x <8 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[IDXPROM1]], !amdgpu.uniform [[META0:![0-9]+]]
	; IR-NEXT: [[TMP26:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP25]], align 32, !invariant.load [[META0]]
	; IR-NEXT: [[TMP27:%.*]] = shl i32 [[TMP23]], 2
	; IR-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
	; IR-NEXT: [[TMP29:%.*]] = getelementptr [0 x <4 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[TMP28]], !amdgpu.uniform [[META0]]
	; IR-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP29]], i64 48, !amdgpu.uniform [[META0]]
	; IR-NEXT: [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP30]], align 16, !invariant.load [[META0]]
	; IR-NEXT: [[TMP32:%.*]] = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> [[TMP26]], <4 x i32> [[TMP31]], i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #[[ATTR3]]
	; IR-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP32]], i32 0
	; IR-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP32]], i32 1
	; IR-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP32]], i32 2
	; IR-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP32]], i32 3
	; IR-NEXT: [[TMP37:%.*]] = bitcast float [[TMP4]] to i32
	; IR-NEXT: [[TMP38:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 [[TMP37]], 4
	; IR-NEXT: [[TMP39:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP38]], float [[TMP33]], 5
	; IR-NEXT: [[TMP40:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP39]], float [[TMP34]], 6
	; IR-NEXT: [[TMP41:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP40]], float [[TMP35]], 7
	; IR-NEXT: [[TMP42:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP41]], float [[TMP36]], 8
	; IR-NEXT: [[TMP43:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP42]], float [[TMP20]], 19
	; IR-NEXT: ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP43]]
	;
	main_body:
	%22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
	%23 = bitcast float %22 to i32
	; First descriptor load: index is %23 * 2 into an array of <8 x i32>.
	%24 = shl i32 %23, 1
	%25 = getelementptr [0 x <8 x i32>], ptr addrspace(4) %1, i32 0, i32 %24, !amdgpu.uniform !0
	%26 = load <8 x i32>, ptr addrspace(4) %25, align 32, !invariant.load !0
	; Second descriptor load: index is (%23 * 4) | 3 into an array of <4 x i32>;
	; the "disjoint" flag marks the or as having no common set bits, so the 3
	; acts as an added constant.
	%27 = shl i32 %23, 2
	%28 = or disjoint i32 %27, 3
	%29 = getelementptr [0 x <4 x i32>], ptr addrspace(4) %1, i32 0, i32 %28, !amdgpu.uniform !0
	%30 = load <4 x i32>, ptr addrspace(4) %29, align 16, !invariant.load !0
	%31 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %30, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
	%32 = extractelement <4 x float> %31, i32 0
	%33 = extractelement <4 x float> %31, i32 1
	%34 = extractelement <4 x float> %31, i32 2
	%35 = extractelement <4 x float> %31, i32 3
	; Pack the result struct: argument %4 (as bits) in field 4, the four sampled
	; components in fields 5-8, and argument %20 in field 19.
	%36 = bitcast float %4 to i32
	%37 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %36, 4
	%38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %37, float %32, 5
	%39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 6
	%40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 7
	%41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 8
	%42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %20, 19
	ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
	}

	; Intrinsics called by @keep_metadata.
	; Function Attrs: nounwind readnone speculatable
	declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6

	; Function Attrs: nounwind readonly
	declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7


	; Empty metadata node used as the operand of every !amdgpu.uniform and
	; !invariant.load attachment in @keep_metadata.
	!0 = !{}

	; Attribute groups: #5 is on the @keep_metadata definition, #6 and #7 are on
	; the two intrinsic declarations, and #8 is on the two intrinsic call sites.
	attributes #5 = { "InitialPSInputAddr"="45175" }
	attributes #6 = { nounwind readnone speculatable }
	attributes #7 = { nounwind readonly }
	attributes #8 = { nounwind readnone }
	;.
	; IR: [[META0]] = !{}
	;.