llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/amdgpu-large-lds-offset.ll - llvm-project.git - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=separate-const-offset-from-gep -S %s | FileCheck %s

 ; This test is intended to test that the separate-const-offset-from-gep
 ; pass will still separate offsets even if the base offset is large.

 ; External zero-length byte array in address space 3 (the AMDGPU local/LDS
 ; address space), 16-byte aligned. Both functions in this test form their load
 ; addresses as byte offsets from this symbol.
 @global_smem = external addrspace(3) global [0 x i8], align 16

 ; Large-base case: the load address starts from a GEP that adds 67584 bytes to
 ; @global_smem, then adds a variable index that has the constant 16384 or'ed in
 ; (or disjoint). The expected output keeps the 67584 base GEP as its own
 ; instruction, indexes with the value that lacks the 16384 bit, and re-adds
 ; 16384 in a final trailing GEP.
 define void @large_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
 ; CHECK-LABEL: define void @large_base_offset(
 ; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 15
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP1]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
 ; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i32 [[TMP6]], 16384
 ; CHECK-NEXT:    [[TMP10:%.*]] = lshr exact i32 [[TMP9]], 5
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i32 [[TMP10]], 992
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP8]], i32 [[TMP6]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP12]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP13]], i32 16384
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP14]], align 32
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x half> [[TMP15]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    store <8 x half> [[TMP16]], ptr addrspace(1) [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
   ; Low four bits of %idx, scaled by 1024 (shl 10).
   %idx_low_bits = and i32 %idx, 15
   %row_offset = shl nuw nsw i32 %idx_low_bits, 10
   ; Bit 4 of %idx, moved up to bit 5.
   %idx_bit4 = and i32 %idx, 16
   %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
   ; The two parts occupy different bit positions, hence `or disjoint`.
   %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
   ; Not used by any later instruction in this function; it still appears in
   ; the expected output.
   %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
   ; Base pointer carrying the large constant byte offset.
   %lds_large_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
   ; Variable offset with the constant 16384 folded in; this is the constant the
   ; expected output splits back out into the trailing GEP.
   %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
   ; (offset >> 5) masked with 992 (0x3E0, bits 5..9) gives a second variable
   ; byte offset.
   %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
   %row_alignment = and i32 %aligned_offset, 992
   ; Two chained byte-wise (i8) GEPs: base + offset, then + masked value.
   %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_large_base, i32 %combined_offset_with_stride
   %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
   ; Load 16 halves, keep elements 8..15, and store those 8 halves to %out.
   %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
   %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
   ret void
 }

 ; Small-base case: same address computation as @large_base_offset, but the base
 ; GEP adds only 4 bytes to @global_smem. In the expected output there is no
 ; separate base GEP: the 4 is combined with the extracted 16384 into a single
 ; trailing GEP of 16388, and the rewritten GEPs carry no inbounds/nuw flags.
 define void @small_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
 ; CHECK-LABEL: define void @small_base_offset(
 ; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 15
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP1]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i32 [[TMP6]], 16384
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr exact i32 [[TMP8]], 5
 ; CHECK-NEXT:    [[TMP10:%.*]] = and i32 [[TMP9]], 992
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 [[TMP6]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 [[TMP10]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 16388
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP13]], align 32
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x half> [[TMP14]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    store <8 x half> [[TMP15]], ptr addrspace(1) [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
   ; Low four bits of %idx, scaled by 1024 (shl 10).
   %idx_low_bits = and i32 %idx, 15
   %row_offset = shl nuw nsw i32 %idx_low_bits, 10
   ; Bit 4 of %idx, moved up to bit 5.
   %idx_bit4 = and i32 %idx, 16
   %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
   ; The two parts occupy different bit positions, hence `or disjoint`.
   %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
   ; Not used by any later instruction in this function; it still appears in
   ; the expected output.
   %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
   ; Base pointer carrying only a small constant byte offset (4).
   %lds_small_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 4
   ; Variable offset with the constant 16384 folded in.
   %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
   ; (offset >> 5) masked with 992 (0x3E0, bits 5..9) gives a second variable
   ; byte offset.
   %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
   %row_alignment = and i32 %aligned_offset, 992
   ; Two chained byte-wise (i8) GEPs: base + offset, then + masked value.
   %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_small_base, i32 %combined_offset_with_stride
   %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
   ; Load 16 halves, keep elements 8..15, and store those 8 halves to %out.
   %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
   %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
   ret void
 }
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
	; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=separate-const-offset-from-gep -S %s \| FileCheck %s

	; This test is intended to test that the separate-const-offset-from-gep
	; pass will still separate offsets even if the base offset is large.

	; External zero-length byte array in address space 3 (the AMDGPU local/LDS
	; address space), 16-byte aligned. The functions that follow form their load
	; addresses as byte offsets from this symbol.
	@global_smem = external addrspace(3) global [0 x i8], align 16

	; Large-base case: the load address starts from a GEP that adds 67584 bytes to
	; @global_smem, then adds a variable index that has the constant 16384 or'ed in
	; (or disjoint). The expected output keeps the 67584 base GEP as its own
	; instruction, indexes with the value that lacks the 16384 bit, and re-adds
	; 16384 in a final trailing GEP.
	;
	; NOTE(review): a function with this name is also defined earlier in this
	; file. LLVM IR does not accept two definitions of one symbol in a module, so
	; one of the copies presumably has to go -- confirm which is authoritative.
	define void @large_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
	; CHECK-LABEL: define void @large_base_offset(
	; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
	; CHECK-NEXT: [[ENTRY:.*:]]
	; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
	; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
	; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
	; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
	; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
	; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
	; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
	; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i32 [[TMP6]], 16384
	; CHECK-NEXT: [[TMP10:%.*]] = lshr exact i32 [[TMP9]], 5
	; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 992
	; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP8]], i32 [[TMP6]]
	; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP12]], i32 [[TMP11]]
	; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP13]], i32 16384
	; CHECK-NEXT: [[TMP15:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP14]], align 32
	; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x half> [[TMP15]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	; CHECK-NEXT: store <8 x half> [[TMP16]], ptr addrspace(1) [[TMP0]], align 16
	; CHECK-NEXT: ret void
	;
	entry:
	  ; Low four bits of %idx, scaled by 1024 (shl 10).
	  %idx_low_bits = and i32 %idx, 15
	  %row_offset = shl nuw nsw i32 %idx_low_bits, 10
	  ; Bit 4 of %idx, moved up to bit 5.
	  %idx_bit4 = and i32 %idx, 16
	  %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
	  ; The two parts occupy different bit positions, hence `or disjoint`.
	  %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
	  ; Not used by any later instruction in this function; it still appears in
	  ; the expected output.
	  %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
	  ; Base pointer carrying the large constant byte offset.
	  %lds_large_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
	  ; Variable offset with the constant 16384 folded in; this is the constant the
	  ; expected output splits back out into the trailing GEP.
	  %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
	  ; (offset >> 5) masked with 992 (0x3E0, bits 5..9) gives a second variable
	  ; byte offset.
	  %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
	  %row_alignment = and i32 %aligned_offset, 992
	  ; Two chained byte-wise (i8) GEPs: base + offset, then + masked value.
	  %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_large_base, i32 %combined_offset_with_stride
	  %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
	  ; Load 16 halves, keep elements 8..15, and store those 8 halves to %out.
	  %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
	  %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	  store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
	  ret void
	}

	; Small-base case: the base GEP adds only 4 bytes to @global_smem before the
	; variable index (which has the constant 16384 or'ed in) is applied. In the
	; expected output there is no separate base GEP: the 4 is combined with the
	; extracted 16384 into a single trailing GEP of 16388, and the rewritten GEPs
	; carry no inbounds/nuw flags.
	;
	; NOTE(review): a function with this name is also defined earlier in this
	; file. LLVM IR does not accept two definitions of one symbol in a module, so
	; one of the copies presumably has to go -- confirm which is authoritative.
	define void @small_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
	; CHECK-LABEL: define void @small_base_offset(
	; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
	; CHECK-NEXT: [[ENTRY:.*:]]
	; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
	; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
	; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
	; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
	; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
	; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
	; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i32 [[TMP6]], 16384
	; CHECK-NEXT: [[TMP9:%.*]] = lshr exact i32 [[TMP8]], 5
	; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 992
	; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 [[TMP6]]
	; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 [[TMP10]]
	; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 16388
	; CHECK-NEXT: [[TMP14:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP13]], align 32
	; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x half> [[TMP14]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(1) [[TMP0]], align 16
	; CHECK-NEXT: ret void
	;
	entry:
	  ; Low four bits of %idx, scaled by 1024 (shl 10).
	  %idx_low_bits = and i32 %idx, 15
	  %row_offset = shl nuw nsw i32 %idx_low_bits, 10
	  ; Bit 4 of %idx, moved up to bit 5.
	  %idx_bit4 = and i32 %idx, 16
	  %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
	  ; The two parts occupy different bit positions, hence `or disjoint`.
	  %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
	  ; Not used by any later instruction in this function; it still appears in
	  ; the expected output.
	  %lane_offset = shl nuw nsw i32 %idx_low_bits, 5
	  ; Base pointer carrying only a small constant byte offset (4).
	  %lds_small_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 4
	  ; Variable offset with the constant 16384 folded in.
	  %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
	  ; (offset >> 5) masked with 992 (0x3E0, bits 5..9) gives a second variable
	  ; byte offset.
	  %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
	  %row_alignment = and i32 %aligned_offset, 992
	  ; Two chained byte-wise (i8) GEPs: base + offset, then + masked value.
	  %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_small_base, i32 %combined_offset_with_stride
	  %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
	  ; Load 16 halves, keep elements 8..15, and store those 8 halves to %out.
	  %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
	  %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	  store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
	  ret void
	}