| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=separate-const-offset-from-gep -S %s | FileCheck %s |
| |
| ; This test checks that the separate-const-offset-from-gep pass still splits the |
| ; constant part out of a GEP index when the base pointer already carries a large |
| ; constant offset. |
| |
| @global_smem = external addrspace(3) global [0 x i8], align 16 |
| |
| ; Large-base case: the GEP chain starts from @global_smem + 67584 and its first |
| ; index contains the constant 16384 (merged in with 'or disjoint'). The checks |
| ; expect both variable indices to be applied on top of the large base first, and |
| ; the constant 16384 to be applied by a final, separate GEP. |
| define void @large_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) { |
| ; CHECK-LABEL: define void @large_base_offset( |
| ; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15 |
| ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10 |
| ; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16 |
| ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1 |
| ; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5 |
| ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584 |
| ; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i32 [[TMP6]], 16384 |
| ; CHECK-NEXT: [[TMP10:%.*]] = lshr exact i32 [[TMP9]], 5 |
| ; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 992 |
| ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP8]], i32 [[TMP6]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP12]], i32 [[TMP11]] |
| ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP13]], i32 16384 |
| ; CHECK-NEXT: [[TMP15:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP14]], align 32 |
| ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x half> [[TMP15]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| ; CHECK-NEXT: store <8 x half> [[TMP16]], ptr addrspace(1) [[TMP0]], align 16 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| ; Variable byte offset built from %idx: bits 0-3 shifted left by 10, or'ed with |
| ; bit 4 shifted left by 1. |
| %idx_low_bits = and i32 %idx, 15 |
| %row_offset = shl nuw nsw i32 %idx_low_bits, 10 |
| %idx_bit4 = and i32 %idx, 16 |
| %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1 |
| %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset |
| ; NOTE(review): %lane_offset has no users in this function. It still appears in |
| ; the expected output, so removing it requires regenerating the CHECK lines. |
| %lane_offset = shl nuw nsw i32 %idx_low_bits, 5 |
| ; Base pointer that already carries the large constant byte offset. |
| %lds_large_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584 |
| ; Index with the constant 16384 merged in; this is the constant the checks |
| ; expect to be split out of the GEP below. |
| %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384 |
| ; Second index: the first index shifted right by 5 and masked to bits 5-9. |
| %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5 |
| %row_alignment = and i32 %aligned_offset, 992 |
| ; Two chained GEPs off the large base; the first one uses the index that |
| ; contains the constant. |
| %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_large_base, i32 %combined_offset_with_stride |
| %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment |
| ; Load 16 halves, keep elements 8-15, and store them to %out. |
| %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32 |
| %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| store <8 x half> %upper_half, ptr addrspace(1) %out, align 16 |
| ret void |
| } |
| |
| ; Small-base case: same index computation as @large_base_offset, but the base |
| ; is @global_smem + 4. The checks expect the base constant and the index |
| ; constant to end up in one trailing GEP of 16388 (4 + 16384), with the variable |
| ; indices applied directly to @global_smem and no 'inbounds nuw' on the |
| ; resulting GEPs. |
| define void @small_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) { |
| ; CHECK-LABEL: define void @small_base_offset( |
| ; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15 |
| ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10 |
| ; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16 |
| ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1 |
| ; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5 |
| ; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i32 [[TMP6]], 16384 |
| ; CHECK-NEXT: [[TMP9:%.*]] = lshr exact i32 [[TMP8]], 5 |
| ; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 992 |
| ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 [[TMP6]] |
| ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 [[TMP10]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 16388 |
| ; CHECK-NEXT: [[TMP14:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP13]], align 32 |
| ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x half> [[TMP14]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| ; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(1) [[TMP0]], align 16 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| ; Variable byte offset built from %idx: bits 0-3 shifted left by 10, or'ed with |
| ; bit 4 shifted left by 1. |
| %idx_low_bits = and i32 %idx, 15 |
| %row_offset = shl nuw nsw i32 %idx_low_bits, 10 |
| %idx_bit4 = and i32 %idx, 16 |
| %idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1 |
| %combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset |
| ; NOTE(review): %lane_offset has no users in this function. It still appears in |
| ; the expected output, so removing it requires regenerating the CHECK lines. |
| %lane_offset = shl nuw nsw i32 %idx_low_bits, 5 |
| ; Base pointer with a small constant byte offset (contrast with 67584 in |
| ; @large_base_offset). |
| %lds_small_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 4 |
| ; Index with the constant 16384 merged in via 'or disjoint'. |
| %combined_offset_with_stride = or disjoint i32 %combined_offset, 16384 |
| ; Second index: the first index shifted right by 5 and masked to bits 5-9. |
| %aligned_offset = lshr exact i32 %combined_offset_with_stride, 5 |
| %row_alignment = and i32 %aligned_offset, 992 |
| ; Two chained GEPs off the small base; the first one uses the index that |
| ; contains the constant. |
| %lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_small_base, i32 %combined_offset_with_stride |
| %lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment |
| ; Load 16 halves, keep elements 8-15, and store them to %out. |
| ; NOTE(review): @global_smem is 'align 16' and every other offset term is a |
| ; multiple of 32, so with the +4 base this address cannot be 32-byte aligned |
| ; even though the load says 'align 32' - confirm this is intentional for the test. |
| %lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32 |
| %upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| store <8 x half> %upper_half, ptr addrspace(1) %out, align 16 |
| ret void |
| } |