; blob: 8bbe2b1dc4c9aaf5884fafd36d7964047bb9b840
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=separate-const-offset-from-gep -S %s | FileCheck %s
; This test verifies that the separate-const-offset-from-gep pass still splits
; the constant part out of a variable GEP index when the base pointer already
; carries a large constant offset. A small base offset is covered for comparison.
@global_smem = external addrspace(3) global [0 x i8], align 16
; large_base_offset: the load address is built from @global_smem plus a
; constant base offset of 67584 bytes, followed by two variable i8 GEPs whose
; first index contains an `or disjoint` with the constant 16384.
; Per the autogenerated assertions below, the expected output keeps the
; 67584-byte base GEP as-is, indexes with the variable part only, and applies
; the extracted constant 16384 in a separate trailing GEP.
define void @large_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
; CHECK-LABEL: define void @large_base_offset(
; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i32 [[TMP6]], 16384
; CHECK-NEXT: [[TMP10:%.*]] = lshr exact i32 [[TMP9]], 5
; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 992
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP8]], i32 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP12]], i32 [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP13]], i32 16384
; CHECK-NEXT: [[TMP15:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP14]], align 32
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x half> [[TMP15]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: store <8 x half> [[TMP16]], ptr addrspace(1) [[TMP0]], align 16
; CHECK-NEXT: ret void
;
entry:
; Low four bits of %idx, scaled by 1024.
%idx_low_bits = and i32 %idx, 15
%row_offset = shl nuw nsw i32 %idx_low_bits, 10
; Bit 4 of %idx moved up to bit 5 (value 0 or 32); it cannot overlap the
; multiples of 1024 in %row_offset, hence the `or disjoint`.
%idx_bit4 = and i32 %idx, 16
%idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
%combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
; %lane_offset has no users in this function; the expected output still
; contains it (TMP7), i.e. the pass is not expected to delete it.
%lane_offset = shl nuw nsw i32 %idx_low_bits, 5
; Base pointer that already carries the large constant offset under test.
%lds_large_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 67584
; Constant 16384 folded into the variable index; this is the constant the pass
; is expected to extract. The `or` itself must remain in the output because
; the shift below also uses it.
%combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
; The low five bits of the operand are always zero here, so the shift is
; `exact`; the mask then keeps bits 5..9 of the shifted value.
%aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
%row_alignment = and i32 %aligned_offset, 992
; Two chained byte GEPs: the first index contains the extractable constant,
; the second is purely variable.
%lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_large_base, i32 %combined_offset_with_stride
%lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
; Load 16 halves, keep elements 8..15, and store them to %out.
%lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
%upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
ret void
}
; small_base_offset: same index computation as @large_base_offset, but the
; base pointer is @global_smem plus only 4 bytes.
; Per the autogenerated assertions below, the expected output has no separate
; base GEP: the variable GEPs index from @global_smem directly (without the
; inbounds/nuw flags) and a single trailing GEP adds the constant 16388, which
; matches the 4-byte base offset plus the extracted 16384.
define void @small_base_offset(ptr addrspace(1) writeonly %out, i32 %idx) {
; CHECK-LABEL: define void @small_base_offset(
; CHECK-SAME: ptr addrspace(1) writeonly [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 15
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 10
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 16
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i32 [[TMP5]], [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP2]], 5
; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i32 [[TMP6]], 16384
; CHECK-NEXT: [[TMP9:%.*]] = lshr exact i32 [[TMP8]], 5
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 992
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) @global_smem, i32 [[TMP6]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 16388
; CHECK-NEXT: [[TMP14:%.*]] = load <16 x half>, ptr addrspace(3) [[TMP13]], align 32
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x half> [[TMP14]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(1) [[TMP0]], align 16
; CHECK-NEXT: ret void
;
entry:
; Low four bits of %idx, scaled by 1024.
%idx_low_bits = and i32 %idx, 15
%row_offset = shl nuw nsw i32 %idx_low_bits, 10
; Bit 4 of %idx moved up to bit 5 (value 0 or 32); it cannot overlap the
; multiples of 1024 in %row_offset, hence the `or disjoint`.
%idx_bit4 = and i32 %idx, 16
%idx_bit4_shifted = shl nuw nsw i32 %idx_bit4, 1
%combined_offset = or disjoint i32 %idx_bit4_shifted, %row_offset
; %lane_offset has no users in this function; the expected output still
; contains it (TMP7).
%lane_offset = shl nuw nsw i32 %idx_low_bits, 5
; Base pointer with a small (4-byte) constant offset, in contrast to the
; 67584-byte offset used by @large_base_offset.
%lds_small_base = getelementptr i8, ptr addrspace(3) @global_smem, i32 4
; Constant 16384 folded into the variable index; the `or` remains in the
; output because the shift below also uses it.
%combined_offset_with_stride = or disjoint i32 %combined_offset, 16384
; The low five bits of the operand are always zero here, so the shift is
; `exact`; the mask then keeps bits 5..9 of the shifted value.
%aligned_offset = lshr exact i32 %combined_offset_with_stride, 5
%row_alignment = and i32 %aligned_offset, 992
; Two chained byte GEPs: the first index contains the extractable constant,
; the second is purely variable.
%lds_vector_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_small_base, i32 %combined_offset_with_stride
%lds_lane_ptr = getelementptr inbounds nuw i8, ptr addrspace(3) %lds_vector_ptr, i32 %row_alignment
; Load 16 halves, keep elements 8..15, and store them to %out.
%lds_wide_load = load <16 x half>, ptr addrspace(3) %lds_lane_ptr, align 32
%upper_half = shufflevector <16 x half> %lds_wide_load, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <8 x half> %upper_half, ptr addrspace(1) %out, align 16
ret void
}