| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s |
| ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s |
| |
| ; Two adjacent <2 x i32> loads from %b and two adjacent <2 x i32> zero stores to |
| ; %a, all in addrspace(1). The expected output combines each pair into a single |
| ; <4 x i32> access and recovers the two original load values with shufflevector. |
| define amdgpu_kernel void @merge_v2i32_v2i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 { |
| ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32( |
| ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 4 |
| ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 1> |
| ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 2, i32 3> |
| ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %a.1 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %a, i64 1 |
| %b.1 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %b, i64 1 |
| |
| %ld.c = load <2 x i32>, ptr addrspace(1) %b, align 4 |
| %ld.c.idx.1 = load <2 x i32>, ptr addrspace(1) %b.1, align 4 |
| |
| store <2 x i32> zeroinitializer, ptr addrspace(1) %a, align 4 |
| store <2 x i32> zeroinitializer, ptr addrspace(1) %a.1, align 4 |
| |
| ret void |
| } |
| |
| ; Single-element vector case: two adjacent <1 x i32> loads from %b and two |
| ; adjacent <1 x i32> zero stores to %a. The expected output merges each pair into |
| ; one <2 x i32> access and extracts the original <1 x i32> values with |
| ; shufflevector. |
| define amdgpu_kernel void @merge_v1i32_v1i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 { |
| ; CHECK-LABEL: define amdgpu_kernel void @merge_v1i32_v1i32( |
| ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[B]], align 4 |
| ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %a.1 = getelementptr inbounds <1 x i32>, ptr addrspace(1) %a, i64 1 |
| %b.1 = getelementptr inbounds <1 x i32>, ptr addrspace(1) %b, i64 1 |
| |
| %ld.c = load <1 x i32>, ptr addrspace(1) %b, align 4 |
| %ld.c.idx.1 = load <1 x i32>, ptr addrspace(1) %b.1, align 4 |
| |
| store <1 x i32> zeroinitializer, ptr addrspace(1) %a, align 4 |
| store <1 x i32> zeroinitializer, ptr addrspace(1) %a.1, align 4 |
| |
| ret void |
| } |
| |
| ; Negative test: two <3 x i32> loads and two <3 x i32> zero stores addressed one |
| ; <3 x i32> GEP step apart. The expected output is identical to the input, i.e. |
| ; nothing is merged. |
| ; NOTE(review): presumably the combined access would be wider than the target |
| ; permits for a single load/store -- confirm against the vectorizer's limits. |
| define amdgpu_kernel void @no_merge_v3i32_v3i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 { |
| ; CHECK-LABEL: define amdgpu_kernel void @no_merge_v3i32_v3i32( |
| ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[A]], i64 1 |
| ; CHECK-NEXT: [[B_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[B]], i64 1 |
| ; CHECK-NEXT: [[LD_C:%.*]] = load <3 x i32>, ptr addrspace(1) [[B]], align 4 |
| ; CHECK-NEXT: [[LD_C_IDX_1:%.*]] = load <3 x i32>, ptr addrspace(1) [[B_1]], align 4 |
| ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4 |
| ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A_1]], align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %a.1 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a, i64 1 |
| %b.1 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b, i64 1 |
| |
| %ld.c = load <3 x i32>, ptr addrspace(1) %b, align 4 |
| %ld.c.idx.1 = load <3 x i32>, ptr addrspace(1) %b.1, align 4 |
| |
| store <3 x i32> zeroinitializer, ptr addrspace(1) %a, align 4 |
| store <3 x i32> zeroinitializer, ptr addrspace(1) %a.1, align 4 |
| |
| ret void |
| } |
| |
| ; 16-bit element case: two adjacent <2 x i16> loads from %b and two adjacent |
| ; <2 x i16> zero stores to %a in addrspace(1). The expected output merges each |
| ; pair into one <4 x i16> access, with shufflevector splitting the loaded value |
| ; back into the two original halves. |
| define amdgpu_kernel void @merge_v2i16_v2i16(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 { |
| ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i16_v2i16( |
| ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 4 |
| ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> |
| ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> |
| ; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(1) [[A]], align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a, i64 1 |
| %b.1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b, i64 1 |
| |
| %ld.c = load <2 x i16>, ptr addrspace(1) %b, align 4 |
| %ld.c.idx.1 = load <2 x i16>, ptr addrspace(1) %b.1, align 4 |
| |
| store <2 x i16> zeroinitializer, ptr addrspace(1) %a, align 4 |
| store <2 x i16> zeroinitializer, ptr addrspace(1) %a.1, align 4 |
| |
| ret void |
| } |
| |
| ; Same <2 x i16> access pattern as @merge_v2i16_v2i16, but through addrspace(7) |
| ; pointers with i32 GEP indices. This is the only function whose expected output |
| ; differs between the two opt invocations at the top of the file: |
| ;   * with -mattr=+relaxed-buffer-oob-mode each pair is merged into a single |
| ;     <4 x i16> access; |
| ;   * without that feature all four accesses are left exactly as written. |
| ; NOTE(review): presumably widening is blocked by default because it could |
| ; change how a partially out-of-bounds buffer access behaves -- confirm against |
| ; the AMDGPU target documentation. |
| define amdgpu_kernel void @merge_fat_ptrs(ptr addrspace(7) nocapture %a, ptr addrspace(7) nocapture readonly %b) #0 { |
| ; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_fat_ptrs( |
| ; CHECK-OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { |
| ; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]] |
| ; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(7) [[B]], align 4 |
| ; CHECK-OOB-RELAXED-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> |
| ; CHECK-OOB-RELAXED-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> |
| ; CHECK-OOB-RELAXED-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4 |
| ; CHECK-OOB-RELAXED-NEXT: ret void |
| ; |
| ; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @merge_fat_ptrs( |
| ; CHECK-OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { |
| ; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]] |
| ; CHECK-OOB-STRICT-NEXT: [[A_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[A]], i32 1 |
| ; CHECK-OOB-STRICT-NEXT: [[B_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[B]], i32 1 |
| ; CHECK-OOB-STRICT-NEXT: [[LD_C:%.*]] = load <2 x i16>, ptr addrspace(7) [[B]], align 4 |
| ; CHECK-OOB-STRICT-NEXT: [[LD_C_IDX_1:%.*]] = load <2 x i16>, ptr addrspace(7) [[B_1]], align 4 |
| ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4 |
| ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A_1]], align 4 |
| ; CHECK-OOB-STRICT-NEXT: ret void |
| ; |
| entry: |
| %a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %a, i32 1 |
| %b.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %b, i32 1 |
| |
| %ld.c = load <2 x i16>, ptr addrspace(7) %b, align 4 |
| %ld.c.idx.1 = load <2 x i16>, ptr addrspace(7) %b.1, align 4 |
| |
| store <2 x i16> zeroinitializer, ptr addrspace(7) %a, align 4 |
| store <2 x i16> zeroinitializer, ptr addrspace(7) %a.1, align 4 |
| |
| ret void |
| } |
| |
| ; Mixed scalar/vector case: an i32 load from %a followed by a <2 x i16> load from |
| ; the next i32 slot (%a.1). Neither load specifies an alignment in the input. |
| ; Ideally these two loads would be merged into one wider load, but the expected |
| ; output currently records them being left as two separate loads. |
| define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 { |
| ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16( |
| ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1 |
| ; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4 |
| ; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1 |
| |
| %ld.0 = load i32, ptr addrspace(1) %a |
| %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1 |
| |
| ret void |
| } |
| |
| ; Attribute group #0 is attached to every kernel defined above. |
| attributes #0 = { nounwind } |
| ; NOTE(review): #1 is not referenced by any function visible in this file; it |
| ; looks like a leftover -- confirm before removing. |
| attributes #1 = { nounwind readnone } |