| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 |
| ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s |
| |
| ; Per this layout string: default (p) and p1/p4 pointers are 64-bit, p2/p3/p5/p6 pointers are 32-bit, |
| ; and A5 makes addrspace(5) the address space of allocas. The pointer-width tests below rely on these sizes. |
| target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" |
| |
| ; Element 0 of a [3 x i64] addrspace(5) alloca is stored in the entry block, then loaded and stored twice |
| ; per loop iteration, and reloaded after the loop. |
| ; Expected output: no alloca remains; the array is a <3 x i64> value carried through phis in 'loop' and 'end', |
| ; both loop stores become insertelements at index 0, and the load that follows the store of 68 is replaced |
| ; by the constant 68 in the compare. |
| define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) { |
| ; CHECK-LABEL: define amdgpu_kernel void @test_overwrite |
| ; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] |
| ; CHECK: loop: |
| ; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0 |
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 68, i32 0 |
| ; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0 |
| ; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68 |
| ; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] |
| ; CHECK: end: |
| ; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ] |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %stack = alloca [3 x i64], align 4, addrspace(5) |
| store i64 43, ptr addrspace(5) %stack |
| br i1 %cond, label %loop, label %end |
| |
| loop: |
| %load.0 = load i64, ptr addrspace(5) %stack |
| store i64 68, ptr addrspace(5) %stack |
| %load.1 = load i64, ptr addrspace(5) %stack |
| store i64 32, ptr addrspace(5) %stack |
| %loop.cc = icmp ne i64 %load.0, %load.1 |
| br i1 %loop.cc, label %loop, label %end |
| |
| end: |
| %reload = load i64, ptr addrspace(5) %stack |
| ret void |
| } |
| |
| ; Stores a whole <4 x i64> starting at element 3 of a [4 x i64] alloca (so the access runs past the end of the |
| ; array), then reloads a whole <4 x i64> starting at element 2. |
| ; Expected output: only element 0 of the argument is inserted into the promoted vector (at index 3), the other |
| ; extracted elements are unused, and the function returns poison. |
| define <4 x i64> @test_fullvec_out_of_bounds(<4 x i64> %arg) { |
| ; CHECK-LABEL: define <4 x i64> @test_fullvec_out_of_bounds |
| ; CHECK-SAME: (<4 x i64> [[ARG:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i64> [[ARG]], i64 0 |
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 3 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[ARG]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[ARG]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[ARG]], i64 3 |
| ; CHECK-NEXT: ret <4 x i64> poison |
| ; |
| entry: |
| %stack = alloca [4 x i64], align 4, addrspace(5) |
| %stack.2 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 2 |
| %stack.3 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 3 |
| store <4 x i64> %arg, ptr addrspace(5) %stack.3 |
| %reload = load <4 x i64>, ptr addrspace(5) %stack.2 |
| ret <4 x i64> %reload |
| } |
| |
| ; Element 0 of a [3 x i64] alloca is stored once in the entry block; the loop loads element 0 but only stores |
| ; to element 1, and both elements are reloaded after the loop. |
| ; Expected output: the loop load becomes an extractelement of the phi at index 0 (it is not replaced by a |
| ; constant), and the loop store becomes an insertelement at index 1. |
| define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) { |
| ; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite |
| ; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] |
| ; CHECK: loop: |
| ; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ] |
| ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0 |
| ; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 32, i32 1 |
| ; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32 |
| ; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] |
| ; CHECK: end: |
| ; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ] |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 1 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %stack = alloca [3 x i64], align 4, addrspace(5) |
| %stack.1 = getelementptr inbounds i64, ptr addrspace(5) %stack, i32 1 |
| store i64 43, ptr addrspace(5) %stack |
| br i1 %cond, label %loop, label %end |
| |
| loop: |
| %load = load i64, ptr addrspace(5) %stack |
| store i64 32, ptr addrspace(5) %stack.1 |
| %loop.cc = icmp ne i64 %load, 32 |
| br i1 %loop.cc, label %loop, label %end |
| |
| end: |
| %reload = load i64, ptr addrspace(5) %stack |
| %reload.1 = load i64, ptr addrspace(5) %stack.1 |
| ret void |
| } |
| |
| ; Stores a default-address-space pointer over a whole [8 x i8] alloca and loads it straight back. |
| ; Expected output: the store turns into ptrtoint to i64 followed by a bitcast to <8 x i8>, and the function |
| ; returns the incoming argument directly. |
| define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) { |
| ; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec |
| ; CHECK-SAME: (ptr [[ARG:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64 |
| ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8> |
| ; CHECK-NEXT: ret ptr [[ARG]] |
| ; |
| entry: |
| %alloca = alloca [8 x i8], align 8, addrspace(5) |
| store ptr %arg, ptr addrspace(5) %alloca, align 8 |
| %tmp = load ptr, ptr addrspace(5) %alloca, align 8 |
| ret ptr %tmp |
| } |
| |
| ; Same shape as the 64-bit pointer case, but with an addrspace(3) pointer stored over a whole [4 x i8] alloca. |
| ; Expected output: ptrtoint to i32, a bitcast to <4 x i8>, and the incoming argument returned directly. |
| define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) { |
| ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec |
| ; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[ARG]] to i32 |
| ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to <4 x i8> |
| ; CHECK-NEXT: ret ptr addrspace(3) [[ARG]] |
| ; |
| entry: |
| %alloca = alloca [4 x i8], align 8, addrspace(5) |
| store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8 |
| %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8 |
| ret ptr addrspace(3) %tmp |
| } |
| |
| ; Stores a <2 x ptr> over a whole [4 x i32] alloca and reloads the same bytes as <4 x ptr addrspace(3)>. |
| ; Expected output: the value is converted with ptrtoint to <2 x i64>, bitcast to <4 x i32>, then inttoptr to |
| ; <4 x ptr addrspace(3)>, which is returned. |
| define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) { |
| ; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec |
| ; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64> |
| ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> |
| ; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)> |
| ; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[TMP2]] |
| ; |
| entry: |
| %alloca = alloca [4 x i32], align 8, addrspace(5) |
| store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8 |
| %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 |
| ret <4 x ptr addrspace(3)> %tmp |
| } |
| |
| ; The alloca's element type is itself a pointer ([4 x ptr addrspace(5)]); it is written as <2 x i64> and read |
| ; back as <8 x i16>. |
| ; Expected output: the stored value is bitcast to <4 x i32> and converted with inttoptr to |
| ; <4 x ptr addrspace(5)>; the returned value is a bitcast of the <4 x i32> to <8 x i16>. |
| define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) { |
| ; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full |
| ; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[ARG]] to <4 x i32> |
| ; CHECK-NEXT: [[TMP1:%.*]] = inttoptr <4 x i32> [[TMP0]] to <4 x ptr addrspace(5)> |
| ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16> |
| ; CHECK-NEXT: ret <8 x i16> [[TMP2]] |
| ; |
| entry: |
| %stack = alloca [4 x ptr addrspace(5)], align 4, addrspace(5) |
| store <2 x i64> %arg, ptr addrspace(5) %stack |
| %reload = load <8 x i16>, ptr addrspace(5) %stack |
| ret <8 x i16> %reload |
| } |
| |
| ; Partial accesses: a <2 x ptr addrspace(3)> is stored at the start of an [8 x i32] alloca, then a 2-element |
| ; and a 4-element pointer vector are loaded from the same address. |
| ; Expected output: the store is done element by element (extractelement / insertelement into an <8 x i32>), |
| ; each load is rebuilt with insertelements and an inttoptr, and elements 2 and 3 of the 4-element load, |
| ; which were never stored, are undef. |
| define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) { |
| ; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec |
| ; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[ARG]] to <2 x i32> |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 0 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i64 1 |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP3]], i32 1 |
| ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0 |
| ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP3]], i64 1 |
| ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x ptr addrspace(3)> |
| ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0 |
| ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3]], i64 1 |
| ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 undef, i64 2 |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 undef, i64 3 |
| ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x ptr addrspace(3)> |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %alloca = alloca [8 x i32], align 8, addrspace(5) |
| store <2 x ptr addrspace(3)> %arg, ptr addrspace(5) %alloca, align 8 |
| %tmp = load <2 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 |
| %tmp.full = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 |
| ret void |
| } |
| |
| ; Will not vectorize because we're accessing a 64-bit vector with a 32-bit pointer. |
| ; Negative test: the expected output is the input unchanged, with the [8 x i8] alloca, the store and the load |
| ; of the addrspace(3) pointer all still present. |
| define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec(ptr addrspace(3) %arg) { |
| ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec |
| ; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) { |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5) |
| ; CHECK-NEXT: store ptr addrspace(3) [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8 |
| ; CHECK-NEXT: [[TMP:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[ALLOCA]], align 8 |
| ; CHECK-NEXT: ret ptr addrspace(3) [[TMP]] |
| ; |
| entry: |
| %alloca = alloca [8 x i8], align 8, addrspace(5) |
| store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8 |
| %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8 |
| ret ptr addrspace(3) %tmp |
| } |