| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s |
| |
| ; Make sure that array allocas which are loaded and stored as multi-element aggregates are handled correctly. |
| ; Strictly speaking, the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but |
| ; the pass should handle it gracefully if it does occur. |
| ; The checks look for lines that previously caused issues in PromoteAlloca (non-canonical). Opt |
| ; should now leave these unchanged. |
| |
; Aggregate types shared by the tests in this file.
| %Block = type { [1 x float], i32 } |
| %gl_PerVertex = type { <4 x float>, float, [1 x float], [1 x float] } |
; %struct is the element type of the array alloca in @alloca_struct.
| %struct = type { i32, i32 } |
| |
; External addrspace(1) globals: @block supplies inputs, @pv receives results.
| @block = external addrspace(1) global %Block |
| @pv = external addrspace(1) global %gl_PerVertex |
| |
; Input: the [1 x float] member of @block is loaded as a whole aggregate and
; stored into the array alloca %f1, which is then read back through a GEP whose
; index is the i32 reloaded from %i.
; Expected: the assertions mirror the input instruction for instruction (only
; explicit alignments are added), i.e. both allocas and every access to them
; are kept as written.
| define amdgpu_vs void @promote_1d_aggr() #0 { |
| ; CHECK-LABEL: @promote_1d_aggr( |
| ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1 |
| ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 |
| ; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 |
| ; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4 |
| ; CHECK-NEXT: store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 |
| ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] |
| ; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4 |
| ; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) |
| ; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 |
| ; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0 |
| ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1 |
| ; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2 |
| ; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3 |
| ; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16 |
| ; CHECK-NEXT: ret void |
| ; |
| %i = alloca i32, addrspace(5) |
| %f1 = alloca [1 x float], addrspace(5) |
; The dynamic index comes from field 1 of @block and is round-tripped through %i.
| %foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1 |
| %foo1 = load i32, ptr addrspace(1) %foo |
| store i32 %foo1, ptr addrspace(5) %i |
; Whole-aggregate load and store of the one-element array.
| %foo3 = load [1 x float], ptr addrspace(1) @block |
| store [1 x float] %foo3, ptr addrspace(5) %f1 |
| %foo4 = load i32, ptr addrspace(5) %i |
| %foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 |
| %foo6 = load float, ptr addrspace(5) %foo5 |
; %foo7 is never stored to before this load; it only provides the starting
; <4 x float> for the insertelement chain that splats %foo6 into all four lanes.
| %foo7 = alloca <4 x float>, addrspace(5) |
| %foo8 = load <4 x float>, ptr addrspace(5) %foo7 |
| %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0 |
| %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1 |
| %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2 |
| %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3 |
| store <4 x float> %foo12, ptr addrspace(1) @pv |
| ret void |
| } |
| |
; Type and global used by @promote_store_aggr: an i32 followed by a two-element
; float array.
| %Block2 = type { i32, [2 x float] } |
| @block2 = external addrspace(1) global %Block2 |
| |
; Input: the two elements of the [2 x float] alloca %f1 are written with scalar
; float stores, after which the whole array is read back with a single
; aggregate load and stored to field 1 of @block2.
; Expected: each scalar store is rewritten into a <2 x float> load /
; insertelement / store sequence on %f1, while the alloca itself and the
; [2 x float] aggregate load stay in place.
| define amdgpu_vs void @promote_store_aggr() #0 { |
| ; CHECK-LABEL: @promote_store_aggr( |
| ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4 |
| ; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 |
| ; CHECK-NEXT: [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 |
| ; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0 |
| ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1 |
| ; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1 |
| ; CHECK-NEXT: store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4 |
| ; CHECK-NEXT: store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16 |
| ; CHECK-NEXT: ret void |
| ; |
| %i = alloca i32, addrspace(5) |
| %f1 = alloca [2 x float], addrspace(5) |
; The i32 at the start of @block2 is round-tripped through %i and converted to float.
| %foo1 = load i32, ptr addrspace(1) @block2 |
| store i32 %foo1, ptr addrspace(5) %i |
| %foo2 = load i32, ptr addrspace(5) %i |
| %foo3 = sitofp i32 %foo2 to float |
; Scalar store through the alloca pointer itself, i.e. element 0 of the array.
| store float %foo3, ptr addrspace(5) %f1 |
; Scalar store to element 1 through a constant-index GEP.
| %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1 |
| store float 2.000000e+00, ptr addrspace(5) %foo5 |
; Whole-aggregate load of the array written by the scalar stores above.
| %foo6 = load [2 x float], ptr addrspace(5) %f1 |
| %foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1 |
| store [2 x float] %foo6, ptr addrspace(1) %foo7 |
| store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv |
| ret void |
| } |
| |
; Type and global used by @promote_load_from_store_aggr and by the memcpy tests
; further down: a two-element float array followed by an i32.
| %Block3 = type { [2 x float], i32 } |
| @block3 = external addrspace(1) global %Block3 |
| |
; Input: the [2 x float] alloca %f1 is filled with a single aggregate store and
; then read with a scalar float load through a dynamically indexed GEP.
; Expected: the aggregate store is kept, and the scalar load is replaced by a
; <2 x float> load of %f1 followed by an extractelement using the same dynamic
; index; the extracted value then feeds the insertelement chain.
| define amdgpu_vs void @promote_load_from_store_aggr() #0 { |
| ; CHECK-LABEL: @promote_load_from_store_aggr( |
| ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1 |
| ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 |
| ; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 |
| ; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4 |
| ; CHECK-NEXT: store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 |
| ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]] |
| ; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) |
| ; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 |
| ; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0 |
| ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1 |
| ; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2 |
| ; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3 |
| ; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16 |
| ; CHECK-NEXT: ret void |
| ; |
| %i = alloca i32, addrspace(5) |
| %f1 = alloca [2 x float], addrspace(5) |
; The dynamic index comes from field 1 of @block3 and is round-tripped through %i.
| %foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1 |
| %foo1 = load i32, ptr addrspace(1) %foo |
| store i32 %foo1, ptr addrspace(5) %i |
; Whole-aggregate load from @block3 and aggregate store into the alloca.
| %foo3 = load [2 x float], ptr addrspace(1) @block3 |
| store [2 x float] %foo3, ptr addrspace(5) %f1 |
| %foo4 = load i32, ptr addrspace(5) %i |
; Scalar read of one element selected by the runtime index %foo4.
| %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 |
| %foo6 = load float, ptr addrspace(5) %foo5 |
; %foo7 is never stored to before this load; it only provides the starting
; <4 x float> for the insertelement chain that splats %foo6 into all four lanes.
| %foo7 = alloca <4 x float>, addrspace(5) |
| %foo8 = load <4 x float>, ptr addrspace(5) %foo7 |
| %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0 |
| %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1 |
| %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2 |
| %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3 |
| store <4 x float> %foo12, ptr addrspace(1) @pv |
| ret void |
| } |
| |
; Input: a zero-initialized [5 x float] alloca gets 1.0 in element 1 and 2.0 in
; element 3, then a 16-byte memmove copies from element 1 to the start of the
; same array (source and destination ranges overlap). Element 0 is read back
; and stored to @pv.
; Expected: the scalar stores become <5 x float> load / insertelement / store
; sequences, the memmove becomes a shufflevector with mask <1, 2, 3, 4, 4>, and
; the final load becomes an extractelement of lane 0. The alloca and the
; aggregate zeroinitializer store are kept.
| define amdgpu_vs void @promote_memmove_aggr() #0 { |
| ; CHECK-LABEL: @promote_memmove_aggr( |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1 |
| ; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3 |
| ; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4> |
| ; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 |
| ; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| %f1 = alloca [5 x float], addrspace(5) |
; Aggregate store that zero-fills the whole array.
| store [5 x float] zeroinitializer, ptr addrspace(5) %f1 |
| %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1 |
| store float 1.0, ptr addrspace(5) %foo1 |
| %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3 |
| store float 2.0, ptr addrspace(5) %foo2 |
; 16 bytes (four floats) moved from element 1 down to element 0 within %f1.
| call void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false) |
| %foo3 = load float, ptr addrspace(5) %f1 |
| store float %foo3, ptr addrspace(1) @pv |
| ret void |
| } |
| |
; Input: a zero-initialized [5 x float] alloca gets 2.0 in element 3 and 3.0 in
; an element selected by an i32 loaded from @block3, then an 8-byte memcpy
; copies from element 3 to the start of the same array. Element 0 is read back
; and stored to @pv.
; Expected: the scalar stores become <5 x float> load / insertelement / store
; sequences (the second one with the dynamic index), the memcpy becomes a
; shufflevector with mask <3, 4, 2, 3, 4>, and the final load becomes an
; extractelement of lane 0. The alloca and the aggregate zeroinitializer store
; are kept.
| define amdgpu_vs void @promote_memcpy_aggr() #0 { |
| ; CHECK-LABEL: @promote_memcpy_aggr( |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3 |
| ; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 |
| ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 |
| ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]] |
| ; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4> |
| ; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 |
| ; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| %f1 = alloca [5 x float], addrspace(5) |
; Aggregate store that zero-fills the whole array.
| store [5 x float] zeroinitializer, ptr addrspace(5) %f1 |
| |
| %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3 |
| store float 2.0, ptr addrspace(5) %foo2 |
| |
; The runtime index is an i32 loaded from the address of field 0 of @block3.
| %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0 |
| %foo4 = load i32, ptr addrspace(1) %foo3 |
| %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 |
| store float 3.0, ptr addrspace(5) %foo5 |
| |
; 8 bytes (two floats) copied from element 3 to element 0 within %f1.
| call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false) |
| %foo6 = load float, ptr addrspace(5) %f1 |
| store float %foo6, ptr addrspace(1) @pv |
| ret void |
| } |
| |
; Input: same setup as the memmove test (1.0 in element 1, 2.0 in element 3 of
; a zero-initialized [5 x float] alloca), but followed by a 20-byte memcpy
; whose source and destination are both %f1, i.e. the whole array onto itself.
; Expected: the memcpy becomes a shufflevector with the identity mask
; <0, 1, 2, 3, 4>; the scalar stores and the final load are rewritten to
; insertelement / extractelement on <5 x float>.
| define amdgpu_vs void @promote_memcpy_identity_aggr() #0 { |
| ; CHECK-LABEL: @promote_memcpy_identity_aggr( |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1 |
| ; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3 |
| ; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4> |
| ; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 |
| ; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| %f1 = alloca [5 x float], addrspace(5) |
; Aggregate store that zero-fills the whole array.
| store [5 x float] zeroinitializer, ptr addrspace(5) %f1 |
| %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1 |
| store float 1.0, ptr addrspace(5) %foo1 |
| %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3 |
| store float 2.0, ptr addrspace(5) %foo2 |
; Source and destination are the same pointer; 20 bytes covers all five floats.
| call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false) |
| %foo3 = load float, ptr addrspace(5) %f1 |
| store float %foo3, ptr addrspace(1) @pv |
| ret void |
| } |
| |
| ; TODO: promote the alloca even if there is a memcpy between different allocas. |
; Input: two zero-initialized [5 x float] allocas; %f1 gets 3.0 at a runtime
; index, 8 bytes are copied from %f1 to %f2 with memcpy, and %f2 is then read
; at the same runtime index.
; Expected: nothing is rewritten. Both allocas, the scalar store, the memcpy
; call and the scalar load all remain; the only differences in the expected
; output are explicit alignments and the intrinsic being printed as
; llvm.memcpy.p5.p5.i32.
| define amdgpu_vs void @promote_memcpy_two_aggrs() #0 { |
| ; CHECK-LABEL: @promote_memcpy_two_aggrs( |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4 |
| ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 |
| ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 |
| ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] |
| ; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4 |
| ; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 [[F2]], ptr addrspace(5) align 4 [[F1]], i32 8, i1 false) |
| ; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 [[FOO4]] |
| ; CHECK-NEXT: [[FOO7:%.*]] = load float, ptr addrspace(5) [[FOO6]], align 4 |
| ; CHECK-NEXT: store float [[FOO7]], ptr addrspace(1) @pv, align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| %f1 = alloca [5 x float], addrspace(5) |
| %f2 = alloca [5 x float], addrspace(5) |
| |
| store [5 x float] zeroinitializer, ptr addrspace(5) %f1 |
| store [5 x float] zeroinitializer, ptr addrspace(5) %f2 |
| |
; The runtime index is an i32 loaded from the address of field 0 of @block3.
| %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0 |
| %foo4 = load i32, ptr addrspace(1) %foo3 |
| %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 |
| store float 3.0, ptr addrspace(5) %foo5 |
| |
; Copy between two distinct allocas: destination %f2, source %f1, 8 bytes.
| call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false) |
| |
| %foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4 |
| %foo7 = load float, ptr addrspace(5) %foo6 |
| store float %foo7, ptr addrspace(1) @pv |
| ret void |
| } |
| |
| ; TODO: promote the alloca even if there is a memcpy between the alloca and another memory space. |
; Input: a zero-initialized [5 x float] alloca gets 3.0 at a runtime index,
; then 8 bytes are copied with memcpy from the alloca (addrspace(5)) to the
; global @pv (addrspace(1)). The %src argument is not referenced in the body.
; Expected: nothing is rewritten. The alloca, the scalar store and the memcpy
; call all remain; the only differences in the expected output are explicit
; alignments and the intrinsic being printed as llvm.memcpy.p1.p5.i32.
| define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 { |
| ; CHECK-LABEL: @promote_memcpy_p1p5_aggr( |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 |
| ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 |
| ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] |
| ; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4 |
| ; CHECK-NEXT: call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 [[F1]], i32 8, i1 false) |
| ; CHECK-NEXT: ret void |
| ; |
| %f1 = alloca [5 x float], addrspace(5) |
| store [5 x float] zeroinitializer, ptr addrspace(5) %f1 |
| |
; The runtime index is an i32 loaded from the address of field 0 of @block3.
| %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0 |
| %foo4 = load i32, ptr addrspace(1) %foo3 |
| %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 |
| store float 3.0, ptr addrspace(5) %foo5 |
| |
; Cross-address-space copy: destination is the global @pv, source is the alloca.
| call void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false) |
| ret void |
| } |
| |
; Input: a zero-initialized [5 x float] alloca gets 3.0 at a runtime index,
; then llvm.memcpy.inline copies 8 bytes from element 3 to the start of the
; same array. Element 0 is read back and stored to @pv. Unlike
; @promote_memcpy_aggr, nothing is stored through %foo2 here; it is only the
; memcpy source.
; Expected: the inline memcpy is handled like the plain memcpy case and becomes
; a shufflevector with mask <3, 4, 2, 3, 4>; the scalar store and final load
; are rewritten to insertelement / extractelement on <5 x float>.
| define amdgpu_vs void @promote_memcpy_inline_aggr() #0 { |
| ; CHECK-LABEL: @promote_memcpy_inline_aggr( |
| ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) |
| ; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 |
| ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 |
| ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 |
| ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]] |
| ; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4> |
| ; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0 |
| ; CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) @pv, align 4 |
| ; CHECK-NEXT: ret void |
| ; |
| %f1 = alloca [5 x float], addrspace(5) |
; Aggregate store that zero-fills the whole array.
| store [5 x float] zeroinitializer, ptr addrspace(5) %f1 |
| |
| %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3 |
; The runtime index is an i32 loaded from the address of field 0 of @block3.
| %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0 |
| %foo4 = load i32, ptr addrspace(1) %foo3 |
| %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 |
| store float 3.0, ptr addrspace(5) %foo5 |
| |
; 8 bytes (two floats) copied from element 3 to element 0 within %f1.
| call void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false) |
| %foo6 = load float, ptr addrspace(5) %f1 |
| store float %foo6, ptr addrspace(1) @pv |
| ret void |
| } |
| |
; Memory-transfer intrinsics called by the tests above. They are declared here
; under names carrying an i8 suffix on each pointer overload (p5i8, p1i8); where
; the expected output in this file still contains such a call, it is printed
; without that suffix (llvm.memcpy.p5.p5.i32, llvm.memcpy.p1.p5.i32).
| declare void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) |
| declare void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) |
| declare void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) |
| declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg) |
| |
; Globals used by @promote_double_aggr: @tmp_g is the input, @frag_color the output.
| @tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> } |
| @frag_color = external addrspace(1) global <4 x float> |
| |
; Input: a [2 x double] value is assembled with insertvalue from two doubles
; loaded from @tmp_g and written to the alloca %s with one aggregate store.
; %s is then accessed only through scalar double loads and stores (element 0
; via the alloca pointer, element 1 via constant-index GEPs). The final sum is
; truncated to float, splatted into a <4 x float> and stored to @frag_color.
; Expected: the aggregate store is kept, and every scalar access to %s is
; replaced by a <2 x double> load of %s combined with extractelement (for
; loads) or insertelement plus a <2 x double> store (for the store).
| define amdgpu_ps void @promote_double_aggr() #0 { |
| ; CHECK-LABEL: @promote_double_aggr( |
| ; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5) |
| ; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0 |
| ; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8 |
| ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1 |
| ; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8 |
| ; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0 |
| ; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1 |
| ; CHECK-NEXT: store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8 |
| ; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 |
| ; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 |
| ; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 |
| ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0 |
| ; CHECK-NEXT: store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8 |
| ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 |
| ; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 |
| ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 |
| ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1 |
| ; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]] |
| ; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float |
| ; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0 |
| ; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1 |
| ; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2 |
| ; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3 |
| ; CHECK-NEXT: store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16 |
| ; CHECK-NEXT: ret void |
| ; |
| %s = alloca [2 x double], addrspace(5) |
; Load elements 0 and 1 of the [4 x double] member at the start of @tmp_g.
| %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0 |
| %foo1 = load double, ptr addrspace(1) %foo |
| %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1 |
| %foo3 = load double, ptr addrspace(1) %foo2 |
; Build the two-element aggregate and write it to the alloca in one store.
| %foo4 = insertvalue [2 x double] undef, double %foo1, 0 |
| %foo5 = insertvalue [2 x double] %foo4, double %foo3, 1 |
| store [2 x double] %foo5, ptr addrspace(5) %s |
; Element 1 is read twice through two separate GEPs and added to itself.
| %foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1 |
| %foo7 = load double, ptr addrspace(5) %foo6 |
| %foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1 |
| %foo9 = load double, ptr addrspace(5) %foo8 |
| %foo10 = fadd double %foo7, %foo9 |
; Scalar store and reload through the alloca pointer itself, i.e. element 0.
| store double %foo10, ptr addrspace(5) %s |
| %foo13 = load double, ptr addrspace(5) %s |
| %foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1 |
| %foo15 = load double, ptr addrspace(5) %foo14 |
| %foo16 = fadd double %foo13, %foo15 |
| %foo17 = fptrunc double %foo16 to float |
; Splat the float result into all four lanes of the output vector.
| %foo18 = insertelement <4 x float> undef, float %foo17, i32 0 |
| %foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1 |
| %foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2 |
| %foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3 |
| store <4 x float> %foo21, ptr addrspace(1) @frag_color |
| ret void |
| } |
| |
| ; Don't crash on a type that isn't a valid vector element. |
; Input: an amdgpu_kernel whose only instruction besides ret is an unused
; alloca of [2 x %struct], an array whose element type is a struct.
; Expected: no addrspace(5) alloca remains. The output instead reads the
; dispatch pointer and the three workitem ids, combines them into a single
; index, and uses that index in a GEP into @alloca_struct.alloca, an
; addrspace(3) object of type [1024 x [2 x %struct]].
| define amdgpu_kernel void @alloca_struct() #0 { |
| ; CHECK-LABEL: @alloca_struct( |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() |
| ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1 |
| ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0 |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0 |
| ; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 |
| ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]] |
| ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]] |
| ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]] |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
; The alloca has no users; the test only needs the pass to process it.
| %alloca = alloca [2 x %struct], align 4, addrspace(5) |
| ret void |
| } |