; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s

; Make sure that array allocas loaded and stored as multi-element aggregates are handled correctly.
; Strictly speaking, the promote-alloca pass shouldn't have to deal with this case as it is
; non-canonical, but the pass should handle it gracefully if it does encounter it.
; The checks look for lines that previously caused issues in PromoteAlloca (non-canonical). Opt
; should now leave these unchanged.

; Named aggregate types used by the test functions in this file.
; %Block: a one-element float array followed by an i32.
%Block = type { [1 x float], i32 }
; %gl_PerVertex: the type of the @pv output global.
%gl_PerVertex = type { <4 x float>, float, [1 x float], [1 x float] }
; %struct: a pair of i32 values, used as an array element type in @alloca_struct.
%struct = type { i32, i32 }

; External globals in addrspace(1) that the tests load their inputs from (@block)
; and store their results to (@pv).
@block = external addrspace(1) global %Block
@pv = external addrspace(1) global %gl_PerVertex

; A single-element array alloca (%f1) is written with one whole-aggregate store of
; [1 x float] and then read back through a GEP whose index is loaded at run time.
; In the expected output %f1 remains an alloca, the aggregate store is split into an
; extractvalue plus a scalar float store, and the %i and %foo7 allocas are gone.
define amdgpu_vs void @promote_1d_aggr() #0 {
; CHECK-LABEL: @promote_1d_aggr(
; CHECK-NEXT:    [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
; CHECK-NEXT:    [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
; CHECK-NEXT:    [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT:    store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
; CHECK-NEXT:    [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT:    ret void
;
  ; %i only round-trips the index value loaded from field 1 of @block.
  %i = alloca i32, addrspace(5)
  %f1 = alloca [1 x float], addrspace(5)
  %foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1
  %foo1 = load i32, ptr addrspace(1) %foo
  store i32 %foo1, ptr addrspace(5) %i
  ; Whole-aggregate load/store of the array: the non-canonical pattern under test.
  %foo3 = load [1 x float], ptr addrspace(1) @block
  store [1 x float] %foo3, ptr addrspace(5) %f1
  ; Dynamically indexed scalar read from the same alloca.
  %foo4 = load i32, ptr addrspace(5) %i
  %foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  %foo6 = load float, ptr addrspace(5) %foo5
  ; %foo7 is never written, so %foo8 reads an uninitialized alloca; the expected
  ; output uses undef as the base vector of the insertelement chain.
  %foo7 = alloca <4 x float>, addrspace(5)
  %foo8 = load <4 x float>, ptr addrspace(5) %foo7
  %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
  %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
  %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
  %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
  store <4 x float> %foo12, ptr addrspace(1) @pv
  ret void
}

; %Block2: an i32 followed by a two-element float array. @promote_store_aggr loads
; the i32 from @block2 and stores a [2 x float] aggregate into field 1.
%Block2 = type { i32, [2 x float] }
@block2 = external addrspace(1) global %Block2

; A two-element array alloca (%f1) is filled with two scalar float stores and then
; read back with a single whole-aggregate load of [2 x float].
; In the expected output no alloca remains: the aggregate is rebuilt with two
; insertvalue instructions and stored to field 1 of @block2.
define amdgpu_vs void @promote_store_aggr() #0 {
; CHECK-LABEL: @promote_store_aggr(
; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
; CHECK-NEXT:    [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
; CHECK-NEXT:    [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
; CHECK-NEXT:    store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
; CHECK-NEXT:    store <4 x float> splat (float 1.000000e+00), ptr addrspace(1) @pv, align 16
; CHECK-NEXT:    ret void
;
  %i = alloca i32, addrspace(5)
  %f1 = alloca [2 x float], addrspace(5)
  %foo1 = load i32, ptr addrspace(1) @block2
  store i32 %foo1, ptr addrspace(5) %i
  %foo2 = load i32, ptr addrspace(5) %i
  %foo3 = sitofp i32 %foo2 to float
  ; Element 0 is written through the alloca pointer itself, element 1 through a GEP.
  store float %foo3, ptr addrspace(5) %f1
  %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1
  store float 2.000000e+00, ptr addrspace(5) %foo5
  ; Whole-aggregate load of the array: the non-canonical pattern under test.
  %foo6 = load [2 x float], ptr addrspace(5) %f1
  %foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1
  store [2 x float] %foo6, ptr addrspace(1) %foo7
  store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv
  ret void
}

; %Block3: a two-element float array followed by an i32. @block3 supplies the
; aggregate and the index in @promote_load_from_store_aggr, and the dynamic index
; in the memcpy tests.
%Block3 = type { [2 x float], i32 }
@block3 = external addrspace(1) global %Block3

; A two-element array alloca (%f1) is written with one whole-aggregate store of
; [2 x float] and then read with a dynamically indexed scalar load.
; In the expected output %f1 is replaced by a <2 x float> value: each array element
; is moved over with extractvalue + insertelement, and the dynamic read becomes an
; extractelement.
define amdgpu_vs void @promote_load_from_store_aggr() #0 {
; CHECK-LABEL: @promote_load_from_store_aggr(
; CHECK-NEXT:    [[F1:%.*]] = freeze <2 x float> poison
; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[F1]], float [[FOO3_FCA_0_EXTRACT]], i32 0
; CHECK-NEXT:    [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT:    ret void
;
  ; %i only round-trips the index value loaded from field 1 of @block3.
  %i = alloca i32, addrspace(5)
  %f1 = alloca [2 x float], addrspace(5)
  %foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1
  %foo1 = load i32, ptr addrspace(1) %foo
  store i32 %foo1, ptr addrspace(5) %i
  ; Whole-aggregate load/store of the array: the non-canonical pattern under test.
  %foo3 = load [2 x float], ptr addrspace(1) @block3
  store [2 x float] %foo3, ptr addrspace(5) %f1
  ; Dynamically indexed scalar read from the same alloca.
  %foo4 = load i32, ptr addrspace(5) %i
  %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  %foo6 = load float, ptr addrspace(5) %foo5
  ; %foo7 is never written, so %foo8 reads an uninitialized alloca; the expected
  ; output uses undef as the base vector of the insertelement chain.
  %foo7 = alloca <4 x float>, addrspace(5)
  %foo8 = load <4 x float>, ptr addrspace(5) %foo7
  %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
  %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
  %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
  %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
  store <4 x float> %foo12, ptr addrspace(1) @pv
  ret void
}

; Integer counterparts of the types above, used by
; @promote_load_from_store_aggr_varoff: %Block4 is a two-element i32 array followed
; by an i32, and %gl_PV is the type of the @pv1 output global.
%Block4 = type { [2 x i32], i32 }
@block4 = external addrspace(1) global %Block4
%gl_PV = type { <4 x i32>, i32, [1 x i32], [1 x i32] }
@pv1 = external addrspace(1) global %gl_PV

; This should not crash on an aliased variable offset that can be
; optimized out (variable %aliasTofoo3 in the test)
; %f1 is a three-element i32 array. The slot at byte offset 8 is written with a
; value loaded from @block4, immediately reloaded as %aliasTofoo3, and that reloaded
; value is then used as the dynamic index of a load from %f1.
; In the expected output the reload is gone: the value loaded from @block4 is both
; inserted at lane 2 of a <3 x i32> vector and used as the extractelement index.
define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
; CHECK-NEXT:    [[F1:%.*]] = freeze <3 x i32> poison
; CHECK-NEXT:    [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> [[F1]], i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
; CHECK-NEXT:    store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
; CHECK-NEXT:    ret void
;
  %f1 = alloca [3 x i32], align 4, addrspace(5)
  ; Byte-offset GEP into the alloca (offset 8).
  %G1 = getelementptr inbounds i8, ptr addrspace(5) %f1, i32 8
  %foo3.unpack2 = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
  store i32 %foo3.unpack2, ptr addrspace(5) %G1, align 4
  ; Reload of the value just stored; it becomes the variable offset below.
  %aliasTofoo3 = load i32, ptr addrspace(5) %G1, align 4
  %foo5 = getelementptr [3 x i32], ptr addrspace(5) %f1, i32 0, i32 %aliasTofoo3
  %foo6 = load i32, ptr addrspace(5) %foo5, align 4
  %foo12 = insertelement <4 x i32> %input, i32 %foo6, i64 3
  store <4 x i32> %foo12, ptr addrspace(1) @pv1, align 16
  ret void
}

; %f1 is zero-initialized with a whole-aggregate store, elements 1 and 3 are then
; overwritten, and a 16-byte memmove copies from element 1 to the start of the same
; alloca, so source and destination overlap.
; In the expected output the memmove is modelled as a shufflevector with mask
; <1, 2, 3, 4, 4>, and the final store writes the constant 1.0 to @pv.
define amdgpu_vs void @promote_memmove_aggr() #0 {
; CHECK-LABEL: @promote_memmove_aggr(
; CHECK-NEXT:    [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 0.000000e+00, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <5 x float> [[TMP2]], float 0.000000e+00, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 0.000000e+00, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 1.000000e+00, i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 2.000000e+00, i32 3
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <5 x float> [[TMP7]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) @pv, align 4
; CHECK-NEXT:    ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  ; Whole-aggregate store of the array: the non-canonical pattern under test.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
  %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
  store float 1.0, ptr addrspace(5) %foo1
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2
  ; Overlapping copy within %f1: 16 bytes starting at element 1 moved to element 0.
  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
  ; Reads element 0 after the move.
  %foo3 = load float, ptr addrspace(5) %f1
  store float %foo3, ptr addrspace(1) @pv
  ret void
}

; %f1 is zero-initialized with a whole-aggregate store, element 3 is set to 2.0, and
; one element selected by an index loaded from @block3 is set to 3.0. An 8-byte
; memcpy then copies from element 3 to the start of the same alloca.
; In the expected output the memcpy is modelled as a shufflevector with mask
; <3, 4, 2, 3, 4>, and the final load becomes an extractelement of lane 0.
define amdgpu_vs void @promote_memcpy_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_aggr(
; CHECK-NEXT:    [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP9]], float 0.000000e+00, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 2.000000e+00, i32 3
; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> [[TMP6]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT:    ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  ; Whole-aggregate store of the array: the non-canonical pattern under test.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1

  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2

  ; Store to a run-time selected element; the index is loaded from @block3.
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy within %f1: 8 bytes starting at element 3 copied to element 0.
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
  ; Reads element 0 after the copy.
  %foo6 = load float, ptr addrspace(5) %f1
  store float %foo6, ptr addrspace(1) @pv
  ret void
}

; A 20-byte memcpy whose source and destination are both %f1 itself.
; In the expected output no alloca or memcpy remains; the final store writes the
; constant 0.0 (element 0 of the zero-initialized array is never overwritten).
define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_identity_aggr(
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) @pv, align 4
; CHECK-NEXT:    ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  ; Whole-aggregate store of the array: the non-canonical pattern under test.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
  %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
  store float 1.0, ptr addrspace(5) %foo1
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2
  ; Source and destination are the same pointer and the size covers the whole array.
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
  %foo3 = load float, ptr addrspace(5) %f1
  store float %foo3, ptr addrspace(1) @pv
  ret void
}

; TODO: Promote the alloca even if there is a memcpy between different allocas.
; Two separate five-element allocas (%f1, %f2), both zero-initialized with
; whole-aggregate stores; 8 bytes are then copied from %f1 to %f2 with memcpy.
; In the expected output both allocas and the memcpy call remain. Only the aggregate
; zeroinitializer stores change: each is split into five per-element float stores.
define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
; CHECK-LABEL: @promote_memcpy_two_aggrs(
; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT:    [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT:    [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
; CHECK-NEXT:    [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
; CHECK-NEXT:    [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
; CHECK-NEXT:    [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
; CHECK-NEXT:    [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
; CHECK-NEXT:    store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 [[F2]], ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 [[FOO4]]
; CHECK-NEXT:    [[FOO7:%.*]] = load float, ptr addrspace(5) [[FOO6]], align 4
; CHECK-NEXT:    store float [[FOO7]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT:    ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  %f2 = alloca [5 x float], addrspace(5)

  ; Whole-aggregate stores of the arrays: the non-canonical pattern under test.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
  store [5 x float] zeroinitializer, ptr addrspace(5) %f2

  ; Store to a run-time selected element of %f1; the index is loaded from @block3.
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy between two distinct allocas: the first 8 bytes of %f1 into %f2.
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)

  ; Read %f2 back at the same run-time index.
  %foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4
  %foo7 = load float, ptr addrspace(5) %foo6
  store float %foo7, ptr addrspace(1) @pv
  ret void
}

; TODO: Promote the alloca even if there is a memcpy between the alloca and another memory space.
; An 8-byte memcpy from the addrspace(5) alloca %f1 to the addrspace(1) global @pv.
; In the expected output the alloca and the memcpy call remain; only the aggregate
; zeroinitializer store is split into five per-element float stores.
; The %src argument is not referenced anywhere in the body.
define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
; CHECK-NEXT:    store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT:    call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
; CHECK-NEXT:    ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  ; Whole-aggregate store of the array: the non-canonical pattern under test.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1

  ; Store to a run-time selected element; the index is loaded from @block3.
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy out of the alloca into a different address space.
  call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
  ret void
}

; Uses llvm.memcpy.inline for an 8-byte copy from element 3 to the start of the same
; alloca. %f1 is zero-initialized and one run-time selected element is set to 3.0
; before the copy.
; In the expected output the copy is modelled as a shufflevector with mask
; <3, 4, 2, 3, 4>, and the final load becomes an extractelement of lane 0.
define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_inline_aggr(
; CHECK-NEXT:    [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 0.000000e+00, i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> [[TMP5]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT:    ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  ; Whole-aggregate store of the array: the non-canonical pattern under test.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1

  ; %foo2 (element 3) is only used as the copy source; nothing is stored through it.
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  ; Store to a run-time selected element; the index is loaded from @block3.
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy within %f1: 8 bytes starting at element 3 copied to element 0.
  call void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
  ; Reads element 0 after the copy.
  %foo6 = load float, ptr addrspace(5) %f1
  store float %foo6, ptr addrspace(1) @pv
  ret void
}

; Declarations of the memory-transfer intrinsics called by the tests above. The
; name suffix encodes the destination and source address spaces and the size type.
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)

; Input (@tmp_g) and output (@frag_color) globals for @promote_double_aggr.
@tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
@frag_color = external addrspace(1) global <4 x float>

; A [2 x double] alloca (%s) is written with one whole-aggregate store of a value
; built by insertvalue, and is then read and written through scalar double accesses.
; In the expected output no alloca remains: the element values come from
; extractvalue on the aggregate, and the store/reload of element 0 is forwarded
; directly into the second fadd.
define amdgpu_ps void @promote_double_aggr() #0 {
; CHECK-LABEL: @promote_double_aggr(
; CHECK-NEXT:    [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
; CHECK-NEXT:    [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
; CHECK-NEXT:    [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
; CHECK-NEXT:    [[FOO4:%.*]] = insertvalue [2 x double] poison, double [[FOO1]], 0
; CHECK-NEXT:    [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
; CHECK-NEXT:    [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
; CHECK-NEXT:    [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT:    [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
; CHECK-NEXT:    [[FOO18:%.*]] = insertelement <4 x float> poison, float [[FOO17]], i32 0
; CHECK-NEXT:    [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
; CHECK-NEXT:    [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
; CHECK-NEXT:    [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
; CHECK-NEXT:    store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16
; CHECK-NEXT:    ret void
;
  %s = alloca [2 x double], addrspace(5)
  ; Load the first two doubles of the array in field 0 of @tmp_g.
  %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
  %foo1 = load double, ptr addrspace(1) %foo
  %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
  %foo3 = load double, ptr addrspace(1) %foo2
  ; Build the aggregate and store it in one piece: the non-canonical pattern under test.
  %foo4 = insertvalue [2 x double] poison, double %foo1, 0
  %foo5 = insertvalue [2 x double] %foo4, double %foo3, 1
  store [2 x double] %foo5, ptr addrspace(5) %s
  ; Element 1 is read twice and the sum is written to element 0.
  %foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
  %foo7 = load double, ptr addrspace(5) %foo6
  %foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
  %foo9 = load double, ptr addrspace(5) %foo8
  %foo10 = fadd double %foo7, %foo9
  store double %foo10, ptr addrspace(5) %s
  ; Reload element 0 (the sum just stored) and add element 1 again.
  %foo13 = load double, ptr addrspace(5) %s
  %foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
  %foo15 = load double, ptr addrspace(5) %foo14
  %foo16 = fadd double %foo13, %foo15
  ; Truncate to float and splat the result across all four lanes of the output.
  %foo17 = fptrunc double %foo16 to float
  %foo18 = insertelement <4 x float> poison, float %foo17, i32 0
  %foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
  %foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
  %foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
  store <4 x float> %foo21, ptr addrspace(1) @frag_color
  ret void
}

; Don't crash on a type that isn't a valid vector element.
; The alloca's element type is the named struct %struct, and the alloca has no
; users. The expected output is just the entry label followed by ret void.
define amdgpu_kernel void @alloca_struct() #0 {
; CHECK-LABEL: @alloca_struct(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    ret void
;
entry:
  %alloca = alloca [2 x %struct], align 4, addrspace(5)
  ret void
}
