blob: a41d2f7fc033dd10b2e576bdbbd2bb14f112117a [file]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=lower-matrix-intrinsics -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
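; Test that fused matrix lowering handles loads and stores in different address
; spaces without crashing. A load whose address space differs from the store's
; is unconditionally copied to a local buffer; a load in the same address space
; as the store is copied only when a runtime alias test finds an overlap.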
; Both loads read from addrspace(1) while the store writes to addrspace(2), so
; neither load shares the store's address space. The expected output copies A
; and B (16 bytes each, i.e. [4 x float]) into addrspace(1) allocas with memcpy
; and contains no runtime alias-test branches: everything stays in one block.
define void @multiply_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(1) %B, ptr addrspace(2) %C) {
; CHECK-LABEL: define void @multiply_diff_addr_spaces(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP1]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP2]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP3]], align 8
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
; CHECK-NEXT: [[TMP8:%.*]] = fadd <1 x float> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <1 x float> [[TMP8]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP9]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]]
; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]]
; CHECK-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP16]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP18]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]]
; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP20]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]]
; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x float> [[TMP19]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <1 x float> [[TMP22]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP23]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP25]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP26:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]]
; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP27]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]]
; CHECK-NEXT: [[TMP29:%.*]] = fadd <1 x float> [[TMP26]], [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <1 x float> [[TMP29]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> [[TMP30]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr float, ptr addrspace(2) [[C]], i64 0
; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8
; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP1]])
; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: ret void
;
entry:
; Input pattern: both multiply operands come directly from loads and the
; product feeds a single store.
%a = load <4 x float>, ptr addrspace(1) %A, align 8
%b = load <4 x float>, ptr addrspace(1) %B, align 8
%c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
store <4 x float> %c, ptr addrspace(2) %C, align 8
ret void
}
; All three pointers live in distinct address spaces (A: 1, B: 3, C: 2). The
; expected output gives each load its own [4 x float] alloca in that load's
; address space (1 for A, 3 for B) and copies into it unconditionally; the
; stores go straight to C in addrspace(2).
define void @multiply_all_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(3) %B, ptr addrspace(2) %C) {
; CHECK-LABEL: define void @multiply_all_diff_addr_spaces(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(3) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(3)
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: call void @llvm.lifetime.start.p3(ptr addrspace(3) [[TMP1]])
; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[TMP1]], ptr addrspace(3) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP2]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(3) [[TMP1]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(3) [[TMP3]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(3) [[TMP3]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(3) [[VEC_GEP3]], align 8
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
; CHECK-NEXT: [[TMP8:%.*]] = fadd <1 x float> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <1 x float> [[TMP8]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP9]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]]
; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]]
; CHECK-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP16]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP18]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]]
; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP20]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]]
; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x float> [[TMP19]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <1 x float> [[TMP22]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP23]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP25]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP26:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]]
; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP27]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP28:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]]
; CHECK-NEXT: [[TMP29:%.*]] = fadd <1 x float> [[TMP26]], [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <1 x float> [[TMP29]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> [[TMP30]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr float, ptr addrspace(2) [[C]], i64 0
; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8
; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: call void @llvm.lifetime.end.p3(ptr addrspace(3) [[TMP1]])
; CHECK-NEXT: ret void
;
entry:
; Input pattern: both multiply operands come directly from loads and the
; product feeds a single store.
%a = load <4 x float>, ptr addrspace(1) %A, align 8
%b = load <4 x float>, ptr addrspace(3) %B, align 8
%c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
store <4 x float> %c, ptr addrspace(2) %C, align 8
ret void
}
; First load (A) matches store address space, second load (B) differs.
; A gets runtime alias check, B gets unconditional copy.
; Expected control flow for A (addrspace(1), same as C): the entry block tests
; A < C+16 (unsigned), the ALIAS_CONT block tests C < A+16, and only when both
; hold does the COPY block memcpy A into the addrspace(1) alloca. The phi in
; NO_ALIAS then selects either A itself or that copy. B (addrspace(2)) is
; copied into its own addrspace(2) alloca in NO_ALIAS without any test.
define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrspace(2) %B, ptr addrspace(1) %C) {
; CHECK-LABEL: define void @multiply_first_load_same_addr_space(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(2) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP4:%.*]] = alloca [4 x float], align 4, addrspace(2)
; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr addrspace(1) [[A]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]]
; CHECK: [[ALIAS_CONT]]:
; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[A]], i64 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi ptr addrspace(1) [ [[A]], %[[ENTRY]] ], [ [[A]], %[[ALIAS_CONT]] ], [ [[TMP2]], %[[COPY]] ]
; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP4]])
; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP4]], ptr addrspace(2) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP5]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr addrspace(2) [[TMP4]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(2) [[TMP6]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(2) [[TMP6]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(2) [[VEC_GEP3]], align 8
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP7]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP9]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x float> [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP12]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP14]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]]
; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]]
; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP19]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP21]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]]
; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP23]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]]
; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x float> [[TMP22]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x float> [[TMP25]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP26]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP28]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]]
; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP30]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]]
; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x float> [[TMP29]], [[TMP31]]
; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <1 x float> [[TMP32]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> [[TMP33]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr float, ptr addrspace(1) [[C]], i64 0
; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8
; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP2]])
; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP4]])
; CHECK-NEXT: ret void
;
entry:
; Input pattern: both multiply operands come directly from loads and the
; product feeds a single store; only %A shares the store's address space.
%a = load <4 x float>, ptr addrspace(1) %A, align 8
%b = load <4 x float>, ptr addrspace(2) %B, align 8
%c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
store <4 x float> %c, ptr addrspace(1) %C, align 8
ret void
}
; Second load (B) matches store address space, first load (A) differs.
; B gets runtime alias check, A gets unconditional copy.
; Expected control flow: A (addrspace(2)) is copied into its addrspace(2)
; alloca first, in the entry block, with no test. For B (addrspace(1), same as
; C) the entry block then tests B < C+16 (unsigned), the ALIAS_CONT block tests
; C < B+16, and only when both hold does the COPY block memcpy B into the
; addrspace(1) alloca. The phi in NO_ALIAS selects either B itself or that copy.
define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrspace(1) %B, ptr addrspace(1) %C) {
; CHECK-LABEL: define void @multiply_second_load_same_addr_space(
; CHECK-SAME: ptr addrspace(2) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP3:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(2)
; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP0]])
; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP0]], ptr addrspace(2) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[B]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]]
; CHECK: [[ALIAS_CONT]]:
; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[B]], i64 16
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP2]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP3]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
; CHECK-NEXT: [[TMP4:%.*]] = phi ptr addrspace(1) [ [[B]], %[[ENTRY]] ], [ [[B]], %[[ALIAS_CONT]] ], [ [[TMP3]], %[[COPY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr addrspace(2) [[TMP0]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(2) [[TMP5]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(2) [[TMP5]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(2) [[VEC_GEP]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr addrspace(1) [[TMP4]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP6]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP6]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP3]], align 8
; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP7]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP9]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x float> [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP12]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP14]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]]
; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]]
; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP19]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP21]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]]
; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP23]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]]
; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x float> [[TMP22]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x float> [[TMP25]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP26]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP28]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]]
; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1>
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1
; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP30]], i64 0
; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer
; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]]
; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x float> [[TMP29]], [[TMP31]]
; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <1 x float> [[TMP32]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> [[TMP33]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr float, ptr addrspace(1) [[C]], i64 0
; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8
; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP0]])
; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: ret void
;
entry:
; Input pattern: both multiply operands come directly from loads and the
; product feeds a single store; only %B shares the store's address space.
%a = load <4 x float>, ptr addrspace(2) %A, align 8
%b = load <4 x float>, ptr addrspace(1) %B, align 8
%c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)
store <4 x float> %c, ptr addrspace(1) %C, align 8
ret void
}
; Matrix-multiply intrinsic lowered by the pass under test. It takes two
; flattened <4 x float> operands plus three immediate i32 operands; the calls
; visible in this file pass 2, 2, 2 (presumably the 2x2 by 2x2 dimensions --
; see the LangRef entry for the exact operand meaning).
declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)