| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: opt -passes=lower-matrix-intrinsics -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s |
| |
| ; Test that fused matrix lowering handles loads and stores in different address |
| ; spaces without crashing. When a load's address space differs from the store's, |
| ; the loaded operand is unconditionally copied to a local buffer instead of |
| ; being guarded by a runtime alias check. |
| define void @multiply_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(1) %B, ptr addrspace(2) %C) { |
| ; CHECK-LABEL: define void @multiply_diff_addr_spaces( |
| ; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1) |
| ; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(1) |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP1]]) |
| ; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false) |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]]) |
| ; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false) |
| ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP2]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8 |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP3]], align 8 |
| ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP5:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] |
| ; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = fadd <1 x float> [[TMP5]], [[TMP7]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <1 x float> [[TMP8]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP9]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]] |
| ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]] |
| ; CHECK-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]] |
| ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP16]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP18]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]] |
| ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP20]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]] |
| ; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x float> [[TMP19]], [[TMP21]] |
| ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <1 x float> [[TMP22]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP23]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP25]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP26:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]] |
| ; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP27]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP28:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]] |
| ; CHECK-NEXT: [[TMP29:%.*]] = fadd <1 x float> [[TMP26]], [[TMP28]] |
| ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <1 x float> [[TMP29]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> [[TMP30]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr float, ptr addrspace(2) [[C]], i64 0 |
| ; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2 |
| ; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8 |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP1]]) |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]]) |
| ; CHECK-NEXT: ret void |
| ; |
| ; Input IR: %A and %B are both loaded from addrspace(1) and the product is |
| ; stored to %C in addrspace(2); all three dimension arguments of the multiply |
| ; are 2. The checks above expect each operand to be memcpy'd into its own |
| ; addrspace(1) alloca with no pointer comparison or branch beforehand, and the |
| ; two result columns to be stored directly through %C. |
| entry: |
| %a = load <4 x float>, ptr addrspace(1) %A, align 8 |
| %b = load <4 x float>, ptr addrspace(1) %B, align 8 |
| %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2) |
| store <4 x float> %c, ptr addrspace(2) %C, align 8 |
| ret void |
| } |
| |
| ; Both loads and the store use three distinct address spaces (A: 1, B: 3, C: 2). |
| ; Neither load matches the store, so both operands get an unconditional copy. |
| define void @multiply_all_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(3) %B, ptr addrspace(2) %C) { |
| ; CHECK-LABEL: define void @multiply_all_diff_addr_spaces( |
| ; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(3) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(3) |
| ; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1) |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]]) |
| ; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false) |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p3(ptr addrspace(3) [[TMP1]]) |
| ; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[TMP1]], ptr addrspace(3) align 8 [[B]], i64 16, i1 false) |
| ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP2]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8 |
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(3) [[TMP1]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(3) [[TMP3]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(3) [[TMP3]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(3) [[VEC_GEP3]], align 8 |
| ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP5:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] |
| ; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP7:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = fadd <1 x float> [[TMP5]], [[TMP7]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <1 x float> [[TMP8]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP9]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]] |
| ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]] |
| ; CHECK-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]] |
| ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <1 x float> [[TMP15]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP16]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP18]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]] |
| ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP20]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]] |
| ; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x float> [[TMP19]], [[TMP21]] |
| ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <1 x float> [[TMP22]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP23]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP25]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP26:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]] |
| ; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP27]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP28:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]] |
| ; CHECK-NEXT: [[TMP29:%.*]] = fadd <1 x float> [[TMP26]], [[TMP28]] |
| ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <1 x float> [[TMP29]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP24]], <2 x float> [[TMP30]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr float, ptr addrspace(2) [[C]], i64 0 |
| ; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2 |
| ; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8 |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]]) |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p3(ptr addrspace(3) [[TMP1]]) |
| ; CHECK-NEXT: ret void |
| ; |
| ; Input IR: %A is loaded from addrspace(1), %B from addrspace(3), and the |
| ; product is stored to %C in addrspace(2). The checks above expect %A to be |
| ; memcpy'd into an addrspace(1) alloca and %B into an addrspace(3) alloca, |
| ; both without any pointer comparison or branch beforehand. |
| entry: |
| %a = load <4 x float>, ptr addrspace(1) %A, align 8 |
| %b = load <4 x float>, ptr addrspace(3) %B, align 8 |
| %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2) |
| store <4 x float> %c, ptr addrspace(2) %C, align 8 |
| ret void |
| } |
| |
| ; First load (A) matches store address space, second load (B) differs. |
| ; A gets runtime alias check, B gets unconditional copy. |
| define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrspace(2) %B, ptr addrspace(1) %C) { |
| ; CHECK-LABEL: define void @multiply_first_load_same_addr_space( |
| ; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(2) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) { |
| ; CHECK-NEXT: [[ENTRY:.*]]: |
| ; CHECK-NEXT: [[TMP4:%.*]] = alloca [4 x float], align 4, addrspace(2) |
| ; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4, addrspace(1) |
| ; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16 |
| ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr addrspace(1) [[A]], [[STORE_END]] |
| ; CHECK-NEXT: br i1 [[TMP0]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]] |
| ; CHECK: [[ALIAS_CONT]]: |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP2]]) |
| ; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[A]], i64 16 |
| ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]] |
| ; CHECK-NEXT: br i1 [[TMP1]], label %[[COPY:.*]], label %[[NO_ALIAS]] |
| ; CHECK: [[COPY]]: |
| ; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false) |
| ; CHECK-NEXT: br label %[[NO_ALIAS]] |
| ; CHECK: [[NO_ALIAS]]: |
| ; CHECK-NEXT: [[TMP3:%.*]] = phi ptr addrspace(1) [ [[A]], %[[ENTRY]] ], [ [[A]], %[[ALIAS_CONT]] ], [ [[TMP2]], %[[COPY]] ] |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP4]]) |
| ; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP4]], ptr addrspace(2) align 8 [[B]], i64 16, i1 false) |
| ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP5]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8 |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr addrspace(2) [[TMP4]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(2) [[TMP6]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(2) [[TMP6]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(2) [[VEC_GEP3]], align 8 |
| ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP7]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] |
| ; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP9]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x float> [[TMP8]], [[TMP10]] |
| ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP12]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP14]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]] |
| ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]] |
| ; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]] |
| ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP19]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP21]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]] |
| ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP23]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]] |
| ; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x float> [[TMP22]], [[TMP24]] |
| ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x float> [[TMP25]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP26]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP28]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]] |
| ; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP30]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]] |
| ; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x float> [[TMP29]], [[TMP31]] |
| ; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <1 x float> [[TMP32]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> [[TMP33]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr float, ptr addrspace(1) [[C]], i64 0 |
| ; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2 |
| ; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8 |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP2]]) |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP4]]) |
| ; CHECK-NEXT: ret void |
| ; |
| ; Input IR: %A and %C are both in addrspace(1); %B is in addrspace(2). The |
| ; checks above expect two icmp ult range tests between %A and %C, with the |
| ; memcpy of %A placed in the COPY block and a phi in NO_ALIAS selecting either |
| ; %A or the local buffer. %B is memcpy'd into an addrspace(2) alloca in |
| ; NO_ALIAS with no comparison guarding it. |
| entry: |
| %a = load <4 x float>, ptr addrspace(1) %A, align 8 |
| %b = load <4 x float>, ptr addrspace(2) %B, align 8 |
| %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2) |
| store <4 x float> %c, ptr addrspace(1) %C, align 8 |
| ret void |
| } |
| |
| ; Second load (B) matches store address space, first load (A) differs. |
| ; B gets runtime alias check, A gets unconditional copy. |
| define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrspace(1) %B, ptr addrspace(1) %C) { |
| ; CHECK-LABEL: define void @multiply_second_load_same_addr_space( |
| ; CHECK-SAME: ptr addrspace(2) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) { |
| ; CHECK-NEXT: [[ENTRY:.*]]: |
| ; CHECK-NEXT: [[TMP3:%.*]] = alloca [4 x float], align 4, addrspace(1) |
| ; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(2) |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP0]]) |
| ; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP0]], ptr addrspace(2) align 8 [[A]], i64 16, i1 false) |
| ; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16 |
| ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[B]], [[STORE_END]] |
| ; CHECK-NEXT: br i1 [[TMP1]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]] |
| ; CHECK: [[ALIAS_CONT]]: |
| ; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP3]]) |
| ; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[B]], i64 16 |
| ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]] |
| ; CHECK-NEXT: br i1 [[TMP2]], label %[[COPY:.*]], label %[[NO_ALIAS]] |
| ; CHECK: [[COPY]]: |
| ; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP3]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false) |
| ; CHECK-NEXT: br label %[[NO_ALIAS]] |
| ; CHECK: [[NO_ALIAS]]: |
| ; CHECK-NEXT: [[TMP4:%.*]] = phi ptr addrspace(1) [ [[B]], %[[ENTRY]] ], [ [[B]], %[[ALIAS_CONT]] ], [ [[TMP3]], %[[COPY]] ] |
| ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr addrspace(2) [[TMP0]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(2) [[TMP5]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(2) [[TMP5]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(2) [[VEC_GEP]], align 8 |
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr addrspace(1) [[TMP4]], i64 0 |
| ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP6]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP6]], i64 2 |
| ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP3]], align 8 |
| ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP7]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP8:%.*]] = fmul <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] |
| ; CHECK-NEXT: [[BLOCK5:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x float> poison, float [[TMP9]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT6]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP10:%.*]] = fmul <1 x float> [[BLOCK5]], [[SPLAT_SPLAT7]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = fadd <1 x float> [[TMP8]], [[TMP10]] |
| ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP12]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK8:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x float> poison, float [[TMP14]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT10]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP15:%.*]] = fmul <1 x float> [[BLOCK9]], [[SPLAT_SPLAT11]] |
| ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[COL_LOAD2]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT13]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK12]], [[SPLAT_SPLAT14]] |
| ; CHECK-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]] |
| ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP19]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x float> poison, float [[TMP21]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT16]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP22:%.*]] = fmul <1 x float> [[BLOCK15]], [[SPLAT_SPLAT17]] |
| ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x float> poison, float [[TMP23]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT19]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x float> [[BLOCK18]], [[SPLAT_SPLAT20]] |
| ; CHECK-NEXT: [[TMP25:%.*]] = fadd <1 x float> [[TMP22]], [[TMP24]] |
| ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x float> [[TMP25]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> [[TMP26]], <2 x i32> <i32 2, i32 1> |
| ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[BLOCK22:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x float> poison, float [[TMP28]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT23]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x float> [[BLOCK22]], [[SPLAT_SPLAT24]] |
| ; CHECK-NEXT: [[BLOCK25:%.*]] = shufflevector <2 x float> [[COL_LOAD1]], <2 x float> poison, <1 x i32> <i32 1> |
| ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[COL_LOAD4]], i64 1 |
| ; CHECK-NEXT: [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x float> poison, float [[TMP30]], i64 0 |
| ; CHECK-NEXT: [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT26]], <1 x float> poison, <1 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x float> [[BLOCK25]], [[SPLAT_SPLAT27]] |
| ; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x float> [[TMP29]], [[TMP31]] |
| ; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <1 x float> [[TMP32]], <1 x float> poison, <2 x i32> <i32 0, i32 poison> |
| ; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> [[TMP33]], <2 x i32> <i32 0, i32 2> |
| ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr float, ptr addrspace(1) [[C]], i64 0 |
| ; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8 |
| ; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2 |
| ; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8 |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP0]]) |
| ; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP3]]) |
| ; CHECK-NEXT: ret void |
| ; |
| ; Input IR: %A is in addrspace(2); %B and %C are both in addrspace(1). The |
| ; checks above expect %A to be memcpy'd into an addrspace(2) alloca in the |
| ; entry block with no comparison guarding it, followed by two icmp ult range |
| ; tests between %B and %C, with the memcpy of %B placed in the COPY block and a |
| ; phi in NO_ALIAS selecting either %B or the local buffer. |
| entry: |
| %a = load <4 x float>, ptr addrspace(2) %A, align 8 |
| %b = load <4 x float>, ptr addrspace(1) %B, align 8 |
| %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2) |
| store <4 x float> %c, ptr addrspace(1) %C, align 8 |
| ret void |
| } |
| |
| declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) |