| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s |
| |
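| ; Verify that llvm.matrix.multiply is expanded into scalar operations for DXIL: |
| ; dx.dot2/3/4 calls for half and float, mul + dx.imad chains for i32, |
| ; fmul + llvm.fmuladd chains for double, and one multiply per element when K=1. |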
| |
| ; Declarations of every llvm.matrix.multiply overload called below. The overload |
| ; suffix is <result type>.<LHS type>.<RHS type>. As the call sites in this file |
| ; show, the three trailing i32 operands are the LHS row count, the shared inner |
| ; dimension (K), and the RHS column count, in that order. |
| declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32) |
| declare <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float>, <6 x float>, i32, i32, i32) |
| declare <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float>, <3 x float>, i32, i32, i32) |
| declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) |
| declare <16 x float> @llvm.matrix.multiply.v16f32.v16f32.v16f32(<16 x float>, <16 x float>, i32, i32, i32) |
| declare <4 x float> @llvm.matrix.multiply.v4f32.v16f32.v4f32(<16 x float>, <4 x float>, i32, i32, i32) |
| declare <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double>, <4 x double>, i32, i32, i32) |
| declare <2 x double> @llvm.matrix.multiply.v2f64.v4f64.v2f64(<4 x double>, <2 x double>, i32, i32, i32) |
| declare <6 x float> @llvm.matrix.multiply.v6f32.v2f32.v3f32(<2 x float>, <3 x float>, i32, i32, i32) |
| declare <6 x i32> @llvm.matrix.multiply.v6i32.v2i32.v3i32(<2 x i32>, <3 x i32>, i32, i32, i32) |
| declare <4 x half> @llvm.matrix.multiply.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, i32, i32, i32) |
| declare <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half>, <3 x half>, i32, i32, i32) |
| declare <6 x half> @llvm.matrix.multiply.v6f16.v2f16.v3f16(<2 x half>, <3 x half>, i32, i32, i32) |
| |
| ; 2x2 float * 2x2 float: 4 dot2 calls. |
| ; All elements of A and B are extracted up front. Result element i + 2*j is then |
| ; dx.dot2 of A elements {i, i+2} (row i) with B elements {2j, 2j+1} (column j), |
| ; i.e. operands and result are indexed as column-major flattened matrices. |
| define <4 x float> @test_float_2x2(<4 x float> %a, <4 x float> %b) { |
| ; CHECK-LABEL: define <4 x float> @test_float_2x2( |
| ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[A]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[A]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[B]], i64 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.dx.dot2.f32(float [[TMP1]], float [[TMP3]], float [[TMP5]], float [[TMP6]]) |
| ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 |
| ; CHECK-NEXT: [[TMP11:%.*]] = call float @llvm.dx.dot2.f32(float [[TMP2]], float [[TMP4]], float [[TMP5]], float [[TMP6]]) |
| ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i64 1 |
| ; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.dx.dot2.f32(float [[TMP1]], float [[TMP3]], float [[TMP7]], float [[TMP8]]) |
| ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i64 2 |
| ; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.dx.dot2.f32(float [[TMP2]], float [[TMP4]], float [[TMP7]], float [[TMP8]]) |
| ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP15]], i64 3 |
| ; CHECK-NEXT: ret <4 x float> [[TMP16]] |
| ; |
| %r = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2) |
| ret <4 x float> %r |
| } |
| |
| ; 1x2 float * 2x3 float (row vector times matrix): 3 dot2 calls. |
| ; The single LHS row is both elements of %v; column j of %m is elements |
| ; {2j, 2j+1}. Result element j is dx.dot2 of the row with column j. |
| define <3 x float> @test_vec_mat(<2 x float> %v, <6 x float> %m) { |
| ; CHECK-LABEL: define <3 x float> @test_vec_mat( |
| ; CHECK-SAME: <2 x float> [[V:%.*]], <6 x float> [[M:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[V]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[V]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x float> [[M]], i64 0 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x float> [[M]], i64 1 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x float> [[M]], i64 2 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x float> [[M]], i64 3 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <6 x float> [[M]], i64 4 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x float> [[M]], i64 5 |
| ; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.dx.dot2.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]]) |
| ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x float> poison, float [[TMP9]], i64 0 |
| ; CHECK-NEXT: [[TMP11:%.*]] = call float @llvm.dx.dot2.f32(float [[TMP1]], float [[TMP2]], float [[TMP5]], float [[TMP6]]) |
| ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <3 x float> [[TMP10]], float [[TMP11]], i64 1 |
| ; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.dx.dot2.f32(float [[TMP1]], float [[TMP2]], float [[TMP7]], float [[TMP8]]) |
| ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x float> [[TMP12]], float [[TMP13]], i64 2 |
| ; CHECK-NEXT: ret <3 x float> [[TMP14]] |
| ; |
| %r = call <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %m, i32 1, i32 2, i32 3) |
| ret <3 x float> %r |
| } |
| |
| ; 2x3 float * 3x1 float (matrix times column vector): 2 dot3 calls. |
| ; Row i of %m is elements {i, i+2, i+4}; the single RHS column is all three |
| ; elements of %v. Result element i is dx.dot3 of row i with that column. |
| define <2 x float> @test_mat_vec(<6 x float> %m, <3 x float> %v) { |
| ; CHECK-LABEL: define <2 x float> @test_mat_vec( |
| ; CHECK-SAME: <6 x float> [[M:%.*]], <3 x float> [[V:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x float> [[M]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x float> [[M]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x float> [[M]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x float> [[M]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x float> [[M]], i64 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x float> [[M]], i64 5 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <3 x float> [[V]], i64 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x float> [[V]], i64 1 |
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <3 x float> [[V]], i64 2 |
| ; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.dx.dot3.f32(float [[TMP1]], float [[TMP3]], float [[TMP5]], float [[TMP7]], float [[TMP8]], float [[TMP9]]) |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 |
| ; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.dx.dot3.f32(float [[TMP2]], float [[TMP4]], float [[TMP6]], float [[TMP7]], float [[TMP8]], float [[TMP9]]) |
| ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP12]], i64 1 |
| ; CHECK-NEXT: ret <2 x float> [[TMP13]] |
| ; |
| %r = call <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %m, <3 x float> %v, i32 2, i32 3, i32 1) |
| ret <2 x float> %r |
| } |
| |
| ; 2x2 i32 * 2x2 i32: mul + imad chains (no dx.dot call is expected for i32). |
| ; For each result element the first row/column pair is multiplied with a plain |
| ; mul, and the second pair is passed to dx.imad together with that mul result |
| ; as the third operand. Operand selection matches the float 2x2 case: row i of |
| ; A is elements {i, i+2}, column j of B is elements {2j, 2j+1}. |
| define <4 x i32> @test_int_2x2(<4 x i32> %a, <4 x i32> %b) { |
| ; CHECK-LABEL: define <4 x i32> @test_int_2x2( |
| ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[A]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[A]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[B]], i64 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP1]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.dx.imad.i32(i32 [[TMP3]], i32 [[TMP6]], i32 [[TMP9]]) |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 |
| ; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP2]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.dx.imad.i32(i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP12]]) |
| ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP13]], i64 1 |
| ; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP1]], [[TMP7]] |
| ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.dx.imad.i32(i32 [[TMP3]], i32 [[TMP8]], i32 [[TMP15]]) |
| ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i64 2 |
| ; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP2]], [[TMP7]] |
| ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.dx.imad.i32(i32 [[TMP4]], i32 [[TMP8]], i32 [[TMP18]]) |
| ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP19]], i64 3 |
| ; CHECK-NEXT: ret <4 x i32> [[TMP20]] |
| ; |
| %r = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b, i32 2, i32 2, i32 2) |
| ret <4 x i32> %r |
| } |
| |
| ; 4x4 float * 4x4 float: 16 dot4 calls. |
| ; All 32 input elements are extracted first. Result element i + 4*j is dx.dot4 |
| ; of A elements {i, i+4, i+8, i+12} (row i) with B elements {4j .. 4j+3} |
| ; (column j); results are inserted in increasing index order. |
| define <16 x float> @test_float_4x4(<16 x float> %a, <16 x float> %b) { |
| ; CHECK-LABEL: define <16 x float> @test_float_4x4( |
| ; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x float> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[A]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[A]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[A]], i64 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[A]], i64 5 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[A]], i64 6 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[A]], i64 7 |
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[A]], i64 8 |
| ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[A]], i64 9 |
| ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[A]], i64 10 |
| ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[A]], i64 11 |
| ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[A]], i64 12 |
| ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[A]], i64 13 |
| ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[A]], i64 14 |
| ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[A]], i64 15 |
| ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x float> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x float> [[B]], i64 3 |
| ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x float> [[B]], i64 4 |
| ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x float> [[B]], i64 5 |
| ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x float> [[B]], i64 6 |
| ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x float> [[B]], i64 7 |
| ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[B]], i64 8 |
| ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x float> [[B]], i64 9 |
| ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x float> [[B]], i64 10 |
| ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x float> [[B]], i64 11 |
| ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x float> [[B]], i64 12 |
| ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x float> [[B]], i64 13 |
| ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x float> [[B]], i64 14 |
| ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x float> [[B]], i64 15 |
| ; CHECK-NEXT: [[TMP33:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP1]], float [[TMP5]], float [[TMP9]], float [[TMP13]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x float> poison, float [[TMP33]], i64 0 |
| ; CHECK-NEXT: [[TMP35:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP2]], float [[TMP6]], float [[TMP10]], float [[TMP14]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x float> [[TMP34]], float [[TMP35]], i64 1 |
| ; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP3]], float [[TMP7]], float [[TMP11]], float [[TMP15]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x float> [[TMP36]], float [[TMP37]], i64 2 |
| ; CHECK-NEXT: [[TMP39:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP4]], float [[TMP8]], float [[TMP12]], float [[TMP16]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x float> [[TMP38]], float [[TMP39]], i64 3 |
| ; CHECK-NEXT: [[TMP41:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP1]], float [[TMP5]], float [[TMP9]], float [[TMP13]], float [[TMP21]], float [[TMP22]], float [[TMP23]], float [[TMP24]]) |
| ; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x float> [[TMP40]], float [[TMP41]], i64 4 |
| ; CHECK-NEXT: [[TMP43:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP2]], float [[TMP6]], float [[TMP10]], float [[TMP14]], float [[TMP21]], float [[TMP22]], float [[TMP23]], float [[TMP24]]) |
| ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x float> [[TMP42]], float [[TMP43]], i64 5 |
| ; CHECK-NEXT: [[TMP45:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP3]], float [[TMP7]], float [[TMP11]], float [[TMP15]], float [[TMP21]], float [[TMP22]], float [[TMP23]], float [[TMP24]]) |
| ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x float> [[TMP44]], float [[TMP45]], i64 6 |
| ; CHECK-NEXT: [[TMP47:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP4]], float [[TMP8]], float [[TMP12]], float [[TMP16]], float [[TMP21]], float [[TMP22]], float [[TMP23]], float [[TMP24]]) |
| ; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x float> [[TMP46]], float [[TMP47]], i64 7 |
| ; CHECK-NEXT: [[TMP49:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP1]], float [[TMP5]], float [[TMP9]], float [[TMP13]], float [[TMP25]], float [[TMP26]], float [[TMP27]], float [[TMP28]]) |
| ; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x float> [[TMP48]], float [[TMP49]], i64 8 |
| ; CHECK-NEXT: [[TMP51:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP2]], float [[TMP6]], float [[TMP10]], float [[TMP14]], float [[TMP25]], float [[TMP26]], float [[TMP27]], float [[TMP28]]) |
| ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x float> [[TMP50]], float [[TMP51]], i64 9 |
| ; CHECK-NEXT: [[TMP53:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP3]], float [[TMP7]], float [[TMP11]], float [[TMP15]], float [[TMP25]], float [[TMP26]], float [[TMP27]], float [[TMP28]]) |
| ; CHECK-NEXT: [[TMP54:%.*]] = insertelement <16 x float> [[TMP52]], float [[TMP53]], i64 10 |
| ; CHECK-NEXT: [[TMP55:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP4]], float [[TMP8]], float [[TMP12]], float [[TMP16]], float [[TMP25]], float [[TMP26]], float [[TMP27]], float [[TMP28]]) |
| ; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x float> [[TMP54]], float [[TMP55]], i64 11 |
| ; CHECK-NEXT: [[TMP57:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP1]], float [[TMP5]], float [[TMP9]], float [[TMP13]], float [[TMP29]], float [[TMP30]], float [[TMP31]], float [[TMP32]]) |
| ; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x float> [[TMP56]], float [[TMP57]], i64 12 |
| ; CHECK-NEXT: [[TMP59:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP2]], float [[TMP6]], float [[TMP10]], float [[TMP14]], float [[TMP29]], float [[TMP30]], float [[TMP31]], float [[TMP32]]) |
| ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x float> [[TMP58]], float [[TMP59]], i64 13 |
| ; CHECK-NEXT: [[TMP61:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP3]], float [[TMP7]], float [[TMP11]], float [[TMP15]], float [[TMP29]], float [[TMP30]], float [[TMP31]], float [[TMP32]]) |
| ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x float> [[TMP60]], float [[TMP61]], i64 14 |
| ; CHECK-NEXT: [[TMP63:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP4]], float [[TMP8]], float [[TMP12]], float [[TMP16]], float [[TMP29]], float [[TMP30]], float [[TMP31]], float [[TMP32]]) |
| ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <16 x float> [[TMP62]], float [[TMP63]], i64 15 |
| ; CHECK-NEXT: ret <16 x float> [[TMP64]] |
| ; |
| %r = call <16 x float> @llvm.matrix.multiply.v16f32.v16f32.v16f32(<16 x float> %a, <16 x float> %b, i32 4, i32 4, i32 4) |
| ret <16 x float> %r |
| } |
| |
| ; 4x4 float * 4x1 float (matrix times column vector): 4 dot4 calls. |
| ; Row i of %m is elements {i, i+4, i+8, i+12}; the single RHS column is all |
| ; four elements of %v. Result element i is dx.dot4 of row i with that column. |
| define <4 x float> @test_mat4x4_vec4(<16 x float> %m, <4 x float> %v) { |
| ; CHECK-LABEL: define <4 x float> @test_mat4x4_vec4( |
| ; CHECK-SAME: <16 x float> [[M:%.*]], <4 x float> [[V:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x float> [[M]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[M]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[M]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[M]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[M]], i64 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[M]], i64 5 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[M]], i64 6 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[M]], i64 7 |
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[M]], i64 8 |
| ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[M]], i64 9 |
| ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[M]], i64 10 |
| ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[M]], i64 11 |
| ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[M]], i64 12 |
| ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[M]], i64 13 |
| ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[M]], i64 14 |
| ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[M]], i64 15 |
| ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[V]], i64 0 |
| ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[V]], i64 1 |
| ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[V]], i64 2 |
| ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[V]], i64 3 |
| ; CHECK-NEXT: [[TMP21:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP1]], float [[TMP5]], float [[TMP9]], float [[TMP13]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP21]], i64 0 |
| ; CHECK-NEXT: [[TMP23:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP2]], float [[TMP6]], float [[TMP10]], float [[TMP14]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP23]], i64 1 |
| ; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP3]], float [[TMP7]], float [[TMP11]], float [[TMP15]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP25]], i64 2 |
| ; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.dx.dot4.f32(float [[TMP4]], float [[TMP8]], float [[TMP12]], float [[TMP16]], float [[TMP17]], float [[TMP18]], float [[TMP19]], float [[TMP20]]) |
| ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP27]], i64 3 |
| ; CHECK-NEXT: ret <4 x float> [[TMP28]] |
| ; |
| %r = call <4 x float> @llvm.matrix.multiply.v4f32.v16f32.v4f32(<16 x float> %m, <4 x float> %v, i32 4, i32 4, i32 1) |
| ret <4 x float> %r |
| } |
| |
| ; 2x2 double * 2x2 double: scalar fmul + fmuladd chains (no dx.dot call is |
| ; expected for double). |
| ; For each result element the first row/column pair is multiplied with fmul, |
| ; and the second pair is passed to llvm.fmuladd with that fmul result as the |
| ; addend. Row i of A is elements {i, i+2}; column j of B is elements {2j, 2j+1}. |
| define <4 x double> @test_double_2x2(<4 x double> %a, <4 x double> %b) { |
| ; CHECK-LABEL: define <4 x double> @test_double_2x2( |
| ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x double> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[A]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[A]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[B]], i64 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = fmul double [[TMP1]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP6]], double [[TMP9]]) |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i64 0 |
| ; CHECK-NEXT: [[TMP12:%.*]] = fmul double [[TMP2]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = call double @llvm.fmuladd.f64(double [[TMP4]], double [[TMP6]], double [[TMP12]]) |
| ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP13]], i64 1 |
| ; CHECK-NEXT: [[TMP15:%.*]] = fmul double [[TMP1]], [[TMP7]] |
| ; CHECK-NEXT: [[TMP16:%.*]] = call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP8]], double [[TMP15]]) |
| ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP14]], double [[TMP16]], i64 2 |
| ; CHECK-NEXT: [[TMP18:%.*]] = fmul double [[TMP2]], [[TMP7]] |
| ; CHECK-NEXT: [[TMP19:%.*]] = call double @llvm.fmuladd.f64(double [[TMP4]], double [[TMP8]], double [[TMP18]]) |
| ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP19]], i64 3 |
| ; CHECK-NEXT: ret <4 x double> [[TMP20]] |
| ; |
| %r = call <4 x double> @llvm.matrix.multiply.v4f64.v4f64.v4f64(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2) |
| ret <4 x double> %r |
| } |
| |
| ; 2x2 double * 2x1 double (matrix times column vector): 2 scalar |
| ; fmul + fmuladd chains, one per result element. |
| ; Row i of %m is elements {i, i+2}; the single RHS column is both elements of |
| ; %v. Each chain is an fmul of the first pair followed by llvm.fmuladd of the |
| ; second pair with the fmul result as the addend. |
| define <2 x double> @test_double_mat_vec(<4 x double> %m, <2 x double> %v) { |
| ; CHECK-LABEL: define <2 x double> @test_double_mat_vec( |
| ; CHECK-SAME: <4 x double> [[M:%.*]], <2 x double> [[V:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x double> [[M]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[M]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[M]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[M]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[V]], i64 0 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[V]], i64 1 |
| ; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP1]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP6]], double [[TMP7]]) |
| ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 |
| ; CHECK-NEXT: [[TMP10:%.*]] = fmul double [[TMP2]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = call double @llvm.fmuladd.f64(double [[TMP4]], double [[TMP6]], double [[TMP10]]) |
| ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP11]], i64 1 |
| ; CHECK-NEXT: ret <2 x double> [[TMP12]] |
| ; |
| %r = call <2 x double> @llvm.matrix.multiply.v2f64.v4f64.v2f64(<4 x double> %m, <2 x double> %v, i32 2, i32 2, i32 1) |
| ret <2 x double> %r |
| } |
| |
| ; K=1 float outer product (2x1 * 1x3 = 2x3): each element is a single fmul. |
| ; With an inner dimension of 1 no dot intrinsic or fmuladd is expected; result |
| ; element i + 2*j is just a[i] * b[j]. |
| define <6 x float> @test_k1_outer_product(<2 x float> %a, <3 x float> %b) { |
| ; CHECK-LABEL: define <6 x float> @test_k1_outer_product( |
| ; CHECK-SAME: <2 x float> [[A:%.*]], <3 x float> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x float> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x float> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP1]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x float> poison, float [[TMP6]], i64 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = fmul float [[TMP2]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <6 x float> [[TMP7]], float [[TMP8]], i64 1 |
| ; CHECK-NEXT: [[TMP10:%.*]] = fmul float [[TMP1]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <6 x float> [[TMP9]], float [[TMP10]], i64 2 |
| ; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP2]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <6 x float> [[TMP11]], float [[TMP12]], i64 3 |
| ; CHECK-NEXT: [[TMP14:%.*]] = fmul float [[TMP1]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x float> [[TMP13]], float [[TMP14]], i64 4 |
| ; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP2]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <6 x float> [[TMP15]], float [[TMP16]], i64 5 |
| ; CHECK-NEXT: ret <6 x float> [[TMP17]] |
| ; |
| %r = call <6 x float> @llvm.matrix.multiply.v6f32.v2f32.v3f32(<2 x float> %a, <3 x float> %b, i32 2, i32 1, i32 3) |
| ret <6 x float> %r |
| } |
| |
| ; 2x2 half * 2x2 half: 4 dot2 calls. |
| ; Same shape as the float 2x2 case but with the .f16 overload of dx.dot2: |
| ; result element i + 2*j is dx.dot2 of A elements {i, i+2} (row i) with |
| ; B elements {2j, 2j+1} (column j). |
| define <4 x half> @test_half_2x2(<4 x half> %a, <4 x half> %b) { |
| ; CHECK-LABEL: define <4 x half> @test_half_2x2( |
| ; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x half> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x half> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x half> [[A]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x half> [[A]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x half> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x half> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x half> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x half> [[B]], i64 3 |
| ; CHECK-NEXT: [[TMP9:%.*]] = call half @llvm.dx.dot2.f16(half [[TMP1]], half [[TMP3]], half [[TMP5]], half [[TMP6]]) |
| ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x half> poison, half [[TMP9]], i64 0 |
| ; CHECK-NEXT: [[TMP11:%.*]] = call half @llvm.dx.dot2.f16(half [[TMP2]], half [[TMP4]], half [[TMP5]], half [[TMP6]]) |
| ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x half> [[TMP10]], half [[TMP11]], i64 1 |
| ; CHECK-NEXT: [[TMP13:%.*]] = call half @llvm.dx.dot2.f16(half [[TMP1]], half [[TMP3]], half [[TMP7]], half [[TMP8]]) |
| ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x half> [[TMP12]], half [[TMP13]], i64 2 |
| ; CHECK-NEXT: [[TMP15:%.*]] = call half @llvm.dx.dot2.f16(half [[TMP2]], half [[TMP4]], half [[TMP7]], half [[TMP8]]) |
| ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x half> [[TMP14]], half [[TMP15]], i64 3 |
| ; CHECK-NEXT: ret <4 x half> [[TMP16]] |
| ; |
| %r = call <4 x half> @llvm.matrix.multiply.v4f16.v4f16.v4f16(<4 x half> %a, <4 x half> %b, i32 2, i32 2, i32 2) |
| ret <4 x half> %r |
| } |
| |
| ; 2x3 half * 3x1 half (matrix times column vector): 2 dot3 calls. |
| ; Uses the .f16 overload of dx.dot3. Row i of %m is elements {i, i+2, i+4}; |
| ; the single RHS column is all three elements of %v. |
| define <2 x half> @test_half_mat_vec(<6 x half> %m, <3 x half> %v) { |
| ; CHECK-LABEL: define <2 x half> @test_half_mat_vec( |
| ; CHECK-SAME: <6 x half> [[M:%.*]], <3 x half> [[V:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x half> [[M]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x half> [[M]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x half> [[M]], i64 2 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x half> [[M]], i64 3 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x half> [[M]], i64 4 |
| ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x half> [[M]], i64 5 |
| ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <3 x half> [[V]], i64 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x half> [[V]], i64 1 |
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <3 x half> [[V]], i64 2 |
| ; CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.dx.dot3.f16(half [[TMP1]], half [[TMP3]], half [[TMP5]], half [[TMP7]], half [[TMP8]], half [[TMP9]]) |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x half> poison, half [[TMP10]], i64 0 |
| ; CHECK-NEXT: [[TMP12:%.*]] = call half @llvm.dx.dot3.f16(half [[TMP2]], half [[TMP4]], half [[TMP6]], half [[TMP7]], half [[TMP8]], half [[TMP9]]) |
| ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x half> [[TMP11]], half [[TMP12]], i64 1 |
| ; CHECK-NEXT: ret <2 x half> [[TMP13]] |
| ; |
| %r = call <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %m, <3 x half> %v, i32 2, i32 3, i32 1) |
| ret <2 x half> %r |
| } |
| |
| ; K=1 half outer product (2x1 * 1x3 = 2x3): each element is a single fmul. |
| ; With an inner dimension of 1 no dot intrinsic or fmuladd is expected; result |
| ; element i + 2*j is just a[i] * b[j], computed on half operands. |
| define <6 x half> @test_k1_half_outer_product(<2 x half> %a, <3 x half> %b) { |
| ; CHECK-LABEL: define <6 x half> @test_k1_half_outer_product( |
| ; CHECK-SAME: <2 x half> [[A:%.*]], <3 x half> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x half> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x half> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x half> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP6:%.*]] = fmul half [[TMP1]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x half> poison, half [[TMP6]], i64 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = fmul half [[TMP2]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <6 x half> [[TMP7]], half [[TMP8]], i64 1 |
| ; CHECK-NEXT: [[TMP10:%.*]] = fmul half [[TMP1]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <6 x half> [[TMP9]], half [[TMP10]], i64 2 |
| ; CHECK-NEXT: [[TMP12:%.*]] = fmul half [[TMP2]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <6 x half> [[TMP11]], half [[TMP12]], i64 3 |
| ; CHECK-NEXT: [[TMP14:%.*]] = fmul half [[TMP1]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x half> [[TMP13]], half [[TMP14]], i64 4 |
| ; CHECK-NEXT: [[TMP16:%.*]] = fmul half [[TMP2]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <6 x half> [[TMP15]], half [[TMP16]], i64 5 |
| ; CHECK-NEXT: ret <6 x half> [[TMP17]] |
| ; |
| %r = call <6 x half> @llvm.matrix.multiply.v6f16.v2f16.v3f16(<2 x half> %a, <3 x half> %b, i32 2, i32 1, i32 3) |
| ret <6 x half> %r |
| } |
| |
| ; K=1 integer outer product (2x1 * 1x3 = 2x3): each element is a single mul. |
| ; With an inner dimension of 1 no dx.imad call is expected; result element |
| ; i + 2*j is just a[i] * b[j]. |
| define <6 x i32> @test_k1_int_outer_product(<2 x i32> %a, <3 x i32> %b) { |
| ; CHECK-LABEL: define <6 x i32> @test_k1_int_outer_product( |
| ; CHECK-SAME: <2 x i32> [[A:%.*]], <3 x i32> [[B:%.*]]) { |
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[A]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[A]], i64 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i32> [[B]], i64 0 |
| ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x i32> [[B]], i64 1 |
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x i32> [[B]], i64 2 |
| ; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP1]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x i32> poison, i32 [[TMP6]], i64 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP2]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <6 x i32> [[TMP7]], i32 [[TMP8]], i64 1 |
| ; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP1]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <6 x i32> [[TMP9]], i32 [[TMP10]], i64 2 |
| ; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP2]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <6 x i32> [[TMP11]], i32 [[TMP12]], i64 3 |
| ; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP1]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i32> [[TMP13]], i32 [[TMP14]], i64 4 |
| ; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP2]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <6 x i32> [[TMP15]], i32 [[TMP16]], i64 5 |
| ; CHECK-NEXT: ret <6 x i32> [[TMP17]] |
| ; |
| %r = call <6 x i32> @llvm.matrix.multiply.v6i32.v2i32.v3i32(<2 x i32> %a, <3 x i32> %b, i32 2, i32 1, i32 3) |
| ret <6 x i32> %r |
| } |