[AArch64] Add bf16 broadcast and transpose costs
These are only based on the size of the element, not the type (although the
codegen does need to account for it).
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0e0b30b..97e4993 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5607,6 +5607,8 @@
{TTI::SK_Broadcast, MVT::v2i64, 1},
{TTI::SK_Broadcast, MVT::v4f16, 1},
{TTI::SK_Broadcast, MVT::v8f16, 1},
+ {TTI::SK_Broadcast, MVT::v4bf16, 1},
+ {TTI::SK_Broadcast, MVT::v8bf16, 1},
{TTI::SK_Broadcast, MVT::v2f32, 1},
{TTI::SK_Broadcast, MVT::v4f32, 1},
{TTI::SK_Broadcast, MVT::v2f64, 1},
@@ -5621,6 +5623,8 @@
{TTI::SK_Transpose, MVT::v2i64, 1},
{TTI::SK_Transpose, MVT::v4f16, 1},
{TTI::SK_Transpose, MVT::v8f16, 1},
+ {TTI::SK_Transpose, MVT::v4bf16, 1},
+ {TTI::SK_Transpose, MVT::v8bf16, 1},
{TTI::SK_Transpose, MVT::v2f32, 1},
{TTI::SK_Transpose, MVT::v4f32, 1},
{TTI::SK_Transpose, MVT::v2f64, 1},
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
index 0730f7c..6175ea4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
@@ -31,10 +31,10 @@
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1bf16 = shufflevector <1 x bfloat> undef, <1 x bfloat> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1f32 = shufflevector <1 x float> undef, <1 x float> undef, <1 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
@@ -116,8 +116,8 @@
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 0 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <1 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 0 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <1 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <4 x i32> zeroinitializer
@@ -194,9 +194,9 @@
; CHECK-NEXT: Cost Model: Found costs of 4 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1bf16 = shufflevector <1 x bfloat> undef, <1 x bfloat> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1f32 = shufflevector <1 x float> undef, <1 x float> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
index 35cfa88..4c48430 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
@@ -218,3 +218,39 @@
%tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
ret <8 x half> %tmp0
}
+
+define <4 x bfloat> @trn1.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
+; CHECK-LABEL: 'trn1.v4bf16'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0
+;
+ %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x bfloat> %tmp0
+}
+
+define <4 x bfloat> @trn2.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
+; CHECK-LABEL: 'trn2.v4bf16'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0
+;
+ %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x bfloat> %tmp0
+}
+
+define <8 x bfloat> @trn1.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
+; CHECK-LABEL: 'trn1.v8bf16'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0
+;
+ %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x bfloat> %tmp0
+}
+
+define <8 x bfloat> @trn2.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
+; CHECK-LABEL: 'trn2.v8bf16'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0
+;
+ %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x bfloat> %tmp0
+}