[AArch64] Add memory op cost model for SVE
This patch adds and fixes the memory op cost model for SVE with
fixed-width vectors.
Differential Revision: https://reviews.llvm.org/D90950
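
For illustration, the effect of the new model can be observed with the
cost analysis pass. A minimal sketch mirroring the load4 case from the
new test (the file name is hypothetical; the costs come from the test
expectations below):

  ; example.ll
  define <4 x i8> @load4(<4 x i8>* %ptr) {
    ; Scalarized under the Neon cost model (cost 64); a single
    ; contiguous load once SVE registers of at least 256 bits are
    ; assumed (cost 1).
    %out = load <4 x i8>, <4 x i8>* %ptr
    ret <4 x i8> %out
  }

  opt -cost-model -analyze -mtriple=aarch64--linux-gnu \
      -mattr=+sve -aarch64-sve-vector-bits-min=256 example.ll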
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5d30b5f..15c67b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -269,7 +269,7 @@
addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
}
- if (useSVEForFixedLengthVectors()) {
+ if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
@@ -1085,7 +1085,7 @@
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
- if (useSVEForFixedLengthVectors()) {
+ if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addTypeForFixedLengthSVE(VT);
@@ -4140,14 +4140,13 @@
}
}
-bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
- // Prefer NEON unless larger SVE registers are available.
- return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
+bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
+ return !Subtarget->useSVEForFixedLengthVectors();
}
bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
EVT VT, bool OverrideNEON) const {
- if (!useSVEForFixedLengthVectors())
+ if (!Subtarget->useSVEForFixedLengthVectors())
return false;
if (!VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index bfc83a9..47248b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -773,9 +773,7 @@
/// illegal as the original, thus leading to an infinite legalisation loop.
/// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal
/// vector types this override can be removed.
- bool mergeStoresAfterLegalization(EVT VT) const override {
- return !useSVEForFixedLengthVectors();
- }
+ bool mergeStoresAfterLegalization(EVT VT) const override;
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
@@ -1008,7 +1006,6 @@
bool shouldLocalize(const MachineInstr &MI,
const TargetTransformInfo *TTI) const override;
- bool useSVEForFixedLengthVectors() const;
// Normally SVE is only used for byte size vectors that do not fit within a
// NEON vector. This changes when OverrideNEON is true, allowing SVE to be
// used for 64bit and 128bit vectors as well.
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index fdf979b..b4d71ac 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -368,3 +368,8 @@
return (SVEVectorBitsMin / 128) * 128;
return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}
+
+bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
+ // Prefer NEON unless larger SVE registers are available.
+ return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 67c682c..4eb4843 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -555,6 +555,7 @@
// implied by the architecture.
unsigned getMaxSVEVectorSizeInBits() const;
unsigned getMinSVEVectorSizeInBits() const;
+ bool useSVEForFixedLengthVectors() const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 595f403..4f7ebff 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -751,6 +751,10 @@
return Options;
}
+bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
+ return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
+}
+
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
MaybeAlign Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
@@ -778,7 +782,7 @@
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() &&
+ if (useNeonVector(Ty) &&
cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
unsigned ProfitableNumElements;
if (Opcode == Instruction::Store)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a624f8b..baf11cd 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -147,6 +147,7 @@
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
+ bool useNeonVector(const Type *Ty) const;
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace,
diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
new file mode 100644
index 0000000..3a4e0f0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -0,0 +1,88 @@
+; Check the memory op cost model for fixed-width vectors with SVE and Neon.
+; Vector sizes below the 256-bit SVE minimum fall back to the Neon cost
+; model, so CHECK-NEON and CHECK-SVE-128 are expected to match.
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s --check-prefix=CHECK-SVE-128
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s --check-prefix=CHECK-SVE-256
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s --check-prefix=CHECK-SVE-512
+
+define <16 x i8> @load16(<16 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load16':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <16 x i8>, <16 x i8>* %ptr
+ ret <16 x i8> %out
+}
+
+define void @store16(<16 x i8>* %ptr, <16 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store16':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ store <16 x i8> %val, <16 x i8>* %ptr
+ ret void
+}
+
+define <8 x i8> @load8(<8 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load8':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <8 x i8>, <8 x i8>* %ptr
+ ret <8 x i8> %out
+}
+
+define void @store8(<8 x i8>* %ptr, <8 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store8':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ store <8 x i8> %val, <8 x i8>* %ptr
+ ret void
+}
+
+define <4 x i8> @load4(<4 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load4':
+; CHECK-NEON: Cost Model: Found an estimated cost of 64 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 64 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <4 x i8>, <4 x i8>* %ptr
+ ret <4 x i8> %out
+}
+
+define void @store4(<4 x i8>* %ptr, <4 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store4':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ store <4 x i8> %val, <4 x i8>* %ptr
+ ret void
+}
+
+define <16 x i16> @load_256(<16 x i16>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load_256':
+; CHECK-NEON: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <16 x i16>, <16 x i16>* %ptr
+ ret <16 x i16> %out
+}
+
+define <8 x i64> @load_512(<8 x i64>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load_512':
+; CHECK-NEON: Cost Model: Found an estimated cost of 4 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 4 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <8 x i64>, <8 x i64>* %ptr
+ ret <8 x i64> %out
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll
new file mode 100644
index 0000000..1a7b262
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll
@@ -0,0 +1,51 @@
+; Check that the memory cost model does not break when using scalable vectors.
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 8 x i8> @load-sve-8(<vscale x 8 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-8':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ %retval = load <vscale x 8 x i8>, <vscale x 8 x i8>* %ptr
+ ret <vscale x 8 x i8> %retval
+}
+
+define void @store-sve-8(<vscale x 8 x i8>* %ptr, <vscale x 8 x i8> %val) {
+; CHECK-LABEL: 'store-sve-8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ store <vscale x 8 x i8> %val, <vscale x 8 x i8>* %ptr
+ ret void
+}
+
+define <vscale x 16 x i8> @load-sve-16(<vscale x 16 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-16':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ %retval = load <vscale x 16 x i8>, <vscale x 16 x i8>* %ptr
+ ret <vscale x 16 x i8> %retval
+}
+
+define void @store-sve-16(<vscale x 16 x i8>* %ptr, <vscale x 16 x i8> %val) {
+; CHECK-LABEL: 'store-sve-16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ store <vscale x 16 x i8> %val, <vscale x 16 x i8>* %ptr
+ ret void
+}
+
+define <vscale x 32 x i8> @load-sve-32(<vscale x 32 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-32':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ %retval = load <vscale x 32 x i8>, <vscale x 32 x i8>* %ptr
+ ret <vscale x 32 x i8> %retval
+}
+
+define void @store-sve-32(<vscale x 32 x i8>* %ptr, <vscale x 32 x i8> %val) {
+; CHECK-LABEL: 'store-sve-32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ store <vscale x 32 x i8> %val, <vscale x 32 x i8>* %ptr
+ ret void
+}