[AArch64] Add memory op cost model for SVE
This patch adds and fixes the memory op cost model for SVE with
fixed-width vectors.
Differential Revision: https://reviews.llvm.org/D90950
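
For illustration, the effect of the new model can be observed with the
cost analysis pass. A minimal sketch mirroring the load4 case from the
new test (the file name is hypothetical; the costs come from the test
expectations below):

  ; example.ll
  define <4 x i8> @load4(<4 x i8>* %ptr) {
    ; Scalarized under the Neon cost model (cost 64); a single
    ; contiguous load once SVE registers of at least 256 bits are
    ; assumed (cost 1).
    %out = load <4 x i8>, <4 x i8>* %ptr
    ret <4 x i8> %out
  }

  opt -cost-model -analyze -mtriple=aarch64--linux-gnu \
      -mattr=+sve -aarch64-sve-vector-bits-min=256 example.ll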
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5d30b5f..15c67b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -269,7 +269,7 @@
addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
}
- if (useSVEForFixedLengthVectors()) {
+ if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
@@ -1085,7 +1085,7 @@
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
- if (useSVEForFixedLengthVectors()) {
+ if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addTypeForFixedLengthSVE(VT);
@@ -4140,14 +4140,13 @@
}
}
-bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
- // Prefer NEON unless larger SVE registers are available.
- return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
+bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
+ return !Subtarget->useSVEForFixedLengthVectors();
}
bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
EVT VT, bool OverrideNEON) const {
- if (!useSVEForFixedLengthVectors())
+ if (!Subtarget->useSVEForFixedLengthVectors())
return false;
if (!VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index bfc83a9..47248b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -773,9 +773,7 @@
/// illegal as the original, thus leading to an infinite legalisation loop.
/// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal
/// vector types this override can be removed.
- bool mergeStoresAfterLegalization(EVT VT) const override {
- return !useSVEForFixedLengthVectors();
- }
+ bool mergeStoresAfterLegalization(EVT VT) const override;
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
@@ -1008,7 +1006,6 @@
bool shouldLocalize(const MachineInstr &MI,
const TargetTransformInfo *TTI) const override;
- bool useSVEForFixedLengthVectors() const;
// Normally SVE is only used for byte size vectors that do not fit within a
// NEON vector. This changes when OverrideNEON is true, allowing SVE to be
// used for 64bit and 128bit vectors as well.
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index fdf979b..b4d71ac 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -368,3 +368,8 @@
return (SVEVectorBitsMin / 128) * 128;
return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}
+
+bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
+ // Prefer NEON unless larger SVE registers are available.
+ return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 67c682c..4eb4843 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -555,6 +555,7 @@
// implied by the architecture.
unsigned getMaxSVEVectorSizeInBits() const;
unsigned getMinSVEVectorSizeInBits() const;
+ bool useSVEForFixedLengthVectors() const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 595f403..4f7ebff 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -751,6 +751,10 @@
return Options;
}
+bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
+ return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
+}
+
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
MaybeAlign Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
@@ -778,7 +782,7 @@
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() &&
+ if (useNeonVector(Ty) &&
cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
unsigned ProfitableNumElements;
if (Opcode == Instruction::Store)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a624f8b..baf11cd 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -147,6 +147,7 @@
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
+ bool useNeonVector(const Type *Ty) const;
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace,
diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
new file mode 100644
index 0000000..3a4e0f0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -0,0 +1,88 @@
+; Check the memory op cost model for fixed-width vectors with SVE and Neon.
+; Vector sizes below the 256-bit SVE minimum fall back to the Neon cost
+; model, so CHECK-NEON and CHECK-SVE-128 are expected to match.
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s --check-prefix=CHECK-SVE-128
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s --check-prefix=CHECK-SVE-256
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s --check-prefix=CHECK-SVE-512
+
+define <16 x i8> @load16(<16 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load16':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <16 x i8>, <16 x i8>* %ptr
+ ret <16 x i8> %out
+}
+
+define void @store16(<16 x i8>* %ptr, <16 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store16':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ store <16 x i8> %val, <16 x i8>* %ptr
+ ret void
+}
+
+define <8 x i8> @load8(<8 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load8':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <8 x i8>, <8 x i8>* %ptr
+ ret <8 x i8> %out
+}
+
+define void @store8(<8 x i8>* %ptr, <8 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store8':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ store <8 x i8> %val, <8 x i8>* %ptr
+ ret void
+}
+
+define <4 x i8> @load4(<4 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load4':
+; CHECK-NEON: Cost Model: Found an estimated cost of 64 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 64 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <4 x i8>, <4 x i8>* %ptr
+ ret <4 x i8> %out
+}
+
+define void @store4(<4 x i8>* %ptr, <4 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store4':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ store <4 x i8> %val, <4 x i8>* %ptr
+ ret void
+}
+
+define <16 x i16> @load_256(<16 x i16>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load_256':
+; CHECK-NEON: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <16 x i16>, <16 x i16>* %ptr
+ ret <16 x i16> %out
+}
+
+define <8 x i64> @load_512(<8 x i64>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load_512':
+; CHECK-NEON: Cost Model: Found an estimated cost of 4 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 4 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+ %out = load <8 x i64>, <8 x i64>* %ptr
+ ret <8 x i64> %out
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll
new file mode 100644
index 0000000..1a7b262
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll
@@ -0,0 +1,51 @@
+; Check that the memory cost model does not break when using scalable vectors.
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 8 x i8> @load-sve-8(<vscale x 8 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-8':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ %retval = load <vscale x 8 x i8>, <vscale x 8 x i8>* %ptr
+ ret <vscale x 8 x i8> %retval
+}
+
+define void @store-sve-8(<vscale x 8 x i8>* %ptr, <vscale x 8 x i8> %val) {
+; CHECK-LABEL: 'store-sve-8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ store <vscale x 8 x i8> %val, <vscale x 8 x i8>* %ptr
+ ret void
+}
+
+define <vscale x 16 x i8> @load-sve-16(<vscale x 16 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-16':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ %retval = load <vscale x 16 x i8>, <vscale x 16 x i8>* %ptr
+ ret <vscale x 16 x i8> %retval
+}
+
+define void @store-sve-16(<vscale x 16 x i8>* %ptr, <vscale x 16 x i8> %val) {
+; CHECK-LABEL: 'store-sve-16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ store <vscale x 16 x i8> %val, <vscale x 16 x i8>* %ptr
+ ret void
+}
+
+define <vscale x 32 x i8> @load-sve-32(<vscale x 32 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-32':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ %retval = load <vscale x 32 x i8>, <vscale x 32 x i8>* %ptr
+ ret <vscale x 32 x i8> %retval
+}
+
+define void @store-sve-32(<vscale x 32 x i8>* %ptr, <vscale x 32 x i8> %val) {
+; CHECK-LABEL: 'store-sve-32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+ store <vscale x 32 x i8> %val, <vscale x 32 x i8>* %ptr
+ ret void
+}