//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

static cl::opt<unsigned> DMBLookaheadThreshold(
    "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
    cl::desc("The number of instructions to search for a redundant dmb"));

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits the flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
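  //
  // For example (illustrative): -sve-tail-folding=all+noreverse sets
  // InitialBits to TailFoldingOpts::All and DisableBits to
  // TailFoldingOpts::Reverse, whereas -sve-tail-folding=default+reductions
  // keeps NeedsDefault true and sets EnableBits to TailFoldingOpts::Reductions.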
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
  StringRef AttributeStr =
      isMultiversionedFunction(F) ? "fmv-features" : "target-features";
  StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
  SmallVector<StringRef, 8> Features;
  FeatureStr.split(Features, ",");
  return AArch64::getFMVPriority(Features);
}

bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
  return F.hasFnAttribute("fmv-features");
}

const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
    AArch64::FeatureExecuteOnly,
};

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
      CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();
  // Adjust the feature bitsets by inverting some of the bits. This is needed
  // for target features that represent restrictions rather than capabilities,
  // for example a "+execute-only" callee can be inlined into a caller without
  // "+execute-only", but not vice versa.
  FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
  FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;

  return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
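  //
  // For example (illustrative), a pointer-to-<8 x float> argument (256 bits of
  // data) is rejected by the check below, while a pointer-to-<4 x float>
  // argument (128 bits) is accepted since it matches a NEON-sized vector.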
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //     call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function).
  //
  // (2) F:
  //     call from F -> G (the call here is not Call)
  //     G:
  //     call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is a benefit to doing the streaming-mode
  // change only once and avoiding the inlining of G into F.
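  //
  // With the default values of the options defined above, case (1) multiplies
  // DefaultCallPenalty by CallPenaltyChangeSM (5) and case (2) by
  // InlineCallPenaltyChangeSM (10).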
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost
AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
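  // For example (illustrative), a 96-bit constant is sign-extended to 128 bits
  // and costed as two 64-bit chunks.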
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
414
Sander de Smalenf9a50f02021-01-27 15:01:16 +0000415InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
416 const APInt &Imm, Type *Ty,
417 TTI::TargetCostKind CostKind,
Sergei Barannikov0014b492025-04-22 06:27:29 +0300418 Instruction *Inst) const {
Tim Northover3b0846e2014-05-24 12:50:23 +0000419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 // There is no cost model for constants with a bit size of 0. Return TCC_Free
423 // here, so that constant hoisting will ignore this constant.
424 if (BitSize == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +0000425 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000426
427 unsigned ImmIdx = ~0U;
428 switch (Opcode) {
429 default:
Chandler Carruth705b1852015-01-31 03:43:40 +0000430 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000431 case Instruction::GetElementPtr:
432 // Always hoist the base address of a GetElementPtr.
433 if (Idx == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +0000434 return 2 * TTI::TCC_Basic;
435 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000436 case Instruction::Store:
437 ImmIdx = 0;
438 break;
439 case Instruction::Add:
440 case Instruction::Sub:
441 case Instruction::Mul:
442 case Instruction::UDiv:
443 case Instruction::SDiv:
444 case Instruction::URem:
445 case Instruction::SRem:
446 case Instruction::And:
447 case Instruction::Or:
448 case Instruction::Xor:
449 case Instruction::ICmp:
450 ImmIdx = 1;
451 break;
452 // Always return TCC_Free for the shift value of a shift instruction.
453 case Instruction::Shl:
454 case Instruction::LShr:
455 case Instruction::AShr:
456 if (Idx == 1)
Chandler Carruth705b1852015-01-31 03:43:40 +0000457 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000458 break;
459 case Instruction::Trunc:
460 case Instruction::ZExt:
461 case Instruction::SExt:
462 case Instruction::IntToPtr:
463 case Instruction::PtrToInt:
464 case Instruction::BitCast:
465 case Instruction::PHI:
466 case Instruction::Call:
467 case Instruction::Select:
468 case Instruction::Ret:
469 case Instruction::Load:
470 break;
471 }
472
473 if (Idx == ImmIdx) {
Chandler Carruth93205eb2015-08-05 18:08:10 +0000474 int NumConstants = (BitSize + 63) / 64;
Sander de Smalenf9a50f02021-01-27 15:01:16 +0000475 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
Chandler Carruth705b1852015-01-31 03:43:40 +0000476 return (Cost <= NumConstants * TTI::TCC_Basic)
Chandler Carruth93205eb2015-08-05 18:08:10 +0000477 ? static_cast<int>(TTI::TCC_Free)
Chandler Carruth705b1852015-01-31 03:43:40 +0000478 : Cost;
Tim Northover3b0846e2014-05-24 12:50:23 +0000479 }
Sam Parker40574fe2020-04-28 14:11:27 +0100480 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
Tim Northover3b0846e2014-05-24 12:50:23 +0000481}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}

static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements
  unsigned TotalHistCnts = 1;

  unsigned EltSize = EltTy->getScalarSizeInBits();
  // Only allow (up to 64b) integers or pointers
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
    return InstructionCost::getInvalid();

  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
    unsigned EC = VTy->getElementCount().getKnownMinValue();
    if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
      return InstructionCost::getInvalid();

    // HistCnt only supports 32b and 64b element types
    unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;

    if (EC == 2 || (LegalEltSize == 32 && EC == 4))
      return InstructionCost(BaseHistCntCost);

    unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
    TotalHistCnts = EC / NaturalVectorWidth;
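    // For example (illustrative), <vscale x 8 x ptr> buckets of i32 elements
    // give LegalEltSize = 32, NaturalVectorWidth = 4 and TotalHistCnts = 2,
    // i.e. an overall cost of 2 * BaseHistCntCost.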
  }

  return InstructionCost(BaseHistCntCost * TotalHistCnts);
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
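    // For example (illustrative), <vscale x 8 x i64> legalises to four
    // <vscale x 2 x i64> parts (LT.first == 4), i.e. one index instruction
    // plus three vector adds.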
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are converted
      // to; add 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64,   4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8,  1},
        {ISD::CTPOP, MVT::i32,   5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64)) {
      if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
           (LT.second == MVT::f64 && MTy == MVT::i32) ||
           (LT.second == MVT::f32 && MTy == MVT::i64)))
        return LT.first;
      // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
      if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
          MTy.getScalarSizeInBits() == 64)
        return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
    }
    // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
    // f32.
    if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
      return LT.first + getIntrinsicInstrCost(
                            {ICA.getID(),
                             RetTy,
                             {ICA.getArgTypes()[0]->getWithNewType(
                                 Type::getFloatTy(RetTy->getContext()))}},
                            CostKind);
    if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
        (LT.second == MVT::f16 && MTy == MVT::i64) ||
        ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
         (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
      return LT.first;
    // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
        MTy.getScalarSizeInBits() == 32)
      return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
    // Extending vector types v8f16->v8i64. These currently scalarize but the
    // codegen could be better.
    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
        MTy.getScalarSizeInBits() == 64)
      return MTy.getVectorNumElements() * 3;

    // If we can, use a legal convert followed by a min+max.
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         LT.second.getScalarType() == MVT::f16) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost +
             ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
                                                                           : 1);
    }
    // Otherwise we need to follow the default expansion that clamps the value
    // using a float min/max with a fcmp+sel for NaN handling when signed.
    Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
    RetTy = RetTy->getScalarType();
    if (LT.second.isVector()) {
      FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
      RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
    }
    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
    Cost +=
        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
    if (IsSigned) {
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
    }
    return LT.first * Cost;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // TODO: Add handling for fshl where third argument is not a constant.
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
          {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
          {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
          {Intrinsic::fshl, MVT::v8i8, 2},  {Intrinsic::fshl, MVT::v4i16, 2}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
  case Intrinsic::get_active_lane_mask: {
    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
    if (RetTy) {
      EVT RetVT = getTLI()->getValueType(DL, RetTy);
      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
          !getTLI()->isTypeLegal(RetVT)) {
        // We don't have enough context at this point to determine if the mask
        // is going to be kept live after the block, which will force the vXi1
        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
        // For now, we just assume the vectorizer created this intrinsic and
        // the result will be the input for a PHI. In this case the cost will
        // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic for the actual generated codegen. In reality there are
        // two instructions generated per lane.
        return RetTy->getNumElements() * 2;
      }
    }
    break;
  }
  case Intrinsic::experimental_vector_match: {
    auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
    EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    unsigned SearchSize = NeedleTy->getNumElements();
    if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
      // Base cost for MATCH instructions. At least on the Neoverse V2 and
      // Neoverse V3, these are cheap operations with the same latency as a
      // vector ADD. In most cases, however, we also need to do an extra DUP.
      // For fixed-length vectors we currently need an extra five to six
      // instructions besides the MATCH.
      InstructionCost Cost = 4;
      if (isa<FixedVectorType>(RetTy))
        Cost += 10;
      return Cost;
    }
    break;
  }
  case Intrinsic::experimental_cttz_elts: {
    EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
      // This will consist of a SVE brkb and a cntp instruction. These
      // typically have the same latency and half the throughput as a vector
      // add instruction.
      return 4;
    }
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Remove redundant reinterpret casts (convert_to_svbool) feeding a phi node
/// in the presence of control flow.
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}

// A collection of properties common to SVE intrinsics that allow for combines
// to be written without needing to know the specific intrinsic.
struct SVEIntrinsicInfo {
  //
  // Helper routines for common intrinsic definitions.
  //

  // e.g. llvm.aarch64.sve.add pg, op1, op2
  //        with IID ==> llvm.aarch64.sve.add_u
  static SVEIntrinsicInfo
  defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setOperandIdxInactiveLanesTakenFrom(1)
        .setMatchingUndefIntrinsic(IID);
  }

  // e.g. llvm.aarch64.sve.neg inactive, pg, op
  static SVEIntrinsicInfo defaultMergingUnaryOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(1)
        .setOperandIdxInactiveLanesTakenFrom(0)
        .setOperandIdxWithNoActiveLanes(0);
  }

  // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
  static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(1)
        .setOperandIdxInactiveLanesTakenFrom(0);
  }

  // e.g. llvm.aarch64.sve.add_u pg, op1, op2
  static SVEIntrinsicInfo defaultUndefOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setInactiveLanesAreNotDefined();
  }

  // e.g. llvm.aarch64.sve.prf pg, ptr        (GPIndex = 0)
  //      llvm.aarch64.sve.st1 data, pg, ptr  (GPIndex = 1)
  static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(GPIndex)
        .setInactiveLanesAreUnused();
  }

  // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
  //      llvm.aarch64.sve.ld1 pg, ptr
  static SVEIntrinsicInfo defaultZeroingOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setInactiveLanesAreUnused()
        .setResultIsZeroInitialized();
  }

  // All properties relate to predication and thus having a general predicate
  // is the minimum requirement to say there is intrinsic info to act on.
  explicit operator bool() const { return hasGoverningPredicate(); }

  //
  // Properties relating to the governing predicate.
  //

  bool hasGoverningPredicate() const {
    return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
  }

  unsigned getGoverningPredicateOperandIdx() const {
    assert(hasGoverningPredicate() && "Property not set!");
    return GoverningPredicateIdx;
  }

  SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
    assert(!hasGoverningPredicate() && "Cannot set property twice!");
    GoverningPredicateIdx = Index;
    return *this;
  }

  //
  // Properties relating to operations the intrinsic could be transformed into.
  // NOTE: This does not mean such a transformation is always possible, but the
  // knowledge makes it possible to reuse existing optimisations without needing
  // to embed specific handling for each intrinsic. For example, instruction
  // simplification can be used to optimise an intrinsic's active lanes.
  //

  bool hasMatchingUndefIntrinsic() const {
    return UndefIntrinsic != Intrinsic::not_intrinsic;
  }

  Intrinsic::ID getMatchingUndefIntrinsic() const {
    assert(hasMatchingUndefIntrinsic() && "Property not set!");
    return UndefIntrinsic;
  }

  SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
    assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
    UndefIntrinsic = IID;
    return *this;
  }

  bool hasMatchingIROpode() const { return IROpcode != 0; }

  unsigned getMatchingIROpode() const {
    assert(hasMatchingIROpode() && "Property not set!");
    return IROpcode;
  }

  SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
    assert(!hasMatchingIROpode() && "Cannot set property twice!");
    IROpcode = Opcode;
    return *this;
  }

  //
  // Properties relating to the result of inactive lanes.
  //

  bool inactiveLanesTakenFromOperand() const {
    return ResultLanes == InactiveLanesTakenFromOperand;
  }

  unsigned getOperandIdxInactiveLanesTakenFrom() const {
    assert(inactiveLanesTakenFromOperand() && "Property not set!");
    return OperandIdxForInactiveLanes;
  }

  SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
    ResultLanes = InactiveLanesTakenFromOperand;
    OperandIdxForInactiveLanes = Index;
    return *this;
  }

  bool inactiveLanesAreNotDefined() const {
    return ResultLanes == InactiveLanesAreNotDefined;
  }

  SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
    ResultLanes = InactiveLanesAreNotDefined;
    return *this;
  }

1158 bool inactiveLanesAreUnused() const {
1159 return ResultLanes == InactiveLanesAreUnused;
1160 }
1161
1162 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1163 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1164 ResultLanes = InactiveLanesAreUnused;
1165 return *this;
1166 }
1167
1168 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1169 // inactiveLanesAreZerod =
1170 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1171 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1172
1173 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1174 ResultIsZeroInitialized = true;
1175 return *this;
1176 }
1177
1178 //
1179 // The first operand of unary merging operations is typically only used to
1180 // set the result for inactive lanes. Knowing this allows us to dead-code the
1181 // operand when we can prove there are no inactive lanes.
1182 //
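// For example (illustrative): for a merging unary operation of the form
// op_m(inactive, pg, a), the 'inactive' operand only supplies results for
// predicated-off lanes, so with an all-active predicate it can be replaced
// with undef. ('op_m' here is a placeholder, not a specific intrinsic.)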
1183
1184 bool hasOperandWithNoActiveLanes() const {
1185 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1186 }
1187
1188 unsigned getOperandIdxWithNoActiveLanes() const {
1189 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1190 return OperandIdxWithNoActiveLanes;
1191 }
1192
1193 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1194 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1195 OperandIdxWithNoActiveLanes = Index;
1196 return *this;
1197 }
1198
1199private:
1200 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1201
1202 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
Paul Walker19970732025-04-08 11:38:27 +01001203 unsigned IROpcode = 0;
Paul Walkerc1927372025-04-01 13:27:46 +01001204
1205 enum PredicationStyle {
1206 Uninitialized,
1207 InactiveLanesTakenFromOperand,
1208 InactiveLanesAreNotDefined,
1209 InactiveLanesAreUnused
1210 } ResultLanes = Uninitialized;
1211
1212 bool ResultIsZeroInitialized = false;
1213 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1214 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1215};
1216
1217static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1218 // Some SVE intrinsics do not use scalable vector types. Such intrinsics carry
1219 // no information relevant to SVEIntrinsicInfo and are therefore ignored here.
1220 if (!isa<ScalableVectorType>(II.getType()) &&
1221 all_of(II.args(), [&](const Value *V) {
1222 return !isa<ScalableVectorType>(V->getType());
1223 }))
1224 return SVEIntrinsicInfo();
1225
1226 Intrinsic::ID IID = II.getIntrinsicID();
1227 switch (IID) {
1228 default:
1229 break;
1230 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1231 case Intrinsic::aarch64_sve_fcvt_f16f32:
1232 case Intrinsic::aarch64_sve_fcvt_f16f64:
1233 case Intrinsic::aarch64_sve_fcvt_f32f16:
1234 case Intrinsic::aarch64_sve_fcvt_f32f64:
1235 case Intrinsic::aarch64_sve_fcvt_f64f16:
1236 case Intrinsic::aarch64_sve_fcvt_f64f32:
1237 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1238 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1239 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1240 case Intrinsic::aarch64_sve_fcvtzs:
1241 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1242 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1243 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1244 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1245 case Intrinsic::aarch64_sve_fcvtzu:
1246 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1247 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1248 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1249 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1250 case Intrinsic::aarch64_sve_scvtf:
1251 case Intrinsic::aarch64_sve_scvtf_f16i32:
1252 case Intrinsic::aarch64_sve_scvtf_f16i64:
1253 case Intrinsic::aarch64_sve_scvtf_f32i64:
1254 case Intrinsic::aarch64_sve_scvtf_f64i32:
1255 case Intrinsic::aarch64_sve_ucvtf:
1256 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1257 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1258 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1259 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1260 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1261
1262 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1263 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1264 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1265 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1266 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1267
1268 case Intrinsic::aarch64_sve_fabd:
1269 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1270 case Intrinsic::aarch64_sve_fadd:
Paul Walker96ec17d2025-04-25 11:30:03 +01001271 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1272 .setMatchingIROpcode(Instruction::FAdd);
Paul Walkerc1927372025-04-01 13:27:46 +01001273 case Intrinsic::aarch64_sve_fdiv:
Paul Walker96ec17d2025-04-25 11:30:03 +01001274 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1275 .setMatchingIROpcode(Instruction::FDiv);
Paul Walkerc1927372025-04-01 13:27:46 +01001276 case Intrinsic::aarch64_sve_fmax:
1277 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1278 case Intrinsic::aarch64_sve_fmaxnm:
1279 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1280 case Intrinsic::aarch64_sve_fmin:
1281 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1282 case Intrinsic::aarch64_sve_fminnm:
1283 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1284 case Intrinsic::aarch64_sve_fmla:
1285 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1286 case Intrinsic::aarch64_sve_fmls:
1287 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1288 case Intrinsic::aarch64_sve_fmul:
Paul Walker19970732025-04-08 11:38:27 +01001289 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1290 .setMatchingIROpcode(Instruction::FMul);
Paul Walkerc1927372025-04-01 13:27:46 +01001291 case Intrinsic::aarch64_sve_fmulx:
1292 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1293 case Intrinsic::aarch64_sve_fnmla:
1294 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1295 case Intrinsic::aarch64_sve_fnmls:
1296 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1297 case Intrinsic::aarch64_sve_fsub:
Paul Walker96ec17d2025-04-25 11:30:03 +01001298 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1299 .setMatchingIROpcode(Instruction::FSub);
Paul Walkerc1927372025-04-01 13:27:46 +01001300 case Intrinsic::aarch64_sve_add:
Paul Walker96ec17d2025-04-25 11:30:03 +01001301 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1302 .setMatchingIROpcode(Instruction::Add);
Paul Walkerc1927372025-04-01 13:27:46 +01001303 case Intrinsic::aarch64_sve_mla:
1304 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1305 case Intrinsic::aarch64_sve_mls:
1306 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1307 case Intrinsic::aarch64_sve_mul:
Paul Walker19970732025-04-08 11:38:27 +01001308 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1309 .setMatchingIROpcode(Instruction::Mul);
Paul Walkerc1927372025-04-01 13:27:46 +01001310 case Intrinsic::aarch64_sve_sabd:
1311 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
Paul Walker149d7952025-05-01 13:20:05 +01001312 case Intrinsic::aarch64_sve_sdiv:
1313 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1314 .setMatchingIROpcode(Instruction::SDiv);
Paul Walkerc1927372025-04-01 13:27:46 +01001315 case Intrinsic::aarch64_sve_smax:
1316 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1317 case Intrinsic::aarch64_sve_smin:
1318 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1319 case Intrinsic::aarch64_sve_smulh:
1320 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1321 case Intrinsic::aarch64_sve_sub:
Paul Walker96ec17d2025-04-25 11:30:03 +01001322 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1323 .setMatchingIROpcode(Instruction::Sub);
Paul Walkerc1927372025-04-01 13:27:46 +01001324 case Intrinsic::aarch64_sve_uabd:
1325 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
Paul Walker149d7952025-05-01 13:20:05 +01001326 case Intrinsic::aarch64_sve_udiv:
1327 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1328 .setMatchingIROpcode(Instruction::UDiv);
Paul Walkerc1927372025-04-01 13:27:46 +01001329 case Intrinsic::aarch64_sve_umax:
1330 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1331 case Intrinsic::aarch64_sve_umin:
1332 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1333 case Intrinsic::aarch64_sve_umulh:
1334 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1335 case Intrinsic::aarch64_sve_asr:
Paul Walker8dc89e32025-04-30 13:21:46 +01001336 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1337 .setMatchingIROpcode(Instruction::AShr);
Paul Walkerc1927372025-04-01 13:27:46 +01001338 case Intrinsic::aarch64_sve_lsl:
Paul Walker8dc89e32025-04-30 13:21:46 +01001339 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1340 .setMatchingIROpcode(Instruction::Shl);
Paul Walkerc1927372025-04-01 13:27:46 +01001341 case Intrinsic::aarch64_sve_lsr:
Paul Walker8dc89e32025-04-30 13:21:46 +01001342 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1343 .setMatchingIROpcode(Instruction::LShr);
Paul Walkerc1927372025-04-01 13:27:46 +01001344 case Intrinsic::aarch64_sve_and:
Paul Walker96ec17d2025-04-25 11:30:03 +01001345 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1346 .setMatchingIROpcode(Instruction::And);
Paul Walkerc1927372025-04-01 13:27:46 +01001347 case Intrinsic::aarch64_sve_bic:
1348 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1349 case Intrinsic::aarch64_sve_eor:
Paul Walker96ec17d2025-04-25 11:30:03 +01001350 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1351 .setMatchingIROpcode(Instruction::Xor);
Paul Walkerc1927372025-04-01 13:27:46 +01001352 case Intrinsic::aarch64_sve_orr:
Paul Walker96ec17d2025-04-25 11:30:03 +01001353 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1354 .setMatchingIROpcode(Instruction::Or);
Paul Walkerc1927372025-04-01 13:27:46 +01001355 case Intrinsic::aarch64_sve_sqsub:
1356 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1357 case Intrinsic::aarch64_sve_uqsub:
1358 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1359
Paul Walker96ec17d2025-04-25 11:30:03 +01001360 case Intrinsic::aarch64_sve_add_u:
1361 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1362 Instruction::Add);
1363 case Intrinsic::aarch64_sve_and_u:
1364 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1365 Instruction::And);
Paul Walker8dc89e32025-04-30 13:21:46 +01001366 case Intrinsic::aarch64_sve_asr_u:
1367 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1368 Instruction::AShr);
Paul Walker96ec17d2025-04-25 11:30:03 +01001369 case Intrinsic::aarch64_sve_eor_u:
1370 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1371 Instruction::Xor);
1372 case Intrinsic::aarch64_sve_fadd_u:
1373 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1374 Instruction::FAdd);
1375 case Intrinsic::aarch64_sve_fdiv_u:
1376 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1377 Instruction::FDiv);
Paul Walker19970732025-04-08 11:38:27 +01001378 case Intrinsic::aarch64_sve_fmul_u:
1379 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1380 Instruction::FMul);
Paul Walker96ec17d2025-04-25 11:30:03 +01001381 case Intrinsic::aarch64_sve_fsub_u:
1382 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1383 Instruction::FSub);
Paul Walker8dc89e32025-04-30 13:21:46 +01001384 case Intrinsic::aarch64_sve_lsl_u:
1385 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1386 Instruction::Shl);
1387 case Intrinsic::aarch64_sve_lsr_u:
1388 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1389 Instruction::LShr);
Paul Walker19970732025-04-08 11:38:27 +01001390 case Intrinsic::aarch64_sve_mul_u:
1391 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1392 Instruction::Mul);
Paul Walker96ec17d2025-04-25 11:30:03 +01001393 case Intrinsic::aarch64_sve_orr_u:
1394 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1395 Instruction::Or);
Paul Walker149d7952025-05-01 13:20:05 +01001396 case Intrinsic::aarch64_sve_sdiv_u:
1397 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1398 Instruction::SDiv);
Paul Walker96ec17d2025-04-25 11:30:03 +01001399 case Intrinsic::aarch64_sve_sub_u:
1400 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1401 Instruction::Sub);
Paul Walker149d7952025-05-01 13:20:05 +01001402 case Intrinsic::aarch64_sve_udiv_u:
1403 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1404 Instruction::UDiv);
Paul Walker19970732025-04-08 11:38:27 +01001405
Paul Walkerc1927372025-04-01 13:27:46 +01001406 case Intrinsic::aarch64_sve_addqv:
1407 case Intrinsic::aarch64_sve_and_z:
1408 case Intrinsic::aarch64_sve_bic_z:
1409 case Intrinsic::aarch64_sve_brka_z:
1410 case Intrinsic::aarch64_sve_brkb_z:
1411 case Intrinsic::aarch64_sve_brkn_z:
1412 case Intrinsic::aarch64_sve_brkpa_z:
1413 case Intrinsic::aarch64_sve_brkpb_z:
1414 case Intrinsic::aarch64_sve_cntp:
1415 case Intrinsic::aarch64_sve_compact:
1416 case Intrinsic::aarch64_sve_eor_z:
1417 case Intrinsic::aarch64_sve_eorv:
1418 case Intrinsic::aarch64_sve_eorqv:
1419 case Intrinsic::aarch64_sve_nand_z:
1420 case Intrinsic::aarch64_sve_nor_z:
1421 case Intrinsic::aarch64_sve_orn_z:
1422 case Intrinsic::aarch64_sve_orr_z:
1423 case Intrinsic::aarch64_sve_orv:
1424 case Intrinsic::aarch64_sve_orqv:
1425 case Intrinsic::aarch64_sve_pnext:
1426 case Intrinsic::aarch64_sve_rdffr_z:
1427 case Intrinsic::aarch64_sve_saddv:
1428 case Intrinsic::aarch64_sve_uaddv:
1429 case Intrinsic::aarch64_sve_umaxv:
1430 case Intrinsic::aarch64_sve_umaxqv:
1431 case Intrinsic::aarch64_sve_cmpeq:
1432 case Intrinsic::aarch64_sve_cmpeq_wide:
1433 case Intrinsic::aarch64_sve_cmpge:
1434 case Intrinsic::aarch64_sve_cmpge_wide:
1435 case Intrinsic::aarch64_sve_cmpgt:
1436 case Intrinsic::aarch64_sve_cmpgt_wide:
1437 case Intrinsic::aarch64_sve_cmphi:
1438 case Intrinsic::aarch64_sve_cmphi_wide:
1439 case Intrinsic::aarch64_sve_cmphs:
1440 case Intrinsic::aarch64_sve_cmphs_wide:
1441 case Intrinsic::aarch64_sve_cmple_wide:
1442 case Intrinsic::aarch64_sve_cmplo_wide:
1443 case Intrinsic::aarch64_sve_cmpls_wide:
1444 case Intrinsic::aarch64_sve_cmplt_wide:
1445 case Intrinsic::aarch64_sve_cmpne:
1446 case Intrinsic::aarch64_sve_cmpne_wide:
1447 case Intrinsic::aarch64_sve_facge:
1448 case Intrinsic::aarch64_sve_facgt:
1449 case Intrinsic::aarch64_sve_fcmpeq:
1450 case Intrinsic::aarch64_sve_fcmpge:
1451 case Intrinsic::aarch64_sve_fcmpgt:
1452 case Intrinsic::aarch64_sve_fcmpne:
1453 case Intrinsic::aarch64_sve_fcmpuo:
1454 case Intrinsic::aarch64_sve_ld1:
1455 case Intrinsic::aarch64_sve_ld1_gather:
1456 case Intrinsic::aarch64_sve_ld1_gather_index:
1457 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1458 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1459 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1460 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1461 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1462 case Intrinsic::aarch64_sve_ld1q_gather_index:
1463 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1464 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1465 case Intrinsic::aarch64_sve_ld1ro:
1466 case Intrinsic::aarch64_sve_ld1rq:
1467 case Intrinsic::aarch64_sve_ld1udq:
1468 case Intrinsic::aarch64_sve_ld1uwq:
1469 case Intrinsic::aarch64_sve_ld2_sret:
1470 case Intrinsic::aarch64_sve_ld2q_sret:
1471 case Intrinsic::aarch64_sve_ld3_sret:
1472 case Intrinsic::aarch64_sve_ld3q_sret:
1473 case Intrinsic::aarch64_sve_ld4_sret:
1474 case Intrinsic::aarch64_sve_ld4q_sret:
1475 case Intrinsic::aarch64_sve_ldff1:
1476 case Intrinsic::aarch64_sve_ldff1_gather:
1477 case Intrinsic::aarch64_sve_ldff1_gather_index:
1478 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1479 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1480 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1481 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1482 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1483 case Intrinsic::aarch64_sve_ldnf1:
1484 case Intrinsic::aarch64_sve_ldnt1:
1485 case Intrinsic::aarch64_sve_ldnt1_gather:
1486 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1487 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1488 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1489 return SVEIntrinsicInfo::defaultZeroingOp();
1490
1491 case Intrinsic::aarch64_sve_prf:
1492 case Intrinsic::aarch64_sve_prfb_gather_index:
1493 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1494 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1495 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1496 case Intrinsic::aarch64_sve_prfd_gather_index:
1497 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1498 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1499 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1500 case Intrinsic::aarch64_sve_prfh_gather_index:
1501 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1502 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1503 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1504 case Intrinsic::aarch64_sve_prfw_gather_index:
1505 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1506 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1507 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1508 return SVEIntrinsicInfo::defaultVoidOp(0);
1509
1510 case Intrinsic::aarch64_sve_st1_scatter:
1511 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1512 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1513 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1514 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1515 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1516 case Intrinsic::aarch64_sve_st1dq:
1517 case Intrinsic::aarch64_sve_st1q_scatter_index:
1518 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1519 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1520 case Intrinsic::aarch64_sve_st1wq:
1521 case Intrinsic::aarch64_sve_stnt1:
1522 case Intrinsic::aarch64_sve_stnt1_scatter:
1523 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1524 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1525 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1526 return SVEIntrinsicInfo::defaultVoidOp(1);
1527 case Intrinsic::aarch64_sve_st2:
1528 case Intrinsic::aarch64_sve_st2q:
1529 return SVEIntrinsicInfo::defaultVoidOp(2);
1530 case Intrinsic::aarch64_sve_st3:
1531 case Intrinsic::aarch64_sve_st3q:
1532 return SVEIntrinsicInfo::defaultVoidOp(3);
1533 case Intrinsic::aarch64_sve_st4:
1534 case Intrinsic::aarch64_sve_st4q:
1535 return SVEIntrinsicInfo::defaultVoidOp(4);
1536 }
1537
1538 return SVEIntrinsicInfo();
1539}
1540
1541static bool isAllActivePredicate(Value *Pred) {
1542 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1543 Value *UncastedPred;
1544 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1545 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1546 m_Value(UncastedPred)))))
1547 // If the predicate has the same or fewer lanes than the uncasted
1548 // predicate then we know the casting has no effect.
1549 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1550 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1551 Pred = UncastedPred;
Matthew Devereau91a20562025-04-13 20:40:51 +01001552 auto *C = dyn_cast<Constant>(Pred);
1553 return (C && C->isAllOnesValue());
Paul Walkerc1927372025-04-01 13:27:46 +01001554}
1555
Paul Walkera7999f32025-04-17 15:58:39 +01001556// Simplify `V` by only considering the operations that affect active lanes.
1557// This function should only return existing Values or newly created Constants.
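// For example (illustrative): sve.dup(inactive, pg, 2) only defines the lanes
// governed by pg, so when simplifying the active lanes of a user it can be
// treated as splat(2).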
1558static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1559 auto *Dup = dyn_cast<IntrinsicInst>(V);
1560 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1561 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1562 return ConstantVector::getSplat(
1563 cast<VectorType>(V->getType())->getElementCount(),
1564 cast<Constant>(Dup->getOperand(2)));
1565
1566 return V;
1567}
1568
1569static std::optional<Instruction *>
1570simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1571 const SVEIntrinsicInfo &IInfo) {
1572 const unsigned Opc = IInfo.getMatchingIROpode();
1573 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1574
1575 Value *Pg = II.getOperand(0);
1576 Value *Op1 = II.getOperand(1);
1577 Value *Op2 = II.getOperand(2);
1578 const DataLayout &DL = II.getDataLayout();
1579
1580 // Canonicalise constants to the RHS.
1581 if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
1582 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1583 IC.replaceOperand(II, 1, Op2);
1584 IC.replaceOperand(II, 2, Op1);
1585 return &II;
1586 }
1587
1588 // Only active lanes matter when simplifying the operation.
1589 Op1 = stripInactiveLanes(Op1, Pg);
1590 Op2 = stripInactiveLanes(Op2, Pg);
1591
1592 Value *SimpleII;
1593 if (auto FII = dyn_cast<FPMathOperator>(&II))
1594 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1595 else
1596 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1597
Paul Walker8dc89e32025-04-30 13:21:46 +01001598 // An SVE intrinsic's result is always defined. However, this is not the case
1599 // for its equivalent IR instruction (e.g. when shifting by at least the
1600 // data's bitwidth). Simplifications to an undefined result must be
1601 // ignored to preserve the intrinsic's expected behaviour.
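// For example (illustrative): as plain IR, lshr %x, 64 on i64 elements folds
// to poison, whereas the corresponding SVE shift produces zero, so the fold
// must be rejected here.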
1602 if (!SimpleII || isa<UndefValue>(SimpleII))
Paul Walkera7999f32025-04-17 15:58:39 +01001603 return std::nullopt;
1604
1605 if (IInfo.inactiveLanesAreNotDefined())
1606 return IC.replaceInstUsesWith(II, SimpleII);
1607
1608 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1609
1610 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1611 if (SimpleII == Inactive)
1612 return IC.replaceInstUsesWith(II, SimpleII);
1613
1614 // Inactive lanes must be preserved.
1615 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1616 return IC.replaceInstUsesWith(II, SimpleII);
1617}
1618
Paul Walkerc1927372025-04-01 13:27:46 +01001619// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1620// to operations with less strict inactive lane requirements.
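// For example (illustrative): with an all-active governing predicate,
// sve.fadd(pg, a, b) can be rewritten as sve.fadd.u(pg, a, b), whose inactive
// lanes are unconstrained.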
1621static std::optional<Instruction *>
1622simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1623 const SVEIntrinsicInfo &IInfo) {
1624 if (!IInfo.hasGoverningPredicate())
1625 return std::nullopt;
1626
1627 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1628
1629 // If there are no active lanes.
1630 if (match(OpPredicate, m_ZeroInt())) {
1631 if (IInfo.inactiveLanesTakenFromOperand())
1632 return IC.replaceInstUsesWith(
1633 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1634
1635 if (IInfo.inactiveLanesAreUnused()) {
1636 if (IInfo.resultIsZeroInitialized())
1637 IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1638
1639 return IC.eraseInstFromFunction(II);
1640 }
1641 }
1642
1643 // If there are no inactive lanes.
1644 if (isAllActivePredicate(OpPredicate)) {
1645 if (IInfo.hasOperandWithNoActiveLanes()) {
1646 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1647 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1648 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1649 }
1650
1651 if (IInfo.hasMatchingUndefIntrinsic()) {
1652 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1653 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1654 II.setCalledFunction(NewDecl);
1655 return &II;
1656 }
1657 }
1658
Paul Walkera7999f32025-04-17 15:58:39 +01001659 // Operation specific simplifications.
1660 if (IInfo.hasMatchingIROpode() &&
1661 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1662 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1663
Paul Walkerc1927372025-04-01 13:27:46 +01001664 return std::nullopt;
1665}
1666
Matt Devereaucee8b252022-01-05 13:42:01 +00001667// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1668// => (binop (pred) (from_svbool _) (from_svbool _))
1669//
1670// The above transformation eliminates a `to_svbool` in the predicate
1671// operand of bitwise operation `binop` by narrowing the vector width of
1672// the operation. For example, it would convert a `<vscale x 16 x i1>
1673// and` into a `<vscale x 4 x i1> and`. This is profitable because
1674// to_svbool must zero the new lanes during widening, whereas
1675// from_svbool is free.
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001676static std::optional<Instruction *>
1677tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
Matt Devereaucee8b252022-01-05 13:42:01 +00001678 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1679 if (!BinOp)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001680 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001681
1682 auto IntrinsicID = BinOp->getIntrinsicID();
1683 switch (IntrinsicID) {
1684 case Intrinsic::aarch64_sve_and_z:
1685 case Intrinsic::aarch64_sve_bic_z:
1686 case Intrinsic::aarch64_sve_eor_z:
1687 case Intrinsic::aarch64_sve_nand_z:
1688 case Intrinsic::aarch64_sve_nor_z:
1689 case Intrinsic::aarch64_sve_orn_z:
1690 case Intrinsic::aarch64_sve_orr_z:
1691 break;
1692 default:
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001693 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001694 }
1695
1696 auto BinOpPred = BinOp->getOperand(0);
1697 auto BinOpOp1 = BinOp->getOperand(1);
1698 auto BinOpOp2 = BinOp->getOperand(2);
1699
1700 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1701 if (!PredIntr ||
1702 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001703 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001704
1705 auto PredOp = PredIntr->getOperand(0);
1706 auto PredOpTy = cast<VectorType>(PredOp->getType());
1707 if (PredOpTy != II.getType())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001708 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001709
Matt Devereaucee8b252022-01-05 13:42:01 +00001710 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
Nikita Popov724f4a52023-05-16 18:11:17 +02001711 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
Matt Devereaucee8b252022-01-05 13:42:01 +00001712 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1713 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1714 if (BinOpOp1 == BinOpOp2)
1715 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1716 else
Nikita Popov724f4a52023-05-16 18:11:17 +02001717 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
Matt Devereaucee8b252022-01-05 13:42:01 +00001718 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1719
1720 auto NarrowedBinOp =
Nikita Popov724f4a52023-05-16 18:11:17 +02001721 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
Matt Devereaucee8b252022-01-05 13:42:01 +00001722 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1723}
1724
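// Remove redundant to_svbool/from_svbool round trips. For example
// (illustrative):
//   %w = convert.to.svbool(%p)    ; <vscale x 4 x i1> -> <vscale x 16 x i1>
//   %n = convert.from.svbool(%w)  ; back to <vscale x 4 x i1>
// can be replaced with %p, provided no step in the chain has fewer lanes than
// the final type (which would imply zeroing).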
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001725static std::optional<Instruction *>
1726instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
Bradley Smithc8f20ed2021-04-26 16:19:25 +01001727 // If the reinterpret instruction operand is a PHI Node
1728 if (isa<PHINode>(II.getArgOperand(0)))
1729 return processPhiNode(IC, II);
1730
Matt Devereaucee8b252022-01-05 13:42:01 +00001731 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1732 return BinOpCombine;
1733
Sander de Smalen11926e62023-05-22 13:52:18 +00001734 // Ignore converts to/from svcount_t.
1735 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1736 isa<TargetExtType>(II.getType()))
1737 return std::nullopt;
1738
Bradley Smithc8f20ed2021-04-26 16:19:25 +01001739 SmallVector<Instruction *, 32> CandidatesForRemoval;
1740 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1741
1742 const auto *IVTy = cast<VectorType>(II.getType());
1743
1744 // Walk the chain of conversions.
1745 while (Cursor) {
1746 // If the type of the cursor has fewer lanes than the final result, zeroing
1747 // must take place, which breaks the equivalence chain.
1748 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1749 if (CursorVTy->getElementCount().getKnownMinValue() <
1750 IVTy->getElementCount().getKnownMinValue())
1751 break;
1752
1753 // If the cursor has the same type as I, it is a viable replacement.
1754 if (Cursor->getType() == IVTy)
1755 EarliestReplacement = Cursor;
1756
1757 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1758
1759 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1760 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1761 Intrinsic::aarch64_sve_convert_to_svbool ||
1762 IntrinsicCursor->getIntrinsicID() ==
1763 Intrinsic::aarch64_sve_convert_from_svbool))
1764 break;
1765
1766 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1767 Cursor = IntrinsicCursor->getOperand(0);
1768 }
1769
1770 // If no viable replacement in the conversion chain was found, there is
1771 // nothing to do.
1772 if (!EarliestReplacement)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001773 return std::nullopt;
Bradley Smithc8f20ed2021-04-26 16:19:25 +01001774
1775 return IC.replaceInstUsesWith(II, EarliestReplacement);
1776}
1777
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001778static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1779 IntrinsicInst &II) {
zhongyunde 00443407bf90ffb2023-09-27 22:42:43 -04001780 // svsel(ptrue, x, y) => x
1781 auto *OpPredicate = II.getOperand(0);
1782 if (isAllActivePredicate(OpPredicate))
1783 return IC.replaceInstUsesWith(II, II.getOperand(1));
1784
1785 auto Select =
1786 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
Matt Devereaua9e08bc2022-03-16 11:41:14 +00001787 return IC.replaceInstUsesWith(II, Select);
1788}
1789
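// sve.dup writes only the lanes selected by its predicate. When the predicate
// is ptrue(vl1), just lane 0 is written, so the operation can be modelled as
// an insertelement. For example (illustrative):
//   sve.dup(%v, ptrue(vl1), %s) --> insertelement %v, %s, i64 0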
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001790static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1791 IntrinsicInst &II) {
Bradley Smith89085bc2021-04-23 13:55:42 +01001792 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1793 if (!Pg)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001794 return std::nullopt;
Bradley Smith89085bc2021-04-23 13:55:42 +01001795
1796 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001797 return std::nullopt;
Bradley Smith89085bc2021-04-23 13:55:42 +01001798
1799 const auto PTruePattern =
1800 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1801 if (PTruePattern != AArch64SVEPredPattern::vl1)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001802 return std::nullopt;
Bradley Smith89085bc2021-04-23 13:55:42 +01001803
1804 // The intrinsic is inserting into lane zero so use an insert instead.
1805 auto *IdxTy = Type::getInt64Ty(II.getContext());
1806 auto *Insert = InsertElementInst::Create(
1807 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
Jeremy Morse8e702732025-01-24 10:53:11 +00001808 Insert->insertBefore(II.getIterator());
Bradley Smith89085bc2021-04-23 13:55:42 +01001809 Insert->takeName(&II);
1810
1811 return IC.replaceInstUsesWith(II, Insert);
1812}
1813
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001814static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1815 IntrinsicInst &II) {
Usman Nadeemab111e92021-09-10 17:57:29 -07001816 // Replace DupX with a regular IR splat.
Usman Nadeemab111e92021-09-10 17:57:29 -07001817 auto *RetTy = cast<ScalableVectorType>(II.getType());
Nikita Popov724f4a52023-05-16 18:11:17 +02001818 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1819 II.getArgOperand(0));
Usman Nadeemab111e92021-09-10 17:57:29 -07001820 Splat->takeName(&II);
1821 return IC.replaceInstUsesWith(II, Splat);
1822}
1823
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001824static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1825 IntrinsicInst &II) {
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001826 LLVMContext &Ctx = II.getContext();
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001827
Matthew Devereau91a20562025-04-13 20:40:51 +01001828 if (!isAllActivePredicate(II.getArgOperand(0)))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001829 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001830
1831 // Check that we have a compare of zero..
Usman Nadeemab111e92021-09-10 17:57:29 -07001832 auto *SplatValue =
1833 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1834 if (!SplatValue || !SplatValue->isZero())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001835 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001836
1837 // ..against a dupq
1838 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1839 if (!DupQLane ||
1840 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001841 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001842
1843 // Where the dupq is a lane 0 replicate of a vector insert
cceerczw67a90932024-08-23 22:30:51 +08001844 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1845 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001846 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001847
1848 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
Bradley Smitha83aa332022-06-16 14:45:28 +00001849 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001850 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001851
1852 // Where the vector insert is a fixed constant vector insert into undef at
1853 // index zero
1854 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001855 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001856
1857 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001858 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001859
1860 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1861 if (!ConstVec)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001862 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001863
1864 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1865 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1866 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001867 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001868
1869 unsigned NumElts = VecTy->getNumElements();
1870 unsigned PredicateBits = 0;
1871
1872 // Expand intrinsic operands to a 16-bit byte level predicate
1873 for (unsigned I = 0; I < NumElts; ++I) {
1874 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1875 if (!Arg)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001876 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001877 if (!Arg->isZero())
1878 PredicateBits |= 1 << (I * (16 / NumElts));
1879 }
1880
1881 // If all bits are zero bail early with an empty predicate
1882 if (PredicateBits == 0) {
1883 auto *PFalse = Constant::getNullValue(II.getType());
1884 PFalse->takeName(&II);
1885 return IC.replaceInstUsesWith(II, PFalse);
1886 }
1887
1888 // Calculate largest predicate type used (where byte predicate is largest)
1889 unsigned Mask = 8;
1890 for (unsigned I = 0; I < 16; ++I)
1891 if ((PredicateBits & (1 << I)) != 0)
1892 Mask |= (I % 8);
1893
1894 unsigned PredSize = Mask & -Mask;
1895 auto *PredType = ScalableVectorType::get(
1896 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1897
1898 // Ensure all relevant bits are set
1899 for (unsigned I = 0; I < 16; I += PredSize)
1900 if ((PredicateBits & (1 << I)) == 0)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001901 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001902
1903 auto *PTruePat =
1904 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
Nikita Popov724f4a52023-05-16 18:11:17 +02001905 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1906 {PredType}, {PTruePat});
1907 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001908 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1909 auto *ConvertFromSVBool =
Nikita Popov724f4a52023-05-16 18:11:17 +02001910 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1911 {II.getType()}, {ConvertToSVBool});
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001912
1913 ConvertFromSVBool->takeName(&II);
1914 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1915}
1916
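// lasta/lastb extract an element relative to the last active predicate lane.
// With a ptrue(vlN) predicate that lane is known at compile time, e.g.
// (illustrative): lastb(ptrue(vl4), %v) reads lane 3 and lasta(ptrue(vl4), %v)
// reads lane 4, so both can be lowered to extractelement when the index is
// within the known minimum vector length.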
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001917static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1918 IntrinsicInst &II) {
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001919 Value *Pg = II.getArgOperand(0);
1920 Value *Vec = II.getArgOperand(1);
Usman Nadeem85bbc052021-07-27 21:02:32 -07001921 auto IntrinsicID = II.getIntrinsicID();
1922 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001923
Sander de Smaleneb1a5122021-07-19 10:48:42 +01001924 // lastX(splat(X)) --> X
1925 if (auto *SplatVal = getSplatValue(Vec))
1926 return IC.replaceInstUsesWith(II, SplatVal);
1927
Usman Nadeem85bbc052021-07-27 21:02:32 -07001928 // If x and/or y is a splat value then:
1929 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1930 Value *LHS, *RHS;
1931 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1932 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1933 auto *OldBinOp = cast<BinaryOperator>(Vec);
1934 auto OpC = OldBinOp->getOpcode();
1935 auto *NewLHS =
Nikita Popov724f4a52023-05-16 18:11:17 +02001936 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
Usman Nadeem85bbc052021-07-27 21:02:32 -07001937 auto *NewRHS =
Nikita Popov724f4a52023-05-16 18:11:17 +02001938 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
Usman Nadeem85bbc052021-07-27 21:02:32 -07001939 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
Jeremy Morseb9d83ef2024-03-19 16:36:29 +00001940 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
Usman Nadeem85bbc052021-07-27 21:02:32 -07001941 return IC.replaceInstUsesWith(II, NewBinOp);
1942 }
1943 }
1944
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001945 auto *C = dyn_cast<Constant>(Pg);
1946 if (IsAfter && C && C->isNullValue()) {
1947 // The intrinsic is extracting lane 0 so use an extract instead.
1948 auto *IdxTy = Type::getInt64Ty(II.getContext());
1949 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
Jeremy Morse8e702732025-01-24 10:53:11 +00001950 Extract->insertBefore(II.getIterator());
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001951 Extract->takeName(&II);
1952 return IC.replaceInstUsesWith(II, Extract);
1953 }
1954
1955 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1956 if (!IntrPG)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001957 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001958
1959 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001960 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001961
1962 const auto PTruePattern =
1963 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1964
1965 // Can the intrinsic's predicate be converted to a known constant index?
Jun Ma8c471032021-08-25 17:25:39 +08001966 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1967 if (!MinNumElts)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001968 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001969
Jun Ma8c471032021-08-25 17:25:39 +08001970 unsigned Idx = MinNumElts - 1;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001971 // Increment the index if extracting the element after the last active
1972 // predicate element.
1973 if (IsAfter)
1974 ++Idx;
1975
1976 // Ignore extracts whose index is larger than the known minimum vector
1977 // length. NOTE: This is an artificial constraint where we prefer to
1978 // maintain what the user asked for until an alternative is proven faster.
1979 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1980 if (Idx >= PgVTy->getMinNumElements())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001981 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001982
1983 // The intrinsic is extracting a fixed lane so use an extract instead.
1984 auto *IdxTy = Type::getInt64Ty(II.getContext());
1985 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
Jeremy Morse8e702732025-01-24 10:53:11 +00001986 Extract->insertBefore(II.getIterator());
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001987 Extract->takeName(&II);
1988 return IC.replaceInstUsesWith(II, Extract);
1989}
1990
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001991static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1992 IntrinsicInst &II) {
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00001993 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1994 // integer variant across a variety of micro-architectures. Replace scalar
1995 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1996 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1997 // depending on the micro-architecture, but has been observed as generally
1998 // being faster, particularly when the CLAST[AB] op is a loop-carried
1999 // dependency.
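// For example (illustrative), for 32-bit integer elements:
//   %r = clasta(%pg, i32 %fallback, <vscale x 4 x i32> %v)
// becomes
//   %f = clasta(%pg, bitcast %fallback to float, bitcast %v to <vscale x 4 x float>)
//   %r = bitcast %f to i32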
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002000 Value *Pg = II.getArgOperand(0);
2001 Value *Fallback = II.getArgOperand(1);
2002 Value *Vec = II.getArgOperand(2);
2003 Type *Ty = II.getType();
2004
2005 if (!Ty->isIntegerTy())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002006 return std::nullopt;
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002007
2008 Type *FPTy;
2009 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2010 default:
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002011 return std::nullopt;
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002012 case 16:
Nikita Popov724f4a52023-05-16 18:11:17 +02002013 FPTy = IC.Builder.getHalfTy();
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002014 break;
2015 case 32:
Nikita Popov724f4a52023-05-16 18:11:17 +02002016 FPTy = IC.Builder.getFloatTy();
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002017 break;
2018 case 64:
Nikita Popov724f4a52023-05-16 18:11:17 +02002019 FPTy = IC.Builder.getDoubleTy();
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002020 break;
2021 }
2022
Nikita Popov724f4a52023-05-16 18:11:17 +02002023 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002024 auto *FPVTy = VectorType::get(
2025 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
Nikita Popov724f4a52023-05-16 18:11:17 +02002026 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2027 auto *FPII = IC.Builder.CreateIntrinsic(
2028 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2029 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002030 return IC.replaceInstUsesWith(II, FPIItoInt);
2031}
2032
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002033static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2034 IntrinsicInst &II) {
Peter Waller2d574a12021-05-12 14:47:22 +00002035 LLVMContext &Ctx = II.getContext();
Peter Waller2d574a12021-05-12 14:47:22 +00002036 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2037 // can work with RDFFR_PP for ptest elimination.
2038 auto *AllPat =
2039 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
Nikita Popov724f4a52023-05-16 18:11:17 +02002040 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2041 {II.getType()}, {AllPat});
Peter Waller2d574a12021-05-12 14:47:22 +00002042 auto *RDFFR =
Rahul Joshi74b7abf2025-03-31 08:10:34 -07002043 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
Peter Waller2d574a12021-05-12 14:47:22 +00002044 RDFFR->takeName(&II);
2045 return IC.replaceInstUsesWith(II, RDFFR);
2046}
2047
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002048static std::optional<Instruction *>
Jun Maae5433942021-06-18 11:55:01 +08002049instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2050 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2051
2052 if (Pattern == AArch64SVEPredPattern::all) {
Jun Maae5433942021-06-18 11:55:01 +08002053 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
Nikita Popov724f4a52023-05-16 18:11:17 +02002054 auto *VScale = IC.Builder.CreateVScale(StepVal);
Jun Maae5433942021-06-18 11:55:01 +08002055 VScale->takeName(&II);
2056 return IC.replaceInstUsesWith(II, VScale);
2057 }
2058
Jun Ma8c471032021-08-25 17:25:39 +08002059 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
Jun Maae5433942021-06-18 11:55:01 +08002060
Jun Ma8c471032021-08-25 17:25:39 +08002061 return MinNumElts && NumElts >= MinNumElts
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002062 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
Jun Maae5433942021-06-18 11:55:01 +08002063 II, ConstantInt::get(II.getType(), MinNumElts)))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002064 : std::nullopt;
Jun Maae5433942021-06-18 11:55:01 +08002065}
2066
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002067static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2068 IntrinsicInst &II) {
Bradley Smithdaf1a1f2022-11-11 15:24:57 +00002069 Value *PgVal = II.getArgOperand(0);
2070 Value *OpVal = II.getArgOperand(1);
2071
Bradley Smithdaf1a1f2022-11-11 15:24:57 +00002072 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2073 // Later optimizations prefer this form.
2074 if (PgVal == OpVal &&
2075 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2076 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2077 Value *Ops[] = {PgVal, OpVal};
2078 Type *Tys[] = {PgVal->getType()};
2079
2080 auto *PTest =
Nikita Popov724f4a52023-05-16 18:11:17 +02002081 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
Bradley Smithdaf1a1f2022-11-11 15:24:57 +00002082 PTest->takeName(&II);
2083
2084 return IC.replaceInstUsesWith(II, PTest);
2085 }
2086
2087 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
2088 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002089
Cullen Rhodes50621162022-11-04 08:40:18 +00002090 if (!Pg || !Op)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002091 return std::nullopt;
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002092
Cullen Rhodes50621162022-11-04 08:40:18 +00002093 Intrinsic::ID OpIID = Op->getIntrinsicID();
2094
Cullen Rhodes50621162022-11-04 08:40:18 +00002095 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2096 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2097 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2098 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2099 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
Bradley Smith191f9fa2021-07-13 14:42:36 +00002100
Nikita Popov724f4a52023-05-16 18:11:17 +02002101 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002102
2103 PTest->takeName(&II);
2104 return IC.replaceInstUsesWith(II, PTest);
2105 }
2106
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002107 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2108 // Later optimizations may rewrite sequence to use the flag-setting variant
2109 // of instruction X to remove PTEST.
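// For example (illustrative):
//   %x = brka.z(%pg, %a)
//   %t = ptest.any(%x, %x)
// becomes ptest.any(%pg, %x), letting later passes select the flag-setting
// BRKAS form and drop the PTEST.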
Cullen Rhodes50621162022-11-04 08:40:18 +00002110 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2111 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2112 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2113 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2114 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2115 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2116 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2117 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2118 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2119 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2120 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2121 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2122 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2123 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2124 Type *Tys[] = {Pg->getType()};
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002125
Nikita Popov724f4a52023-05-16 18:11:17 +02002126 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002127 PTest->takeName(&II);
2128
2129 return IC.replaceInstUsesWith(II, PTest);
2130 }
2131
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002132 return std::nullopt;
Bradley Smith191f9fa2021-07-13 14:42:36 +00002133}
2134
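// Fold a predicated multiply that feeds a predicated add/sub into a single
// fused multiply-accumulate intrinsic. For example (illustrative):
//   sve.fadd(%pg, %a, sve.fmul(%pg, %b, %c)) --> sve.fmla(%pg, %a, %b, %c)
// provided the multiply has no other uses and, for floating point, the
// fast-math flags of both operations match and allow contraction.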
Matt Devereaua107cf02022-12-15 16:09:13 +00002135template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002136static std::optional<Instruction *>
Matt Devereaua107cf02022-12-15 16:09:13 +00002137instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2138 bool MergeIntoAddendOp) {
Matt4a596942021-11-03 11:31:41 +00002139 Value *P = II.getOperand(0);
Matt Devereaua107cf02022-12-15 16:09:13 +00002140 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2141 if (MergeIntoAddendOp) {
2142 AddendOp = II.getOperand(1);
2143 Mul = II.getOperand(2);
2144 } else {
2145 AddendOp = II.getOperand(2);
2146 Mul = II.getOperand(1);
2147 }
2148
2149 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2150 m_Value(MulOp1))))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002151 return std::nullopt;
Matt4a596942021-11-03 11:31:41 +00002152
Matt Devereaua107cf02022-12-15 16:09:13 +00002153 if (!Mul->hasOneUse())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002154 return std::nullopt;
Matt4a596942021-11-03 11:31:41 +00002155
Matt Devereaua107cf02022-12-15 16:09:13 +00002156 Instruction *FMFSource = nullptr;
2157 if (II.getType()->isFPOrFPVectorTy()) {
2158 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2159 // Stop the combine when the flags on the inputs differ in case dropping
2160 // flags would lead to us missing out on more beneficial optimizations.
2161 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2162 return std::nullopt;
2163 if (!FAddFlags.allowContract())
2164 return std::nullopt;
2165 FMFSource = &II;
2166 }
Matt4a596942021-11-03 11:31:41 +00002167
Matt Devereaua107cf02022-12-15 16:09:13 +00002168 CallInst *Res;
2169 if (MergeIntoAddendOp)
Nikita Popov724f4a52023-05-16 18:11:17 +02002170 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2171 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
Matt Devereaua107cf02022-12-15 16:09:13 +00002172 else
Nikita Popov724f4a52023-05-16 18:11:17 +02002173 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2174 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
Matt Devereaua107cf02022-12-15 16:09:13 +00002175
2176 return IC.replaceInstUsesWith(II, Res);
Matt4a596942021-11-03 11:31:41 +00002177}
2178
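// Lower sve.ld1(%pg, %ptr) to a plain load when %pg is known to be all active,
// and to a generic masked load with a zero passthru otherwise, preserving
// ld1's zeroing of inactive lanes. For example (illustrative):
//   sve.ld1(ptrue(all), %ptr) --> load <vscale x 4 x i32>, ptr %ptr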
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002179static std::optional<Instruction *>
Matt Devereauf526c602021-11-04 16:10:55 +00002180instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Matt Devereauf526c602021-11-04 16:10:55 +00002181 Value *Pred = II.getOperand(0);
2182 Value *PtrOp = II.getOperand(1);
2183 Type *VecTy = II.getType();
Matt Devereauf526c602021-11-04 16:10:55 +00002184
Paul Walker01bc67e2021-12-03 14:36:54 +00002185 if (isAllActivePredicate(Pred)) {
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002186 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
Sander de Smalen0b412382022-02-11 07:53:20 +00002187 Load->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002188 return IC.replaceInstUsesWith(II, Load);
2189 }
2190
2191 CallInst *MaskedLoad =
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002192 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
Nikita Popov724f4a52023-05-16 18:11:17 +02002193 Pred, ConstantAggregateZero::get(VecTy));
Sander de Smalen0b412382022-02-11 07:53:20 +00002194 MaskedLoad->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002195 return IC.replaceInstUsesWith(II, MaskedLoad);
2196}
2197
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002198static std::optional<Instruction *>
Matt Devereauf526c602021-11-04 16:10:55 +00002199instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Matt Devereauf526c602021-11-04 16:10:55 +00002200 Value *VecOp = II.getOperand(0);
2201 Value *Pred = II.getOperand(1);
2202 Value *PtrOp = II.getOperand(2);
Matt Devereauf526c602021-11-04 16:10:55 +00002203
Paul Walker01bc67e2021-12-03 14:36:54 +00002204 if (isAllActivePredicate(Pred)) {
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002205 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
Sander de Smalen0b412382022-02-11 07:53:20 +00002206 Store->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002207 return IC.eraseInstFromFunction(II);
2208 }
2209
Nikita Popov724f4a52023-05-16 18:11:17 +02002210 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002211 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
Sander de Smalen0b412382022-02-11 07:53:20 +00002212 MaskedStore->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002213 return IC.eraseInstFromFunction(II);
2214}
2215
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002216static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2217 switch (Intrinsic) {
Jolanta Jensendc63b352023-05-17 09:21:40 +00002218 case Intrinsic::aarch64_sve_fmul_u:
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002219 return Instruction::BinaryOps::FMul;
Jolanta Jensendc63b352023-05-17 09:21:40 +00002220 case Intrinsic::aarch64_sve_fadd_u:
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002221 return Instruction::BinaryOps::FAdd;
Jolanta Jensendc63b352023-05-17 09:21:40 +00002222 case Intrinsic::aarch64_sve_fsub_u:
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002223 return Instruction::BinaryOps::FSub;
2224 default:
2225 return Instruction::BinaryOpsEnd;
2226 }
2227}
2228
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002229static std::optional<Instruction *>
2230instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
Paul Walker65031c12023-04-04 12:51:25 +00002231 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2232 if (II.isStrictFP())
2233 return std::nullopt;
2234
Matthew Devereau2ac19992021-10-04 16:56:56 +01002235 auto *OpPredicate = II.getOperand(0);
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002236 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2237 if (BinOpCode == Instruction::BinaryOpsEnd ||
Matthew Devereau91a20562025-04-13 20:40:51 +01002238 !isAllActivePredicate(OpPredicate))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002239 return std::nullopt;
Yingwei Zhenga77346b2025-01-06 14:37:04 +08002240 auto BinOp = IC.Builder.CreateBinOpFMF(
2241 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
Matthew Devereau2ac19992021-10-04 16:56:56 +01002242 return IC.replaceInstUsesWith(II, BinOp);
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002243}
2244
Matt Devereaua107cf02022-12-15 16:09:13 +00002245static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2246 IntrinsicInst &II) {
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002247 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2248 Intrinsic::aarch64_sve_mla>(
2249 IC, II, true))
2250 return MLA;
2251 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2252 Intrinsic::aarch64_sve_mad>(
2253 IC, II, false))
2254 return MAD;
2255 return std::nullopt;
2256}
2257
2258static std::optional<Instruction *>
2259instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
Matt Devereaua107cf02022-12-15 16:09:13 +00002260 if (auto FMLA =
2261 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2262 Intrinsic::aarch64_sve_fmla>(IC, II,
2263 true))
Matt4a596942021-11-03 11:31:41 +00002264 return FMLA;
Matt Devereaua107cf02022-12-15 16:09:13 +00002265 if (auto FMAD =
2266 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2267 Intrinsic::aarch64_sve_fmad>(IC, II,
2268 false))
2269 return FMAD;
Paul Walkerb7287a82023-06-17 17:51:49 +01002270 if (auto FMLA =
Jolanta Jensendc63b352023-05-17 09:21:40 +00002271 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Paul Walkerb7287a82023-06-17 17:51:49 +01002272 Intrinsic::aarch64_sve_fmla>(IC, II,
2273 true))
2274 return FMLA;
Jolanta Jensen5cd16e22023-06-20 12:51:41 +00002275 return std::nullopt;
Matt Devereaua107cf02022-12-15 16:09:13 +00002276}
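// The fusions above have roughly this shape (operand order is illustrative,
// not normative):
//   sve.fadd(pg, A, sve.fmul(pg, B, C))  =>  sve.fmla(pg, A, B, C)
// with sve.fmad tried as the alternative destructive form, and the fmul.u
// variant also accepted as the multiply.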
2277
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002278static std::optional<Instruction *>
2279instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2280 if (auto FMLA =
2281 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2282 Intrinsic::aarch64_sve_fmla>(IC, II,
2283 true))
2284 return FMLA;
2285 if (auto FMAD =
2286 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2287 Intrinsic::aarch64_sve_fmad>(IC, II,
2288 false))
2289 return FMAD;
2290 if (auto FMLA_U =
2291 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2292 Intrinsic::aarch64_sve_fmla_u>(
2293 IC, II, true))
2294 return FMLA_U;
2295 return instCombineSVEVectorBinOp(IC, II);
2296}
2297
2298static std::optional<Instruction *>
2299instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
Matt Devereaua107cf02022-12-15 16:09:13 +00002300 if (auto FMLS =
2301 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2302 Intrinsic::aarch64_sve_fmls>(IC, II,
2303 true))
2304 return FMLS;
Matt Devereaua107cf02022-12-15 16:09:13 +00002305 if (auto FMSB =
2306 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2307 Intrinsic::aarch64_sve_fnmsb>(
2308 IC, II, false))
2309 return FMSB;
Paul Walkerb7287a82023-06-17 17:51:49 +01002310 if (auto FMLS =
Jolanta Jensendc63b352023-05-17 09:21:40 +00002311 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Paul Walkerb7287a82023-06-17 17:51:49 +01002312 Intrinsic::aarch64_sve_fmls>(IC, II,
2313 true))
2314 return FMLS;
Jolanta Jensen5cd16e22023-06-20 12:51:41 +00002315 return std::nullopt;
Matt4a596942021-11-03 11:31:41 +00002316}
2317
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002318static std::optional<Instruction *>
2319instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2320 if (auto FMLS =
2321 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2322 Intrinsic::aarch64_sve_fmls>(IC, II,
2323 true))
2324 return FMLS;
2325 if (auto FMSB =
2326 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2327 Intrinsic::aarch64_sve_fnmsb>(
2328 IC, II, false))
2329 return FMSB;
2330 if (auto FMLS_U =
2331 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2332 Intrinsic::aarch64_sve_fmls_u>(
2333 IC, II, true))
2334 return FMLS_U;
2335 return instCombineSVEVectorBinOp(IC, II);
2336}
2337
2338static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2339 IntrinsicInst &II) {
2340 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2341 Intrinsic::aarch64_sve_mls>(
2342 IC, II, true))
2343 return MLS;
2344 return std::nullopt;
2345}
2346
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002347static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2348 IntrinsicInst &II) {
Usman Nadeem5420fc42021-08-05 17:23:01 -07002349 Value *UnpackArg = II.getArgOperand(0);
2350 auto *RetTy = cast<ScalableVectorType>(II.getType());
2351 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2352 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2353
2354 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2355 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2356 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2357 ScalarArg =
Nikita Popov724f4a52023-05-16 18:11:17 +02002358 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
Usman Nadeem5420fc42021-08-05 17:23:01 -07002359 Value *NewVal =
Nikita Popov724f4a52023-05-16 18:11:17 +02002360 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
Usman Nadeem5420fc42021-08-05 17:23:01 -07002361 NewVal->takeName(&II);
2362 return IC.replaceInstUsesWith(II, NewVal);
2363 }
2364
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002365 return std::nullopt;
Usman Nadeem5420fc42021-08-05 17:23:01 -07002366}
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002367static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2368 IntrinsicInst &II) {
Bradley Smith191f9fa2021-07-13 14:42:36 +00002369 auto *OpVal = II.getOperand(0);
2370 auto *OpIndices = II.getOperand(1);
2371 VectorType *VTy = cast<VectorType>(II.getType());
2372
Usman Nadeemab111e92021-09-10 17:57:29 -07002373 // Check whether OpIndices is a constant splat value < minimal element count
2374 // of result.
2375 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
Bradley Smith191f9fa2021-07-13 14:42:36 +00002376 if (!SplatValue ||
2377 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002378 return std::nullopt;
Bradley Smith191f9fa2021-07-13 14:42:36 +00002379
2380 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2381 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
Nikita Popov724f4a52023-05-16 18:11:17 +02002382 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002383 auto *VectorSplat =
Nikita Popov724f4a52023-05-16 18:11:17 +02002384 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002385
2386 VectorSplat->takeName(&II);
2387 return IC.replaceInstUsesWith(II, VectorSplat);
2388}
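// For example, with a splatted constant index that is known to be in range:
//   sve.tbl(V, sve.dup.x(2))  =>  splat(extractelement(V, 2))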
2389
Usman Nadeem267d6b52024-02-15 10:40:09 -08002390static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2391 IntrinsicInst &II) {
2392 Value *A, *B;
2393 Type *RetTy = II.getType();
2394 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2395 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2396
2397 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2398 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2399 if ((match(II.getArgOperand(0),
2400 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2401 match(II.getArgOperand(1),
2402 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2403 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2404 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2405 auto *TyA = cast<ScalableVectorType>(A->getType());
2406 if (TyA == B->getType() &&
2407 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2408 auto *SubVec = IC.Builder.CreateInsertVector(
Craig Topper123758b2025-05-02 16:10:18 -07002409 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2410 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2411 TyA->getMinNumElements());
Usman Nadeem267d6b52024-02-15 10:40:09 -08002412 ConcatVec->takeName(&II);
2413 return IC.replaceInstUsesWith(II, ConcatVec);
2414 }
2415 }
2416
2417 return std::nullopt;
2418}
2419
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002420static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2421 IntrinsicInst &II) {
Usman Nadeem757384a2021-09-12 15:53:26 -07002422 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2423 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2424 Value *A, *B;
2425 if (match(II.getArgOperand(0),
2426 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2427 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2428 m_Specific(A), m_Specific(B))))
2429 return IC.replaceInstUsesWith(
2430 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2431
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002432 return std::nullopt;
Usman Nadeem757384a2021-09-12 15:53:26 -07002433}
2434
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002435static std::optional<Instruction *>
2436instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
Peter Waller7a341452021-11-03 13:40:22 +00002437 Value *Mask = II.getOperand(0);
2438 Value *BasePtr = II.getOperand(1);
2439 Value *Index = II.getOperand(2);
2440 Type *Ty = II.getType();
Peter Waller7a341452021-11-03 13:40:22 +00002441 Value *PassThru = ConstantAggregateZero::get(Ty);
2442
2443 // Contiguous gather => masked load.
2444 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2445 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2446 Value *IndexBase;
2447 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2448 m_Value(IndexBase), m_SpecificInt(1)))) {
Peter Waller7a341452021-11-03 13:40:22 +00002449 Align Alignment =
Nikita Popov2d209d92024-06-27 16:38:15 +02002450 BasePtr->getPointerAlignment(II.getDataLayout());
Peter Waller7a341452021-11-03 13:40:22 +00002451
Nikita Popov724f4a52023-05-16 18:11:17 +02002452 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2453 BasePtr, IndexBase);
Peter Waller7a341452021-11-03 13:40:22 +00002454 CallInst *MaskedLoad =
Nikita Popov724f4a52023-05-16 18:11:17 +02002455 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
Peter Waller7a341452021-11-03 13:40:22 +00002456 MaskedLoad->takeName(&II);
2457 return IC.replaceInstUsesWith(II, MaskedLoad);
2458 }
2459
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002460 return std::nullopt;
Peter Waller7a341452021-11-03 13:40:22 +00002461}
2462
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002463static std::optional<Instruction *>
2464instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
Peter Waller7a341452021-11-03 13:40:22 +00002465 Value *Val = II.getOperand(0);
2466 Value *Mask = II.getOperand(1);
2467 Value *BasePtr = II.getOperand(2);
2468 Value *Index = II.getOperand(3);
2469 Type *Ty = Val->getType();
Peter Waller7a341452021-11-03 13:40:22 +00002470
2471 // Contiguous scatter => masked store.
Nikita Popov3196ef82022-02-08 15:16:16 +01002472 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
Peter Waller7a341452021-11-03 13:40:22 +00002473 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2474 Value *IndexBase;
2475 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2476 m_Value(IndexBase), m_SpecificInt(1)))) {
Peter Waller7a341452021-11-03 13:40:22 +00002477 Align Alignment =
Nikita Popov2d209d92024-06-27 16:38:15 +02002478 BasePtr->getPointerAlignment(II.getDataLayout());
Peter Waller7a341452021-11-03 13:40:22 +00002479
Nikita Popov724f4a52023-05-16 18:11:17 +02002480 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2481 BasePtr, IndexBase);
Nikita Popov724f4a52023-05-16 18:11:17 +02002482 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
Peter Waller7a341452021-11-03 13:40:22 +00002483
2484 return IC.eraseInstFromFunction(II);
2485 }
2486
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002487 return std::nullopt;
Peter Waller7a341452021-11-03 13:40:22 +00002488}
2489
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002490static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2491 IntrinsicInst &II) {
Nikita Popov724f4a52023-05-16 18:11:17 +02002492 Type *Int32Ty = IC.Builder.getInt32Ty();
Matt Devereaufb477252021-12-09 15:32:35 +00002493 Value *Pred = II.getOperand(0);
2494 Value *Vec = II.getOperand(1);
2495 Value *DivVec = II.getOperand(2);
2496
2497 Value *SplatValue = getSplatValue(DivVec);
2498 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2499 if (!SplatConstantInt)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002500 return std::nullopt;
Matthew Devereau1808fc12024-09-20 13:53:02 +01002501
Matt Devereaufb477252021-12-09 15:32:35 +00002502 APInt Divisor = SplatConstantInt->getValue();
Matthew Devereau1808fc12024-09-20 13:53:02 +01002503 const int64_t DivisorValue = Divisor.getSExtValue();
2504 if (DivisorValue == -1)
2505 return std::nullopt;
2506 if (DivisorValue == 1)
2507 IC.replaceInstUsesWith(II, Vec);
Matt Devereaufb477252021-12-09 15:32:35 +00002508
2509 if (Divisor.isPowerOf2()) {
2510 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
Nikita Popov724f4a52023-05-16 18:11:17 +02002511 auto ASRD = IC.Builder.CreateIntrinsic(
Matt Devereaufb477252021-12-09 15:32:35 +00002512 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2513 return IC.replaceInstUsesWith(II, ASRD);
2514 }
2515 if (Divisor.isNegatedPowerOf2()) {
2516 Divisor.negate();
2517 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
Nikita Popov724f4a52023-05-16 18:11:17 +02002518 auto ASRD = IC.Builder.CreateIntrinsic(
Matt Devereaufb477252021-12-09 15:32:35 +00002519 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
Nikita Popov724f4a52023-05-16 18:11:17 +02002520 auto NEG = IC.Builder.CreateIntrinsic(
2521 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
Matt Devereaufb477252021-12-09 15:32:35 +00002522 return IC.replaceInstUsesWith(II, NEG);
2523 }
2524
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002525 return std::nullopt;
Matt Devereaufb477252021-12-09 15:32:35 +00002526}
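// For example, for splatted divisors that are (possibly negated) powers of two
// this becomes an arithmetic-shift-right-for-divide, roughly:
//   sve.sdiv(pg, X, splat(8))   =>  sve.asrd(pg, X, 3)
//   sve.sdiv(pg, X, splat(-8))  =>  sve.neg(pg, sve.asrd(pg, X, 3))
// A divisor of 1 folds to X, and a divisor of -1 is left alone.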
2527
Matt Devereau48df06f2023-01-16 14:21:18 +00002528bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
Matt Devereaue18b9712022-12-16 11:19:28 +00002529 size_t VecSize = Vec.size();
2530 if (VecSize == 1)
2531 return true;
2532 if (!isPowerOf2_64(VecSize))
2533 return false;
2534 size_t HalfVecSize = VecSize / 2;
2535
2536 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2537 RHS != Vec.end(); LHS++, RHS++) {
Matt Devereau48df06f2023-01-16 14:21:18 +00002538 if (*LHS != nullptr && *RHS != nullptr) {
2539 if (*LHS == *RHS)
2540 continue;
2541 else
2542 return false;
2543 }
2544 if (!AllowPoison)
2545 return false;
2546 if (*LHS == nullptr && *RHS != nullptr)
2547 *LHS = *RHS;
Matt Devereaue18b9712022-12-16 11:19:28 +00002548 }
2549
2550 Vec.resize(HalfVecSize);
Matt Devereau48df06f2023-01-16 14:21:18 +00002551 SimplifyValuePattern(Vec, AllowPoison);
Matt Devereaue18b9712022-12-16 11:19:28 +00002552 return true;
2553}
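// SimplifyValuePattern repeatedly halves a power-of-two-sized sequence while
// both halves match, e.g. (A, B, A, B) reduces to (A, B). When AllowPoison is
// set, unwritten (poison) lanes act as wildcards, so (A, poison, A, B) also
// reduces to (A, B).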
2554
2555// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2556// to dupqlane(f64(C)) where C is A concatenated with B
2557static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2558 IntrinsicInst &II) {
2559 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2560 if (!match(II.getOperand(0),
2561 m_Intrinsic<Intrinsic::vector_insert>(
2562 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2563 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2564 return std::nullopt;
2565 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2566
2567 // Insert the scalars into a container ordered by InsertElement index
2568 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2569 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2570 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2571 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2572 CurrentInsertElt = InsertElt->getOperand(0);
2573 }
2574
Matt Devereau48df06f2023-01-16 14:21:18 +00002575 bool AllowPoison =
2576 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2577 if (!SimplifyValuePattern(Elts, AllowPoison))
Matt Devereaue18b9712022-12-16 11:19:28 +00002578 return std::nullopt;
2579
2580  // Rebuild the simplified chain of InsertElements, e.g. (a, b, a, b) as (a, b).
Matt Devereaue18b9712022-12-16 11:19:28 +00002581 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2582 for (size_t I = 0; I < Elts.size(); I++) {
Matt Devereau48df06f2023-01-16 14:21:18 +00002583 if (Elts[I] == nullptr)
2584 continue;
Nikita Popov724f4a52023-05-16 18:11:17 +02002585 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2586 IC.Builder.getInt64(I));
Matt Devereaue18b9712022-12-16 11:19:28 +00002587 }
Matt Devereau48df06f2023-01-16 14:21:18 +00002588 if (InsertEltChain == nullptr)
2589 return std::nullopt;
Matt Devereaue18b9712022-12-16 11:19:28 +00002590
2591 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2592 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2593 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2594 // be narrowed back to the original type.
2595 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2596 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2597 IIScalableTy->getMinNumElements() /
2598 PatternWidth;
2599
Nikita Popov724f4a52023-05-16 18:11:17 +02002600 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
Matt Devereaue18b9712022-12-16 11:19:28 +00002601 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2602 auto *WideShuffleMaskTy =
Nikita Popov724f4a52023-05-16 18:11:17 +02002603 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
Matt Devereaue18b9712022-12-16 11:19:28 +00002604
Nikita Popov724f4a52023-05-16 18:11:17 +02002605 auto InsertSubvector = IC.Builder.CreateInsertVector(
Craig Topper123758b2025-05-02 16:10:18 -07002606 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2607 uint64_t(0));
Matt Devereaue18b9712022-12-16 11:19:28 +00002608 auto WideBitcast =
Nikita Popov724f4a52023-05-16 18:11:17 +02002609 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
Matt Devereaue18b9712022-12-16 11:19:28 +00002610 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
Nikita Popov724f4a52023-05-16 18:11:17 +02002611 auto WideShuffle = IC.Builder.CreateShuffleVector(
Matt Devereaue18b9712022-12-16 11:19:28 +00002612 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2613 auto NarrowBitcast =
Nikita Popov724f4a52023-05-16 18:11:17 +02002614 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
Matt Devereaue18b9712022-12-16 11:19:28 +00002615
2616 return IC.replaceInstUsesWith(II, NarrowBitcast);
2617}
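// End to end, a dupq_lane whose lanes follow the pattern (f32 A, f32 B, f32 A,
// f32 B) is rebuilt as the two-element chain (A, B), bitcast to a wide integer
// (i64 here), splatted across the vector with a zero shuffle mask, and bitcast
// back to the original scalable type.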
2618
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002619static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2620 IntrinsicInst &II) {
Florian Hahn17a73992022-05-10 19:57:43 +01002621 Value *A = II.getArgOperand(0);
2622 Value *B = II.getArgOperand(1);
2623 if (A == B)
2624 return IC.replaceInstUsesWith(II, A);
2625
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002626 return std::nullopt;
Florian Hahn17a73992022-05-10 19:57:43 +01002627}
2628
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002629static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2630 IntrinsicInst &II) {
Bradley Smith5f4541f2022-05-06 14:45:56 +00002631 Value *Pred = II.getOperand(0);
2632 Value *Vec = II.getOperand(1);
2633 Value *Shift = II.getOperand(2);
2634
2635 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2636 Value *AbsPred, *MergedValue;
2637 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2638 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2639 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2640 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002642 return std::nullopt;
Bradley Smith5f4541f2022-05-06 14:45:56 +00002643
2644  // The transform is valid if any of the following are true:
2645 // * The ABS merge value is an undef or non-negative
2646 // * The ABS predicate is all active
2647 // * The ABS predicate and the SRSHL predicates are the same
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002648 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
Bradley Smith5f4541f2022-05-06 14:45:56 +00002649 AbsPred != Pred && !isAllActivePredicate(AbsPred))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002650 return std::nullopt;
Bradley Smith5f4541f2022-05-06 14:45:56 +00002651
2652 // Only valid when the shift amount is non-negative, otherwise the rounding
2653 // behaviour of SRSHL cannot be ignored.
2654 if (!match(Shift, m_NonNegative()))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002655 return std::nullopt;
Bradley Smith5f4541f2022-05-06 14:45:56 +00002656
Nikita Popov724f4a52023-05-16 18:11:17 +02002657 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2658 {II.getType()}, {Pred, Vec, Shift});
Bradley Smith5f4541f2022-05-06 14:45:56 +00002659
2660 return IC.replaceInstUsesWith(II, LSL);
2661}
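// For example, once the conditions above hold (non-negative input via abs and
// a non-negative shift amount), the rounding behaviour of SRSHL is irrelevant:
//   sve.srshl(pg, sve.abs(pg, X), splat(2))  =>  sve.lsl(pg, sve.abs(pg, X), splat(2))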
2662
Paul Walker622ae7f2024-09-24 15:11:36 +01002663static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2664 IntrinsicInst &II) {
2665 Value *Vec = II.getOperand(0);
2666
2667 if (getSplatValue(Vec) == II.getOperand(1))
2668 return IC.replaceInstUsesWith(II, Vec);
2669
2670 return std::nullopt;
2671}
2672
Danila Malyutin1a609052024-10-17 21:04:04 +04002673static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2674 IntrinsicInst &II) {
2675  // If this barrier is post-dominated by an identical one, we can remove it.
2676 auto *NI = II.getNextNonDebugInstruction();
2677 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2678 auto CanSkipOver = [](Instruction *I) {
2679 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2680 };
2681 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2682 auto *NIBB = NI->getParent();
2683 NI = NI->getNextNonDebugInstruction();
2684 if (!NI) {
2685 if (auto *SuccBB = NIBB->getUniqueSuccessor())
Jeremy Morse81d18ad82025-01-27 16:27:54 +00002686 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
Danila Malyutin1a609052024-10-17 21:04:04 +04002687 else
2688 break;
2689 }
2690 }
2691 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2692 if (NextII && II.isIdenticalTo(NextII))
2693 return IC.eraseInstFromFunction(II);
2694
2695 return std::nullopt;
2696}
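// For example, two identical barriers separated only by instructions that
// neither touch memory nor have other side effects collapse to one:
//   call void @llvm.aarch64.dmb(i32 11)        ; erased
//   %p = getelementptr i8, ptr %base, i64 4    ; skippable
//   call void @llvm.aarch64.dmb(i32 11)        ; kept
// (the i32 operand shown is illustrative; any identical barrier kind works)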
2697
Matthew Devereau91a20562025-04-13 20:40:51 +01002698static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2699 IntrinsicInst &II) {
2700 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2701 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2702 return std::nullopt;
2703}
2704
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002705std::optional<Instruction *>
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002706AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2707 IntrinsicInst &II) const {
Paul Walkerc1927372025-04-01 13:27:46 +01002708 const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
2709 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2710 return I;
2711
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002712 Intrinsic::ID IID = II.getIntrinsicID();
2713 switch (IID) {
2714 default:
2715 break;
Danila Malyutin1a609052024-10-17 21:04:04 +04002716 case Intrinsic::aarch64_dmb:
2717 return instCombineDMB(IC, II);
Florian Hahn17a73992022-05-10 19:57:43 +01002718 case Intrinsic::aarch64_neon_fmaxnm:
2719 case Intrinsic::aarch64_neon_fminnm:
2720 return instCombineMaxMinNM(IC, II);
Bradley Smithc8f20ed2021-04-26 16:19:25 +01002721 case Intrinsic::aarch64_sve_convert_from_svbool:
2722 return instCombineConvertFromSVBool(IC, II);
Bradley Smith89085bc2021-04-23 13:55:42 +01002723 case Intrinsic::aarch64_sve_dup:
2724 return instCombineSVEDup(IC, II);
Usman Nadeemab111e92021-09-10 17:57:29 -07002725 case Intrinsic::aarch64_sve_dup_x:
2726 return instCombineSVEDupX(IC, II);
Bradley Smith60c9b5f2021-05-20 11:13:34 +01002727 case Intrinsic::aarch64_sve_cmpne:
2728 case Intrinsic::aarch64_sve_cmpne_wide:
2729 return instCombineSVECmpNE(IC, II);
Peter Waller2d574a12021-05-12 14:47:22 +00002730 case Intrinsic::aarch64_sve_rdffr:
2731 return instCombineRDFFR(IC, II);
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002732 case Intrinsic::aarch64_sve_lasta:
2733 case Intrinsic::aarch64_sve_lastb:
2734 return instCombineSVELast(IC, II);
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002735 case Intrinsic::aarch64_sve_clasta_n:
2736 case Intrinsic::aarch64_sve_clastb_n:
2737 return instCombineSVECondLast(IC, II);
Jun Maae5433942021-06-18 11:55:01 +08002738 case Intrinsic::aarch64_sve_cntd:
2739 return instCombineSVECntElts(IC, II, 2);
2740 case Intrinsic::aarch64_sve_cntw:
2741 return instCombineSVECntElts(IC, II, 4);
2742 case Intrinsic::aarch64_sve_cnth:
2743 return instCombineSVECntElts(IC, II, 8);
2744 case Intrinsic::aarch64_sve_cntb:
2745 return instCombineSVECntElts(IC, II, 16);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002746 case Intrinsic::aarch64_sve_ptest_any:
2747 case Intrinsic::aarch64_sve_ptest_first:
2748 case Intrinsic::aarch64_sve_ptest_last:
2749 return instCombineSVEPTest(IC, II);
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002750 case Intrinsic::aarch64_sve_fadd:
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002751 return instCombineSVEVectorFAdd(IC, II);
Jolanta Jensendc63b352023-05-17 09:21:40 +00002752 case Intrinsic::aarch64_sve_fadd_u:
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002753 return instCombineSVEVectorFAddU(IC, II);
Jolanta Jensenecb07f42023-05-17 09:21:40 +00002754 case Intrinsic::aarch64_sve_fmul_u:
Paul Walkera7999f32025-04-17 15:58:39 +01002755 return instCombineSVEVectorBinOp(IC, II);
Jolanta Jensenecb07f42023-05-17 09:21:40 +00002756 case Intrinsic::aarch64_sve_fsub:
2757 return instCombineSVEVectorFSub(IC, II);
2758 case Intrinsic::aarch64_sve_fsub_u:
2759 return instCombineSVEVectorFSubU(IC, II);
Matt Devereaua107cf02022-12-15 16:09:13 +00002760 case Intrinsic::aarch64_sve_add:
2761 return instCombineSVEVectorAdd(IC, II);
Jolanta Jensen105d63a2023-05-12 13:00:55 +00002762 case Intrinsic::aarch64_sve_add_u:
2763 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2764 Intrinsic::aarch64_sve_mla_u>(
2765 IC, II, true);
Matt Devereaua107cf02022-12-15 16:09:13 +00002766 case Intrinsic::aarch64_sve_sub:
2767 return instCombineSVEVectorSub(IC, II);
Jolanta Jensen105d63a2023-05-12 13:00:55 +00002768 case Intrinsic::aarch64_sve_sub_u:
2769 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2770 Intrinsic::aarch64_sve_mls_u>(
2771 IC, II, true);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002772 case Intrinsic::aarch64_sve_tbl:
2773 return instCombineSVETBL(IC, II);
Usman Nadeem5420fc42021-08-05 17:23:01 -07002774 case Intrinsic::aarch64_sve_uunpkhi:
2775 case Intrinsic::aarch64_sve_uunpklo:
2776 case Intrinsic::aarch64_sve_sunpkhi:
2777 case Intrinsic::aarch64_sve_sunpklo:
2778 return instCombineSVEUnpack(IC, II);
Usman Nadeem267d6b52024-02-15 10:40:09 -08002779 case Intrinsic::aarch64_sve_uzp1:
2780 return instCombineSVEUzp1(IC, II);
Usman Nadeem757384a2021-09-12 15:53:26 -07002781 case Intrinsic::aarch64_sve_zip1:
2782 case Intrinsic::aarch64_sve_zip2:
2783 return instCombineSVEZip(IC, II);
Peter Waller7a341452021-11-03 13:40:22 +00002784 case Intrinsic::aarch64_sve_ld1_gather_index:
2785 return instCombineLD1GatherIndex(IC, II);
2786 case Intrinsic::aarch64_sve_st1_scatter_index:
2787 return instCombineST1ScatterIndex(IC, II);
Matt Devereauf526c602021-11-04 16:10:55 +00002788 case Intrinsic::aarch64_sve_ld1:
2789 return instCombineSVELD1(IC, II, DL);
2790 case Intrinsic::aarch64_sve_st1:
2791 return instCombineSVEST1(IC, II, DL);
Matt Devereaufb477252021-12-09 15:32:35 +00002792 case Intrinsic::aarch64_sve_sdiv:
2793 return instCombineSVESDIV(IC, II);
Matt Devereaua9e08bc2022-03-16 11:41:14 +00002794 case Intrinsic::aarch64_sve_sel:
2795 return instCombineSVESel(IC, II);
Bradley Smith5f4541f2022-05-06 14:45:56 +00002796 case Intrinsic::aarch64_sve_srshl:
2797 return instCombineSVESrshl(IC, II);
Matt Devereaue18b9712022-12-16 11:19:28 +00002798 case Intrinsic::aarch64_sve_dupq_lane:
2799 return instCombineSVEDupqLane(IC, II);
Paul Walker622ae7f2024-09-24 15:11:36 +01002800 case Intrinsic::aarch64_sve_insr:
2801 return instCombineSVEInsr(IC, II);
Matthew Devereau91a20562025-04-13 20:40:51 +01002802 case Intrinsic::aarch64_sve_ptrue:
2803 return instCombinePTrue(IC, II);
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002804 }
2805
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002806 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002807}
2808
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002809std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
David Green61888d92022-01-13 11:53:12 +00002810 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2811 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2812 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2813 SimplifyAndSetOp) const {
2814 switch (II.getIntrinsicID()) {
2815 default:
2816 break;
2817 case Intrinsic::aarch64_neon_fcvtxn:
2818 case Intrinsic::aarch64_neon_rshrn:
2819 case Intrinsic::aarch64_neon_sqrshrn:
2820 case Intrinsic::aarch64_neon_sqrshrun:
2821 case Intrinsic::aarch64_neon_sqshrn:
2822 case Intrinsic::aarch64_neon_sqshrun:
2823 case Intrinsic::aarch64_neon_sqxtn:
2824 case Intrinsic::aarch64_neon_sqxtun:
2825 case Intrinsic::aarch64_neon_uqrshrn:
2826 case Intrinsic::aarch64_neon_uqshrn:
2827 case Intrinsic::aarch64_neon_uqxtn:
2828 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2829 break;
2830 }
2831
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002832 return std::nullopt;
David Green61888d92022-01-13 11:53:12 +00002833}
2834
Paul Walker7775a482024-08-05 11:25:44 +01002835bool AArch64TTIImpl::enableScalableVectorization() const {
2836 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2837 EnableScalableAutovecInStreamingMode);
2838}
2839
Sander de Smalen137459a2022-10-19 14:14:00 +00002840TypeSize
2841AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2842 switch (K) {
2843 case TargetTransformInfo::RGK_Scalar:
Sander de Smalen81b7f112023-11-22 08:52:53 +00002844 return TypeSize::getFixed(64);
Sander de Smalen137459a2022-10-19 14:14:00 +00002845 case TargetTransformInfo::RGK_FixedWidthVector:
Sander de Smalen738533c2024-06-24 11:06:16 +01002846 if (ST->useSVEForFixedLengthVectors() &&
2847 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
Sander de Smalen81b7f112023-11-22 08:52:53 +00002848 return TypeSize::getFixed(
2849 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
Sander de Smalen738533c2024-06-24 11:06:16 +01002850 else if (ST->isNeonAvailable())
2851 return TypeSize::getFixed(128);
2852 else
2853 return TypeSize::getFixed(0);
Sander de Smalen137459a2022-10-19 14:14:00 +00002854 case TargetTransformInfo::RGK_ScalableVector:
Sander de Smalen738533c2024-06-24 11:06:16 +01002855 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2856 EnableScalableAutovecInStreamingMode))
2857 return TypeSize::getScalable(128);
2858 else
Sander de Smalen81b7f112023-11-22 08:52:53 +00002859 return TypeSize::getScalable(0);
Sander de Smalen137459a2022-10-19 14:14:00 +00002860 }
2861 llvm_unreachable("Unsupported register kind");
2862}
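// For example, with SVE available and a known minimum SVE vector size of 256
// bits (e.g. set via -msve-vector-bits), RGK_FixedWidthVector reports 256,
// while a NEON-only subtarget reports the fixed 128-bit width and
// RGK_ScalableVector reports a scalable 128 bits.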
2863
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002864bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
David Green1712ae62023-07-12 13:13:06 +01002865 ArrayRef<const Value *> Args,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03002866 Type *SrcOverrideTy) const {
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002867 // A helper that returns a vector type from the given type. The number of
David Kreitzer6918a152022-04-29 12:26:13 -07002868  // elements in DstTy determines the vector width.
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002869 auto toVectorTy = [&](Type *ArgTy) {
Caroline Concatto6c4d8f42020-11-11 14:41:01 +00002870 return VectorType::get(ArgTy->getScalarType(),
2871 cast<VectorType>(DstTy)->getElementCount());
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002872 };
2873
David Green1712ae62023-07-12 13:13:06 +01002874 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2875 // i32, i64]. SVE doesn't generally have the same set of instructions to
David Greenf2a92db2022-11-30 13:09:48 +00002876 // perform an extend with the add/sub/mul. There are SMULLB style
2877 // instructions, but they operate on top/bottom, requiring some sort of lane
2878 // interleaving to be used with zext/sext.
David Green1712ae62023-07-12 13:13:06 +01002879 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2880 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2881 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002882 return false;
2883
2884 // Determine if the operation has a widening variant. We consider both the
2885 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2886 // instructions.
2887 //
David Green2abaa022022-04-04 12:45:04 +01002888 // TODO: Add additional widening operations (e.g., shl, etc.) once we
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002889 // verify that their extending operands are eliminated during code
2890 // generation.
David Green1712ae62023-07-12 13:13:06 +01002891 Type *SrcTy = SrcOverrideTy;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002892 switch (Opcode) {
2893 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2894 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
David Green1712ae62023-07-12 13:13:06 +01002895 // The second operand needs to be an extend
2896 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2897 if (!SrcTy)
2898 SrcTy =
2899 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2900 } else
2901 return false;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002902 break;
David Green1712ae62023-07-12 13:13:06 +01002903 case Instruction::Mul: { // SMULL(2), UMULL(2)
2904 // Both operands need to be extends of the same type.
2905 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2906 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2907 if (!SrcTy)
2908 SrcTy =
2909 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2910 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2911 // If one of the operands is a Zext and the other has enough zero bits to
2912    // be treated as unsigned, we can still generate a umull, meaning the zext
2913 // is free.
2914 KnownBits Known =
2915 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2916 if (Args[0]->getType()->getScalarSizeInBits() -
2917 Known.Zero.countLeadingOnes() >
2918 DstTy->getScalarSizeInBits() / 2)
2919 return false;
2920 if (!SrcTy)
2921 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2922 DstTy->getScalarSizeInBits() / 2));
2923 } else
2924 return false;
2925 break;
2926 }
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002927 default:
2928 return false;
2929 }
2930
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002931 // Legalize the destination type and ensure it can be used in a widening
2932 // operation.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03002933 auto DstTyL = getTypeLegalizationCost(DstTy);
David Green1712ae62023-07-12 13:13:06 +01002934 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002935 return false;
2936
2937 // Legalize the source type and ensure it can be used in a widening
2938 // operation.
David Green1712ae62023-07-12 13:13:06 +01002939 assert(SrcTy && "Expected some SrcTy");
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03002940 auto SrcTyL = getTypeLegalizationCost(SrcTy);
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002941 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2942 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2943 return false;
2944
2945 // Get the total number of vector elements in the legalized types.
Daniil Fukalov3489c2d2021-04-29 16:02:51 +03002946 InstructionCost NumDstEls =
2947 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2948 InstructionCost NumSrcEls =
2949 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002950
2951 // Return true if the legalized types have the same number of vector elements
2952 // and the destination element type size is twice that of the source type.
David Green1712ae62023-07-12 13:13:06 +01002953 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002954}
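// For example, on NEON the extend feeding a widening add or multiply is free
// because the widening instruction performs it, roughly (illustrative IR):
//   add <4 x i32> %a, (zext <4 x i16> %b to <4 x i32>)        ; uaddw
//   mul (sext <8 x i8> %a to <8 x i16>), (sext <8 x i8> %b)   ; smull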
2955
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00002956// s/urhadd instructions implement the following pattern, making the
2957// extends free:
2958// %x = add ((zext i8 -> i16), 1)
2959// %y = (zext i8 -> i16)
2960// trunc i16 (lshr (add %x, %y), 1) -> i8
2961//
zhongyundef41223ee2023-09-01 23:40:21 +08002962bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03002963 Type *Src) const {
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00002964 // The source should be a legal vector type.
2965 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2966 (Src->isScalableTy() && !ST->hasSVE2()))
2967 return false;
2968
2969 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2970 return false;
2971
2972 // Look for trunc/shl/add before trying to match the pattern.
2973 const Instruction *Add = ExtUser;
2974 auto *AddUser =
2975 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2976 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2977 Add = AddUser;
2978
2979 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2980 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2981 return false;
2982
2983 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2984 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2985 Src->getScalarSizeInBits() !=
2986 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2987 return false;
2988
2989 // Try to match the whole pattern. Ext could be either the first or second
2990 // m_ZExtOrSExt matched.
2991 Instruction *Ex1, *Ex2;
2992 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2993 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2994 return false;
2995
2996 // Ensure both extends are of the same type
2997 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2998 Ex1->getOpcode() == Ex2->getOpcode())
2999 return true;
3000
3001 return false;
3002}
3003
Sander de Smalen92d84212021-01-21 13:40:22 +00003004InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3005 Type *Src,
3006 TTI::CastContextHint CCH,
3007 TTI::TargetCostKind CostKind,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003008 const Instruction *I) const {
Tim Northover3b0846e2014-05-24 12:50:23 +00003009 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3010 assert(ISD && "Invalid opcode");
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003011 // If the cast is observable, and it is used by a widening instruction (e.g.,
3012 // uaddl, saddw, etc.), it may be free.
David Green2abaa022022-04-04 12:45:04 +01003013 if (I && I->hasOneUser()) {
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003014 auto *SingleUser = cast<Instruction>(*I->user_begin());
3015 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
David Green1712ae62023-07-12 13:13:06 +01003016 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
3017 // For adds only count the second operand as free if both operands are
3018 // extends but not the same operation. (i.e both operands are not free in
3019 // add(sext, zext)).
3020 if (SingleUser->getOpcode() == Instruction::Add) {
3021 if (I == SingleUser->getOperand(1) ||
3022 (isa<CastInst>(SingleUser->getOperand(1)) &&
3023 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003024 return 0;
David Green1712ae62023-07-12 13:13:06 +01003025 } else // Others are free so long as isWideningInstruction returned true.
3026 return 0;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003027 }
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00003028
3029 // The cast will be free for the s/urhadd instructions
3030 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
zhongyundef41223ee2023-09-01 23:40:21 +08003031 isExtPartOfAvgExpr(SingleUser, Dst, Src))
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00003032 return 0;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003033 }
3034
Sam Parker8aaabad2020-05-26 11:27:57 +01003035 // TODO: Allow non-throughput costs that aren't binary.
Sander de Smalen92d84212021-01-21 13:40:22 +00003036 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
Sam Parker8aaabad2020-05-26 11:27:57 +01003037 if (CostKind != TTI::TCK_RecipThroughput)
3038 return Cost == 0 ? 0 : 1;
3039 return Cost;
3040 };
3041
Mehdi Amini44ede332015-07-09 02:09:04 +00003042 EVT SrcTy = TLI->getValueType(DL, Src);
3043 EVT DstTy = TLI->getValueType(DL, Dst);
Tim Northover3b0846e2014-05-24 12:50:23 +00003044
3045 if (!SrcTy.isSimple() || !DstTy.isSimple())
David Green60280e92020-07-29 13:32:53 +01003046 return AdjustCost(
3047 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
Tim Northover3b0846e2014-05-24 12:50:23 +00003048
David Green2db7b312025-01-07 09:39:08 +00003049 static const TypeConversionCostTblEntry BF16Tbl[] = {
3050 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3051 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3052 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3053 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3054 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3055 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3056 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3057 };
3058
3059 if (ST->hasBF16())
3060 if (const auto *Entry = ConvertCostTableLookup(
3061 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3062 return AdjustCost(Entry->Cost);
3063
Graham Hunterf737df72025-03-25 10:43:44 +00003064 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3065 // The cost of unpacking twice is artificially increased for now in order
3066 // to avoid regressions against NEON, which will use tbl instructions directly
3067 // instead of multiple layers of [s|u]unpk[lo|hi].
3068 // We use the unpacks in cases where the destination type is illegal and
3069 // requires splitting of the input, even if the input type itself is legal.
3070 const unsigned int SVE_EXT_COST = 1;
3071 const unsigned int SVE_FCVT_COST = 1;
3072 const unsigned int SVE_UNPACK_ONCE = 4;
3073 const unsigned int SVE_UNPACK_TWICE = 16;
3074
David Greenca884002024-12-09 23:41:18 +00003075 static const TypeConversionCostTblEntry ConversionTbl[] = {
3076 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3077 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3078 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3079 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3080 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3081 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3082 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3083 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3084 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3085 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3086 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3087 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3088 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3089 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3090 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3091 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3092 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3093 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3094 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3095 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
Silviu Barangab322aa62015-08-17 16:05:09 +00003096
David Greenca884002024-12-09 23:41:18 +00003097 // Truncations on nxvmiN
David Sherwoodeaf482f2024-12-19 10:07:41 +00003098 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3099 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3100 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3101 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3102 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3103 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3104 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3105 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3106 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3107 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3108 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3109 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3110 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3111 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3112 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3113 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3114 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3115 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3116 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3117 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3118 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3119 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3120 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3121 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3122 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3123 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3124 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3125 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3126 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3127 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3128 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3129 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3130 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003131
David Greenca884002024-12-09 23:41:18 +00003132 // The number of shll instructions for the extension.
3133 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3134 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3135 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3136 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3137 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3138 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3139 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3140 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3141 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3142 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3143 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3144 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3145 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3146 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3147 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3148 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
Silviu Barangab322aa62015-08-17 16:05:09 +00003149
David Green2f18b5e2024-12-11 06:26:41 +00003150 // FP Ext and trunc
3151 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3152 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3153 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3154 // FP16
3155 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3156 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3157 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3158 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3159 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3160 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3161 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
David Green2db7b312025-01-07 09:39:08 +00003162 // BF16 (uses shift)
3163 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3164 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3165 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3166 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3167 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3168 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3169 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
David Green2f18b5e2024-12-11 06:26:41 +00003170 // FP Ext and trunc
3171 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3172 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3173 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3174 // FP16
3175 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3176 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3177 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3178 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3179 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3180 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3181 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
David Green2db7b312025-01-07 09:39:08 +00003182 // BF16 (more complex, with +bf16 is handled above)
3183 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3184 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3185 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3186 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3187 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3188 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3189 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3190 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
David Green2f18b5e2024-12-11 06:26:41 +00003191
David Greenca884002024-12-09 23:41:18 +00003192 // LowerVectorINT_TO_FP:
3193 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3194 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3195 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3196 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3197 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3198 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
Tim Northoveref0d7602014-06-15 09:27:06 +00003199
Graham Hunterf737df72025-03-25 10:43:44 +00003200 // SVE: to nxv2f16
3201 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3202 SVE_EXT_COST + SVE_FCVT_COST},
3203 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3204 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3205 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3206 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3207 SVE_EXT_COST + SVE_FCVT_COST},
3208 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3209 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3210 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3211
3212 // SVE: to nxv4f16
3213 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3214 SVE_EXT_COST + SVE_FCVT_COST},
3215 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3216 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3217 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3218 SVE_EXT_COST + SVE_FCVT_COST},
3219 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3220 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3221
3222 // SVE: to nxv8f16
3223 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3224 SVE_EXT_COST + SVE_FCVT_COST},
3225 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3226 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3227 SVE_EXT_COST + SVE_FCVT_COST},
3228 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3229
3230 // SVE: to nxv16f16
3231 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3232 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3233 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3234 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3235
David Greenca884002024-12-09 23:41:18 +00003236 // Complex: to v2f32
3237 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3238 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
David Greenca884002024-12-09 23:41:18 +00003239 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3240 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
Tim Northoveref0d7602014-06-15 09:27:06 +00003241
Graham Hunterf737df72025-03-25 10:43:44 +00003242 // SVE: to nxv2f32
3243 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3244 SVE_EXT_COST + SVE_FCVT_COST},
3245 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3246 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3247 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3248 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3249 SVE_EXT_COST + SVE_FCVT_COST},
3250 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3251 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3252 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3253
David Greenca884002024-12-09 23:41:18 +00003254 // Complex: to v4f32
3255 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3256 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3257 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3258 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
Tim Northoveref0d7602014-06-15 09:27:06 +00003259
Graham Hunterf737df72025-03-25 10:43:44 +00003260 // SVE: to nxv4f32
3261 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3262 SVE_EXT_COST + SVE_FCVT_COST},
3263 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3264 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3265 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3266 SVE_EXT_COST + SVE_FCVT_COST},
3267 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3268      {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3269
David Greenca884002024-12-09 23:41:18 +00003270 // Complex: to v8f32
3271 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3272 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3273 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3274 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
Silviu Barangab322aa62015-08-17 16:05:09 +00003275
Graham Hunterf737df72025-03-25 10:43:44 +00003276 // SVE: to nxv8f32
3277 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3278 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3279 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3280 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3281 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3282 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3283 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3284 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3285
3286 // SVE: to nxv16f32
3287 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3288 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3289 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3290 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3291
David Greenca884002024-12-09 23:41:18 +00003292 // Complex: to v16f32
3293 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3294 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
Silviu Barangab322aa62015-08-17 16:05:09 +00003295
David Greenca884002024-12-09 23:41:18 +00003296 // Complex: to v2f64
3297 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3298 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3299 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3300 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3301 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3302 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
Tim Northoveref0d7602014-06-15 09:27:06 +00003303
Graham Hunterf737df72025-03-25 10:43:44 +00003304 // SVE: to nxv2f64
3305 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3306 SVE_EXT_COST + SVE_FCVT_COST},
3307 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3308 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3309 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3310 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3311 SVE_EXT_COST + SVE_FCVT_COST},
3312 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3313 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3314 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3315
David Greenca884002024-12-09 23:41:18 +00003316 // Complex: to v4f64
3317 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3318 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
Tim Northoveref0d7602014-06-15 09:27:06 +00003319
Graham Hunterf737df72025-03-25 10:43:44 +00003320 // SVE: to nxv4f64
3321 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3322 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3323 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3324 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3325 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3326 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3327 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3328 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3329 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3330 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3331 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3332 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3333
3334 // SVE: to nxv8f64
3335 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3336 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3337 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3338 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3339 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3340 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3341 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3342 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3343
David Greenca884002024-12-09 23:41:18 +00003344 // LowerVectorFP_TO_INT
3345 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3346 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3347 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3348 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3349 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3350 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
Tim Northoveref0d7602014-06-15 09:27:06 +00003351
David Greenca884002024-12-09 23:41:18 +00003352 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3353 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3354 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3355 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3356 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3357 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3358 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
Tim Northoverdbecc3b2014-06-15 09:27:15 +00003359
David Greenca884002024-12-09 23:41:18 +00003360 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3361 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3362 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3363 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3364 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
Tim Northoverdbecc3b2014-06-15 09:27:15 +00003365
David Greenca884002024-12-09 23:41:18 +00003366 // Complex, from nxv2f32.
3367 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3368 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3369 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3370 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3371 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3372 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3373 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3374 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003375
David Greenca884002024-12-09 23:41:18 +00003376 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3377 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3378 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3379 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3380 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3381 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3382 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003383
David Greenca884002024-12-09 23:41:18 +00003384 // Complex, from nxv2f64.
3385 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3386 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3387 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3388 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003389 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
David Greenca884002024-12-09 23:41:18 +00003390 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3391 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3392 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3393 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003394 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003395
David Greenca884002024-12-09 23:41:18 +00003396 // Complex, from nxv4f32.
3397 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3398 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3399 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3400 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003401 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
David Greenca884002024-12-09 23:41:18 +00003402 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3403 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3404 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3405 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003406 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003407
David Greenca884002024-12-09 23:41:18 +00003408 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3409 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3410 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3411 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3412 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003413
David Greenca884002024-12-09 23:41:18 +00003414 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3415 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3416 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3417 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3418 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3419 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3420 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003421
David Greenca884002024-12-09 23:41:18 +00003422 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3423 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3424 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3425 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3426 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
David Sherwood57ca65e2021-04-06 11:06:58 +01003427
David Greenca884002024-12-09 23:41:18 +00003428 // Complex, from nxv8f16.
3429 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3430 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3431 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3432 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003433 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
David Greenca884002024-12-09 23:41:18 +00003434 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3435 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3436 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3437 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003438 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
David Sherwood57ca65e2021-04-06 11:06:58 +01003439
David Greenca884002024-12-09 23:41:18 +00003440 // Complex, from nxv4f16.
3441 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3442 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3443 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3444 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3445 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3446 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3447 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3448 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
David Sherwood57ca65e2021-04-06 11:06:58 +01003449
David Greenca884002024-12-09 23:41:18 +00003450 // Complex, from nxv2f16.
3451 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3452 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3453 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3454 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3455 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3456 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3457 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3458 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003459
David Greenca884002024-12-09 23:41:18 +00003460 // Truncate from nxvmf32 to nxvmf16.
3461 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3462 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3463 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003464
David Greenca884002024-12-09 23:41:18 +00003465 // Truncate from nxvmf64 to nxvmf16.
3466 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3467 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3468 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003469
David Greenca884002024-12-09 23:41:18 +00003470 // Truncate from nxvmf64 to nxvmf32.
3471 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3472 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3473 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003474
David Greenca884002024-12-09 23:41:18 +00003475 // Extend from nxvmf16 to nxvmf32.
3476 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3477 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3478 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003479
David Greenca884002024-12-09 23:41:18 +00003480 // Extend from nxvmf16 to nxvmf64.
3481 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3482 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3483 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003484
David Greenca884002024-12-09 23:41:18 +00003485 // Extend from nxvmf32 to nxvmf64.
3486 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3487 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3488 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003489
David Greenca884002024-12-09 23:41:18 +00003490 // Bitcasts from float to integer
3491 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3492 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3493 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
Alban Bridonneau2feddb32022-01-26 13:33:38 +00003494
David Greenca884002024-12-09 23:41:18 +00003495 // Bitcasts from integer to float
3496 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3497 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3498 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
Hassnaa Hamdi045eec62023-04-19 09:23:13 +00003499
David Greenca884002024-12-09 23:41:18 +00003500 // Add cost for extending to illegal (too wide) scalable vectors.
3501 // Zero/sign extends are implemented by multiple unpack operations,
3502 // where each operation has a cost of 1.
3503 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3504 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3505 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3506 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3507 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3508 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
Hassnaa Hamdi045eec62023-04-19 09:23:13 +00003509
David Greenca884002024-12-09 23:41:18 +00003510 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3511 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3512 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3513 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3514 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3515 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
Tim Northover3b0846e2014-05-24 12:50:23 +00003516 };
3517
Dinar Temirbulatov73668cc2023-05-15 16:18:45 +00003518 // We have to estimate the cost of a fixed-length operation performed on
3519 // SVE registers, using the number of SVE registers required to
3520 // represent the fixed-length type.
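  // For example, a fixed-length <8 x i64> -> <8 x double> cast handled via SVE
  // is costed as LT.first copies of the equivalent <vscale x 2 x i64> ->
  // <vscale x 2 x double> cast, since AArch64::SVEBitsPerBlock / 64 == 2.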
3521 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3522 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3523 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3524 ST->useSVEForFixedLengthVectors(WiderTy)) {
3525 std::pair<InstructionCost, MVT> LT =
3526 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
David Greenca884002024-12-09 23:41:18 +00003527 unsigned NumElements =
3528 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
Dinar Temirbulatov73668cc2023-05-15 16:18:45 +00003529 return AdjustCost(
3530 LT.first *
3531 getCastInstrCost(
3532 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3533 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3534 CostKind, I));
3535 }
3536
David Greenca884002024-12-09 23:41:18 +00003537 if (const auto *Entry = ConvertCostTableLookup(
3538 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
Sam Parker8aaabad2020-05-26 11:27:57 +01003539 return AdjustCost(Entry->Cost);
Tim Northover3b0846e2014-05-24 12:50:23 +00003540
David Green47f4cd92022-03-03 11:17:24 +00003541 static const TypeConversionCostTblEntry FP16Tbl[] = {
3542 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3543 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3544 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3545 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3546 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3547 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3548 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3549 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3550 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3551 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3552 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3553 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3554 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3555 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3556 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3557 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3558 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3559 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
Florian Hahnaa590e52022-03-11 10:27:17 +00003560 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3561 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3562 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushll(2) + 2 * ucvtf
3563 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshll(2) + 2 * scvtf
David Green47f4cd92022-03-03 11:17:24 +00003564 };
3565
3566 if (ST->hasFullFP16())
3567 if (const auto *Entry = ConvertCostTableLookup(
3568 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3569 return AdjustCost(Entry->Cost);
3570
David Greene2202b92025-03-26 07:26:17 +00003571 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3572 // double-rounding issues.
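  // For example, a <2 x i64> -> <2 x float> sitofp is costed as two scalar
  // i64 -> float conversions plus the scalarization overhead of extracting
  // both source elements and inserting both results.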
3573 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3574 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3575 isa<FixedVectorType>(Dst) && isa<FixedVectorType>(Src))
3576 return AdjustCost(
3577 cast<FixedVectorType>(Dst)->getNumElements() *
3578 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3579 CCH, CostKind) +
3580 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Src), false, true,
3581 CostKind) +
3582 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Dst), true, false,
3583 CostKind));
3584
David Sherwoodfad69a52023-10-02 10:50:56 +01003585 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
Sander de Smalenc4366492024-06-25 13:27:06 +01003586 CCH == TTI::CastContextHint::Masked &&
3587 ST->isSVEorStreamingSVEAvailable() &&
David Sherwoodfad69a52023-10-02 10:50:56 +01003588 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3589 TargetLowering::TypePromoteInteger &&
3590 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3591 TargetLowering::TypeSplitVector) {
3592 // The standard behaviour in the backend for these cases is to split the
3593 // extend up into two parts:
3594 // 1. Perform an extending load or masked load up to the legal type.
3595 // 2. Extend the loaded data to the final type.
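    // For example, a zero-extending masked load from <vscale x 4 x i8> to
    // <vscale x 4 x i64> is costed as an extending masked load up to the
    // legal <vscale x 4 x i32> type plus a <vscale x 4 x i32> ->
    // <vscale x 4 x i64> extend.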
3596 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3597 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3598 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3599 Opcode, LegalTy, Src, CCH, CostKind, I);
3600 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3601 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3602 return Part1 + Part2;
3603 }
3604
David Sherwoodafc2b7d2023-04-05 12:58:03 +00003605 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3606 // but we also want to include the TTI::CastContextHint::Masked case.
3607 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
Sander de Smalenc4366492024-06-25 13:27:06 +01003608 CCH == TTI::CastContextHint::Masked &&
3609 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
David Sherwoodafc2b7d2023-04-05 12:58:03 +00003610 CCH = TTI::CastContextHint::Normal;
3611
David Green60280e92020-07-29 13:32:53 +01003612 return AdjustCost(
3613 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
Tim Northover3b0846e2014-05-24 12:50:23 +00003614}
3615
David Greend20604e2025-04-22 15:09:43 +01003616InstructionCost
3617AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3618 VectorType *VecTy, unsigned Index,
3619 TTI::TargetCostKind CostKind) const {
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003620
3621 // Make sure we were given a valid extend opcode.
Matthew Simpson47bd3992016-04-27 16:25:04 +00003622 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3623 "Invalid opcode");
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003624
3625 // We are extending an element we extract from a vector, so the source type
3626 // of the extend is the element type of the vector.
3627 auto *Src = VecTy->getElementType();
3628
3629 // Sign- and zero-extends are for integer types only.
3630 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3631
3632 // Get the cost for the extract. We compute the cost (if any) for the extend
3633 // below.
Alexey Bataev9b5f6262022-12-21 13:38:38 -08003634 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
ShihPo Hung5fb3a572023-01-21 05:29:05 -08003635 CostKind, Index, nullptr, nullptr);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003636
3637 // Legalize the types.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03003638 auto VecLT = getTypeLegalizationCost(VecTy);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003639 auto DstVT = TLI->getValueType(DL, Dst);
3640 auto SrcVT = TLI->getValueType(DL, Src);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003641
3642 // If the resulting type is still a vector and the destination type is legal,
3643 // we may get the extension for free. If not, get the default cost for the
3644 // extend.
3645 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
David Green60280e92020-07-29 13:32:53 +01003646 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3647 CostKind);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003648
3649 // The destination type should be larger than the element type. If not, get
3650 // the default cost for the extend.
David Sherwoodd67d8f82020-10-09 12:03:20 +01003651 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
David Green60280e92020-07-29 13:32:53 +01003652 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3653 CostKind);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003654
3655 switch (Opcode) {
3656 default:
3657 llvm_unreachable("Opcode should be either SExt or ZExt");
3658
3659 // For sign-extends, we only need a smov, which performs the extension
3660 // automatically.
3661 case Instruction::SExt:
3662 return Cost;
3663
3664 // For zero-extends, the extend is performed automatically by a umov unless
3665 // the destination type is i64 and the element type is i8 or i16.
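  // For example, zero-extending an extracted i8 or i16 element to i32 is
  // treated as free, while extending the same element to i64 falls through
  // to the default extend cost below.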
3666 case Instruction::ZExt:
3667 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3668 return Cost;
3669 }
3670
3671 // If we are unable to perform the extend for free, get the default cost.
David Green60280e92020-07-29 13:32:53 +01003672 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3673 CostKind);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003674}
3675
Sander de Smalen14b934f2021-01-26 16:32:30 +00003676InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3677 TTI::TargetCostKind CostKind,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003678 const Instruction *I) const {
Florian Hahn1ccc4992020-06-30 10:39:23 +01003679 if (CostKind != TTI::TCK_RecipThroughput)
3680 return Opcode == Instruction::PHI ? 0 : 1;
Florian Hahnc30da982020-07-01 18:20:01 +01003681 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
Florian Hahn1ccc4992020-06-30 10:39:23 +01003682 // Branches are assumed to be predicted.
Florian Hahnc30da982020-07-01 18:20:01 +01003683 return 0;
Florian Hahn1ccc4992020-06-30 10:39:23 +01003684}
3685
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303686InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
David Greenc6406c82025-03-27 17:25:02 +00003687 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3688 bool HasRealUse, const Instruction *I, Value *Scalar,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003689 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
Tim Northover3b0846e2014-05-24 12:50:23 +00003690 assert(Val->isVectorTy() && "This must be a vector type");
3691
3692 if (Index != -1U) {
3693 // Legalize the type.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03003694 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
Tim Northover3b0846e2014-05-24 12:50:23 +00003695
3696 // This type is legalized to a scalar type.
3697 if (!LT.second.isVector())
3698 return 0;
3699
David Sherwoodef1ca4d2022-01-12 09:51:34 +00003700 // The type may be split. For fixed-width vectors we can normalize the
3701 // index to the new type.
3702 if (LT.second.isFixedLengthVector()) {
3703 unsigned Width = LT.second.getVectorNumElements();
3704 Index = Index % Width;
3705 }
Tim Northover3b0846e2014-05-24 12:50:23 +00003706
3707 // The element at index zero is already inside the vector.
Mingming Liu8aa80062022-06-21 13:38:30 -07003708 // - For a physical (HasRealUse==true) insert-element or extract-element
3709 // instruction that extracts integers, an explicit FPR -> GPR move is
3710 // needed. So it has non-zero cost.
3711 // - For the rest of cases (virtual instruction or element type is float),
3712 // consider the instruction free.
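    // For example, extracting lane 0 of a <4 x i32> into a general-purpose
    // register still requires an FPR -> GPR copy (e.g. fmov), so a real
    // integer extract at index 0 is not free.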
Sjoerd Meijer079c4882023-02-09 16:07:17 +00003713 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3714 return 0;
3715
3716 // This is recognising a LD1 single-element structure to one lane of one
3717 // register instruction. I.e., if this is an `insertelement` instruction,
3718 // and its second operand is a load, then we will generate a LD1, which
3719 // is an expensive instruction.
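    // For example, inserting a loaded i32 into lane 1 of a <4 x i32> is
    // typically selected as "ld1 { v0.s }[1], [x0]".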
3720 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
David Greenc6406c82025-03-27 17:25:02 +00003721 return CostKind == TTI::TCK_CodeSize
3722 ? 0
3723 : ST->getVectorInsertExtractBaseCost() + 1;
Sjoerd Meijer079c4882023-02-09 16:07:17 +00003724
David Greeneb764a72023-06-01 10:54:53 +01003725 // i1 inserts and extract will include an extra cset or cmp of the vector
3726 // value. Increase the cost by 1 to account.
3727 if (Val->getScalarSizeInBits() == 1)
David Greenc6406c82025-03-27 17:25:02 +00003728 return CostKind == TTI::TCK_CodeSize
3729 ? 2
3730 : ST->getVectorInsertExtractBaseCost() + 1;
David Greeneb764a72023-06-01 10:54:53 +01003731
Mingming Liu8aa80062022-06-21 13:38:30 -07003732 // FIXME:
3733 // If the extract-element and insert-element instructions could be
3734 // simplified away (e.g., could be combined into users by looking at use-def
3735 // context), they have no cost. This is not done in the first place for
3736 // compile-time considerations.
Tim Northover3b0846e2014-05-24 12:50:23 +00003737 }
3738
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303739 // In case of Neon, if there exists extractelement from lane != 0 such that
3740 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3741 // 2. extractelement result feeds into fmul.
3742 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3743 // equivalent to 0.
3744 // then the extractelement can be merged with fmul in the backend and it
3745 // incurs no cost.
3746 // e.g.
3747 // define double @foo(<2 x double> %a) {
3748 // %1 = extractelement <2 x double> %a, i32 0
3749 // %2 = extractelement <2 x double> %a, i32 1
3750 // %res = fmul double %1, %2
3751 // ret double %res
3752 // }
3753 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3754 auto ExtractCanFuseWithFmul = [&]() {
3755 // We bail out if the extract is from lane 0.
3756 if (Index == 0)
3757 return false;
3758
3759 // Check if the scalar element type of the vector operand of ExtractElement
3760 // instruction is one of the allowed types.
3761 auto IsAllowedScalarTy = [&](const Type *T) {
3762 return T->isFloatTy() || T->isDoubleTy() ||
3763 (T->isHalfTy() && ST->hasFullFP16());
3764 };
3765
3766 // Check if the extractelement user is scalar fmul.
3767 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3768 // Check if the user is scalar fmul.
David Greend106a392024-11-29 01:11:39 +00003769 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303770 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3771 !BO->getType()->isVectorTy();
3772 };
3773
3774 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3775 // certain scalar type and a certain vector register width.
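    // For example, with 128-bit vector registers and 64-bit elements, lane 2
    // of a <4 x double> starts at bit 128 and so maps to lane 0 of the second
    // 128-bit register, making it equivalent to lane 0 here.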
David Greend106a392024-11-29 01:11:39 +00003776 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303777 auto RegWidth =
3778 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3779 .getFixedValue();
David Greend714b222024-11-29 04:01:03 +00003780 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303781 };
3782
3783 // Check if the type constraints on input vector type and result scalar type
3784 // of extractelement instruction are satisfied.
3785 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3786 return false;
3787
3788 if (Scalar) {
3789 DenseMap<User *, unsigned> UserToExtractIdx;
3790 for (auto *U : Scalar->users()) {
3791 if (!IsUserFMulScalarTy(U))
3792 return false;
3793 // Recording entry for the user is important. Index value is not
3794 // important.
3795 UserToExtractIdx[U];
3796 }
David Greend106a392024-11-29 01:11:39 +00003797 if (UserToExtractIdx.empty())
3798 return false;
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303799 for (auto &[S, U, L] : ScalarUserAndIdx) {
3800 for (auto *U : S->users()) {
Kazu Hirata2d287f52025-05-03 21:55:36 -07003801 if (UserToExtractIdx.contains(U)) {
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303802 auto *FMul = cast<BinaryOperator>(U);
3803 auto *Op0 = FMul->getOperand(0);
3804 auto *Op1 = FMul->getOperand(1);
David Greend106a392024-11-29 01:11:39 +00003805 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303806 UserToExtractIdx[U] = L;
3807 break;
3808 }
3809 }
3810 }
3811 }
3812 for (auto &[U, L] : UserToExtractIdx) {
3813 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3814 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3815 return false;
3816 }
3817 } else {
3818 const auto *EE = cast<ExtractElementInst>(I);
3819
3820 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3821 if (!IdxOp)
3822 return false;
3823
3824 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3825 if (!IsUserFMulScalarTy(U))
3826 return false;
3827
3828 // Check if the other operand of extractelement is also extractelement
3829 // from lane equivalent to 0.
3830 const auto *BO = cast<BinaryOperator>(U);
3831 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3832 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3833 if (OtherEE) {
3834 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3835 if (!IdxOp)
3836 return false;
3837 return IsExtractLaneEquivalentToZero(
3838 cast<ConstantInt>(OtherEE->getIndexOperand())
3839 ->getValue()
3840 .getZExtValue(),
3841 OtherEE->getType()->getScalarSizeInBits());
3842 }
3843 return true;
3844 });
3845 }
3846 return true;
3847 };
3848
3849 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3850 ExtractCanFuseWithFmul())
3851 return 0;
3852
Tim Northover3b0846e2014-05-24 12:50:23 +00003853 // All other insert/extracts cost this much.
David Greenc6406c82025-03-27 17:25:02 +00003854 return CostKind == TTI::TCK_CodeSize ? 1
3855 : ST->getVectorInsertExtractBaseCost();
Tim Northover3b0846e2014-05-24 12:50:23 +00003856}
3857
Mingming Liu8aa80062022-06-21 13:38:30 -07003858InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
ShihPo Hung5fb3a572023-01-21 05:29:05 -08003859 TTI::TargetCostKind CostKind,
David Greenabd2c072025-05-01 15:55:08 +01003860 unsigned Index,
3861 const Value *Op0,
3862 const Value *Op1) const {
Alexey Bataev8cf02902023-04-14 09:35:03 -07003863 bool HasRealUse =
3864 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
David Greenc6406c82025-03-27 17:25:02 +00003865 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303866}
3867
3868InstructionCost AArch64TTIImpl::getVectorInstrCost(
3869 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3870 Value *Scalar,
Sergei Barannikov0014b492025-04-22 06:27:29 +03003871 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
David Greenc6406c82025-03-27 17:25:02 +00003872 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
3873 Scalar, ScalarUserAndIdx);
Mingming Liu8aa80062022-06-21 13:38:30 -07003874}
3875
3876InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
ShihPo Hung5fb3a572023-01-21 05:29:05 -08003877 Type *Val,
3878 TTI::TargetCostKind CostKind,
Sergei Barannikov0014b492025-04-22 06:27:29 +03003879 unsigned Index) const {
David Greenc6406c82025-03-27 17:25:02 +00003880 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303881 true /* HasRealUse */, &I);
Mingming Liu8aa80062022-06-21 13:38:30 -07003882}
3883
David Green2a859b22023-07-28 21:26:50 +01003884InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3885 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
Jonas Paulssonf5c8c1e2025-04-30 17:11:27 +02003886 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
3887 ArrayRef<Value *> VL) const {
David Green2a859b22023-07-28 21:26:50 +01003888 if (isa<ScalableVectorType>(Ty))
3889 return InstructionCost::getInvalid();
3890 if (Ty->getElementType()->isFloatingPointTy())
3891 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3892 CostKind);
David Green052225d2025-04-11 20:18:26 +01003893 unsigned VecInstCost =
3894 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
3895 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
David Green2a859b22023-07-28 21:26:50 +01003896}
3897
Sander de Smalen4f42d872021-04-14 16:53:01 +01003898InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
Sam Parker40574fe2020-04-28 14:11:27 +01003899 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
Philip Reames104fa362022-08-20 08:07:28 -07003900 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003901 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
Philip Reames478cf942022-08-22 12:03:36 -07003902
David Green0b745a12024-08-09 14:25:07 +01003903 // The code-generator is currently not able to handle scalable vectors
3904 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3905 // it. This change will be removed when code-generation for these types is
3906 // sufficiently reliable.
3907 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3908 if (VTy->getElementCount() == ElementCount::getScalable(1))
3909 return InstructionCost::getInvalid();
3910
Sam Parkerfa8bff02020-06-05 08:42:03 +01003911 // TODO: Handle more cost kinds.
3912 if (CostKind != TTI::TCK_RecipThroughput)
Philip Reames104fa362022-08-20 08:07:28 -07003913 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3914 Op2Info, Args, CxtI);
Sam Parkerfa8bff02020-06-05 08:42:03 +01003915
Tim Northover3b0846e2014-05-24 12:50:23 +00003916 // Legalize the type.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03003917 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
Tim Northover3b0846e2014-05-24 12:50:23 +00003918 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3919
3920 switch (ISD) {
3921 default:
Philip Reames104fa362022-08-20 08:07:28 -07003922 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3923 Op2Info);
Sushant Gokhalec4808742025-03-09 22:26:39 -07003924 case ISD::SREM:
Evandro Menezesf9bd8712018-03-07 22:35:32 +00003925 case ISD::SDIV:
Sushant Gokhalec4808742025-03-09 22:26:39 -07003926 /*
3927 Notes for sdiv/srem specific costs:
3928 1. This only considers the cases where the divisor is constant and uniform
3929 (pow-of-2 or non-pow-of-2). Other cases are not important since they either
3930 result in some form of (ldr + adrp), corresponding to constant vectors, or
3931 scalarization of the division operation.
3932 2. Constant divisors, whether wholly or partially negative, don't result in
3933 significantly different codegen compared to positive constant divisors.
3934 So, we don't consider negative divisors separately.
3935 3. If the codegen is significantly different with SVE, it has been indicated
3936 using comments at appropriate places.
3937
3938 sdiv specific cases:
3939 -----------------------------------------------------------------------
3940 codegen | pow-of-2 | Type
3941 -----------------------------------------------------------------------
3942 add + cmp + csel + asr | Y | i64
3943 add + cmp + csel + asr | Y | i32
3944 -----------------------------------------------------------------------
3945
3946 srem specific cases:
3947 -----------------------------------------------------------------------
3948 codegen | pow-of-2 | Type
3949 -----------------------------------------------------------------------
3950 negs + and + and + csneg | Y | i64
3951 negs + and + and + csneg | Y | i32
3952 -----------------------------------------------------------------------
3953
3954 other sdiv/srem cases:
3955 -------------------------------------------------------------------------
3956 common codegen | + srem | + sdiv | pow-of-2 | Type
3957 -------------------------------------------------------------------------
3958 smulh + asr + add + add | - | - | N | i64
3959 smull + lsr + add + add | - | - | N | i32
3960 usra | and + sub | sshr | Y | <2 x i64>
3961 2 * (scalar code) | - | - | N | <2 x i64>
3962 usra | bic + sub | sshr + neg | Y | <4 x i32>
3963 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
3964 + sshr + usra | | | |
3965 -------------------------------------------------------------------------
3966 */
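    // For example, an i32 sdiv by a uniform power-of-2 constant is costed
    // below as 3 * AddCost + AsrCost (the add+cmp+csel+asr sequence above),
    // while a non-power-of-2 uniform constant divisor is costed as
    // MulCost + AsrCost + 2 * AddCost (the smull+lsr+add+add sequence).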
3967 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3968 InstructionCost AddCost =
3969 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3970 Op1Info.getNoProps(), Op2Info.getNoProps());
3971 InstructionCost AsrCost =
3972 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3973 Op1Info.getNoProps(), Op2Info.getNoProps());
3974 InstructionCost MulCost =
3975 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3976 Op1Info.getNoProps(), Op2Info.getNoProps());
3977 // add/cmp/csel/csneg are assumed to have similar costs, as are
3978 // asr/negs/and.
3979 auto VT = TLI->getValueType(DL, Ty);
David Green9c6eca22025-03-29 19:25:17 +00003980 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
Sushant Gokhalec4808742025-03-09 22:26:39 -07003981 if (Op2Info.isPowerOf2()) {
3982 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
3983 : (3 * AsrCost + AddCost);
3984 } else {
3985 return MulCost + AsrCost + 2 * AddCost;
3986 }
3987 } else if (VT.isVector()) {
3988 InstructionCost UsraCost = 2 * AsrCost;
3989 if (Op2Info.isPowerOf2()) {
3990 // Division with scalable types corresponds to native 'asrd'
3991 // instruction when SVE is available.
3992 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
3993 if (Ty->isScalableTy() && ST->hasSVE())
3994 return 2 * AsrCost;
3995 return UsraCost +
3996 (ISD == ISD::SDIV
3997 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
3998 AsrCost
3999 : 2 * AddCost);
4000 } else if (LT.second == MVT::v2i64) {
4001 return VT.getVectorNumElements() *
4002 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4003 Op1Info.getNoProps(),
4004 Op2Info.getNoProps());
4005 } else {
4006 // When SVE is available, we get:
4007 // smulh + lsr + add/sub + asr + add/sub.
4008 if (Ty->isScalableTy() && ST->hasSVE())
4009 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4010 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4011 }
4012 }
4013 }
4014 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4015 LT.second.isFixedLengthVector()) {
4016 // FIXME: When the constant vector is non-uniform, this may result in
4017 // loading the vector from the constant pool or, in some cases, in
4018 // scalarization. For now, we approximate this with the scalarization
4019 // cost.
4020 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4021 CostKind, -1, nullptr, nullptr);
4022 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4023 CostKind, -1, nullptr, nullptr);
4024 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4025 return ExtractCost + InsertCost +
4026 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4027 CostKind, Op1Info.getNoProps(),
4028 Op2Info.getNoProps());
Evandro Menezesf9bd8712018-03-07 22:35:32 +00004029 }
Fangrui Songde9d80c2022-08-08 11:24:15 -07004030 [[fallthrough]];
David Greena5d8b7a2025-02-26 13:49:48 +00004031 case ISD::UDIV:
4032 case ISD::UREM: {
Jon Roelofsbded3b32024-09-05 07:42:23 -07004033 auto VT = TLI->getValueType(DL, Ty);
David Greena5d8b7a2025-02-26 13:49:48 +00004034 if (Op2Info.isConstant()) {
4035 // If the operand is a power of 2 we can use the shift or and cost.
4036 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4037 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4038 Op1Info.getNoProps(),
4039 Op2Info.getNoProps());
4040 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4041 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4042 Op1Info.getNoProps(),
4043 Op2Info.getNoProps());
4044
4045 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4046 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4047 // The MULHU will be expanded to UMULL for the types not listed below,
4048 // and will become a pair of UMULL+MULL2 for 128bit vectors.
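        // For example, a v4i32 udiv by a non-power-of-2 uniform constant is
        // costed as 2 * MulCost (umull + umull2) + 2 * AddCost + 2 * ShrCost,
        // with an extra MulCost + AddCost for urem.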
4049 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4050 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4051 LT.second == MVT::nxv16i8;
4052 bool Is128bit = LT.second.is128BitVector();
4053
4054 InstructionCost MulCost =
4055 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4056 Op1Info.getNoProps(), Op2Info.getNoProps());
4057 InstructionCost AddCost =
4058 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4059 Op1Info.getNoProps(), Op2Info.getNoProps());
4060 InstructionCost ShrCost =
4061 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4062 Op1Info.getNoProps(), Op2Info.getNoProps());
4063 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4064 (HasMULH ? 0 : ShrCost) + // UMULL shift
4065 AddCost * 2 + ShrCost;
4066 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4067 }
Adhemerval Zanellaf384bc72018-05-09 12:48:22 +00004068 }
4069
Jon Roelofsbded3b32024-09-05 07:42:23 -07004070 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4071 // emitted by the backend even when those functions are not declared in the
4072 // module.
4073 if (!VT.isVector() && VT.getSizeInBits() > 64)
4074 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4075
David Green3c88ff42022-04-03 22:16:39 +01004076 InstructionCost Cost = BaseT::getArithmeticInstrCost(
Philip Reames104fa362022-08-20 08:07:28 -07004077 Opcode, Ty, CostKind, Op1Info, Op2Info);
David Greena5d8b7a2025-02-26 13:49:48 +00004078 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004079 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
Hassnaa Hamdi181f2002022-09-23 11:51:19 +00004080 // SDIV/UDIV operations are lowered using SVE, so the costs are
4081 // lower.
Guillaume Chatelet8fd55582023-01-11 16:48:35 +00004082 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
4083 ->getPrimitiveSizeInBits()
4084 .getFixedValue() < 128) {
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004085 EVT VT = TLI->getValueType(DL, Ty);
4086 static const CostTblEntry DivTbl[]{
4087 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4088 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4089 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4090 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4091 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4092 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4093
4094 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4095 if (nullptr != Entry)
4096 return Entry->Cost;
4097 }
4098 // For 8/16-bit elements, the cost is higher because the type
4099 // requires promotion and possibly splitting:
4100 if (LT.second.getScalarType() == MVT::i8)
4101 Cost *= 8;
4102 else if (LT.second.getScalarType() == MVT::i16)
4103 Cost *= 4;
4104 return Cost;
4105 } else {
Zain Jaffal6e4cea52022-11-28 10:37:31 +02004106 // If one of the operands is a uniform constant then the cost for each
4107 // element is Cost for insertion, extraction and division.
4108 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4109 // operation with scalar type
4110 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4111 (Op2Info.isConstant() && Op2Info.isUniform())) {
4112 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4113 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4114 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4115 return (4 + DivCost) * VTy->getNumElements();
4116 }
4117 }
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004118 // On AArch64, without SVE, vector divisions are expanded
4119 // into scalar divisions of each pair of elements.
David Greenc51b24c2025-04-02 14:51:22 +01004120 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4121 -1, nullptr, nullptr);
4122 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4123 nullptr, nullptr);
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004124 }
4125
Evandro Menezesf9bd8712018-03-07 22:35:32 +00004126 // TODO: if one of the arguments is scalar, then it's not necessary to
4127 // double the cost of handling the vector elements.
4128 Cost += Cost;
4129 }
4130 return Cost;
David Green3c88ff42022-04-03 22:16:39 +01004131 }
Tim Northover3b0846e2014-05-24 12:50:23 +00004132 case ISD::MUL:
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004133 // When SVE is available, we can lower the v2i64 operation using
4134 // the SVE mul instruction, which has a lower cost.
4135 if (LT.second == MVT::v2i64 && ST->hasSVE())
4136 return LT.first;
4137
4138 // When SVE is not available, there is no MUL.2d instruction,
4139 // which means mul <2 x i64> is expensive as elements are extracted
4140 // from the vectors and the muls scalarized.
4141 // As getScalarizationOverhead is a bit too pessimistic, we
4142 // estimate the cost for a i64 vector directly here, which is:
David Green750bf352022-04-04 17:42:20 +01004143 // - four 2-cost i64 extracts,
4144 // - two 2-cost i64 inserts, and
4145 // - two 1-cost muls.
4146 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4147 // LT.first = 2 the cost is 28. If both operands are extensions it will not
David Green2abaa022022-04-04 12:45:04 +01004148 // need to scalarize so the cost can be cheaper (smull or umull).
David Green1712ae62023-07-12 13:13:06 +01004150 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
David Green2abaa022022-04-04 12:45:04 +01004151 return LT.first;
David Green27a2d3d2025-01-20 11:43:57 +00004152 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4153 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4154 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4155 nullptr, nullptr) *
4156 2 +
4157 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4158 nullptr, nullptr));
Sjoerd Meijer5110ff02020-11-30 11:16:10 +00004159 case ISD::ADD:
Tim Northover3b0846e2014-05-24 12:50:23 +00004160 case ISD::XOR:
4161 case ISD::OR:
4162 case ISD::AND:
David Green65c0e452022-03-03 10:42:57 +00004163 case ISD::SRL:
4164 case ISD::SRA:
4165 case ISD::SHL:
Tim Northover3b0846e2014-05-24 12:50:23 +00004166 // These nodes are marked as 'custom' for combining purposes only.
4167 // We know that they are legal. See LowerAdd in ISelLowering.
David Green3c88ff42022-04-03 22:16:39 +01004168 return LT.first;
Paul Walker3a98d5d2020-06-20 20:23:31 +01004169
Sjoerd Meijerd8278652023-04-11 12:40:14 +01004170 case ISD::FNEG:
David Greenc61d5652024-08-21 18:10:16 +01004171 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
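    // For example, "%m = fmul double %a, %b" followed by "%n = fneg double %m"
    // can be selected as a single fnmul, so the fneg is modelled as free.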
4172 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4173 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4174 CxtI &&
4175 ((CxtI->hasOneUse() &&
4176 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4177 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4178 return 0;
4179 [[fallthrough]];
Paul Walker3a98d5d2020-06-20 20:23:31 +01004180 case ISD::FADD:
David Sherwoodd581d942021-08-31 14:07:50 +01004181 case ISD::FSUB:
Sjoerd Meijerd8278652023-04-11 12:40:14 +01004182 // Increase the cost for half and bfloat types if not architecturally
4183 // supported.
4184 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
4185 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
4186 return 2 * LT.first;
4187 if (!Ty->getScalarType()->isFP128Ty())
4188 return LT.first;
Craig Topper6006d432023-05-24 12:15:23 -07004189 [[fallthrough]];
David Sherwoodd581d942021-08-31 14:07:50 +01004190 case ISD::FMUL:
4191 case ISD::FDIV:
Paul Walker3a98d5d2020-06-20 20:23:31 +01004192 // These nodes are marked as 'custom' just to lower them to SVE.
4193 // We know said lowering will incur no additional cost.
David Sherwoodd581d942021-08-31 14:07:50 +01004194 if (!Ty->getScalarType()->isFP128Ty())
David Green3c88ff42022-04-03 22:16:39 +01004195 return 2 * LT.first;
Paul Walker3a98d5d2020-06-20 20:23:31 +01004196
Philip Reames104fa362022-08-20 08:07:28 -07004197 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4198 Op2Info);
Paschalis Mpeisbbdc62e2024-02-23 09:29:45 +00004199 case ISD::FREM:
4200 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4201 // those functions are not declared in the module.
4202 if (!Ty->isVectorTy())
4203 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4204 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4205 Op2Info);
Tim Northover3b0846e2014-05-24 12:50:23 +00004206 }
4207}
4208
Sergei Barannikov0014b492025-04-22 06:27:29 +03004209InstructionCost
4210AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
4211 const SCEV *Ptr) const {
Tim Northover3b0846e2014-05-24 12:50:23 +00004212 // Address computations in vectorized code with non-consecutive addresses will
4213 // likely result in more instructions compared to scalar code where the
4214 // computation can more often be merged into the index mode. The resulting
4215 // extra micro-ops can significantly decrease throughput.
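  // For example, a vectorized access whose pointer does not have a small
  // constant stride is charged NeonNonConstStrideOverhead below; all other
  // address computations are costed as 1.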
zhongyundedf19d872023-06-07 21:50:54 +08004216 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
Mohammed Agabaria23599ba2017-01-05 14:03:41 +00004217 int MaxMergeDistance = 64;
Tim Northover3b0846e2014-05-24 12:50:23 +00004218
Fangrui Songf78650a2018-07-30 19:41:25 +00004219 if (Ty->isVectorTy() && SE &&
Mohammed Agabaria23599ba2017-01-05 14:03:41 +00004220 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
Tim Northover3b0846e2014-05-24 12:50:23 +00004221 return NumVectorInstToHideOverhead;
4222
4223 // In many cases the address computation is not merged into the instruction
4224 // addressing mode.
4225 return 1;
4226}
4227
Philip Reamesd2885742024-09-25 07:25:57 -07004228InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4229 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4230 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004231 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
Sam Parker37289612020-05-26 14:28:34 +01004232 // TODO: Handle other cost kinds.
4233 if (CostKind != TTI::TCK_RecipThroughput)
Florian Hahnb3b993a2020-11-02 12:40:34 +00004234 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
Philip Reamesd2885742024-09-25 07:25:57 -07004235 Op1Info, Op2Info, I);
Tim Northover3b0846e2014-05-24 12:50:23 +00004236
4237 int ISD = TLI->InstructionOpcodeToISD(Opcode);
Silviu Barangaa3e27ed2015-09-09 15:35:02 +00004238 // We don't lower some vector selects well that are wider than the register
4239 // width.
David Sherwood2e080eb2021-01-19 15:38:03 +00004240 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
Tim Northover3b0846e2014-05-24 12:50:23 +00004241 // We would need this many instructions to hide the scalarization happening.
Chandler Carruth93205eb2015-08-05 18:08:10 +00004242 const int AmortizationCost = 20;
Florian Hahnb3b993a2020-11-02 12:40:34 +00004243
4244 // If VecPred is not set, check if we can get a predicate from the context
4245 // instruction, if its type matches the requested ValTy.
4246 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
Ramkumar Ramachandra4a0d53a2024-12-13 14:18:33 +00004247 CmpPredicate CurrentPred;
Florian Hahnb3b993a2020-11-02 12:40:34 +00004248 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4249 m_Value())))
4250 VecPred = CurrentPred;
4251 }
Florian Hahn17ebd682022-01-31 10:18:28 +00004252 // Check if we have a compare/select chain that can be lowered using
4253 // a (F)CMxx & BFI pair.
4254 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4255 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4256 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4257 VecPred == CmpInst::FCMP_UNE) {
4258 static const auto ValidMinMaxTys = {
4259 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4260 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4261 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4262
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004263 auto LT = getTypeLegalizationCost(ValTy);
Florian Hahn17ebd682022-01-31 10:18:28 +00004264 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4265 (ST->hasFullFP16() &&
4266 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
Florian Hahnb3b993a2020-11-02 12:40:34 +00004267 return LT.first;
4268 }
4269
Craig Topper4b275762015-10-28 04:02:12 +00004270 static const TypeConversionCostTblEntry
Tim Northover3b0846e2014-05-24 12:50:23 +00004271 VectorSelectTbl[] = {
Zhongyundecb353dc2023-06-20 13:12:02 +08004272 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
4273 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
4274 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
4275 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
4276 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
Silviu Barangaa3e27ed2015-09-09 15:35:02 +00004277 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
4278 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
4279 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
Tim Northover3b0846e2014-05-24 12:50:23 +00004280 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
4281 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
4282 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
4283 };
4284
Mehdi Amini44ede332015-07-09 02:09:04 +00004285 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4286 EVT SelValTy = TLI->getValueType(DL, ValTy);
Tim Northover3b0846e2014-05-24 12:50:23 +00004287 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
Craig Topperee0c8592015-10-27 04:14:24 +00004288 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
4289 SelCondTy.getSimpleVT(),
4290 SelValTy.getSimpleVT()))
4291 return Entry->Cost;
Tim Northover3b0846e2014-05-24 12:50:23 +00004292 }
4293 }
Craig Topper9ad93802023-05-13 23:33:00 -07004294
David Green1ba9ec02023-05-14 23:28:11 +01004295 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
Paul Walkera095ebc2025-04-22 11:20:17 +01004296 Type *ValScalarTy = ValTy->getScalarType();
4297 if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) ||
4298 ValScalarTy->isBFloatTy()) {
4299 auto *ValVTy = cast<FixedVectorType>(ValTy);
4300
Paul Walkera095ebc2025-04-22 11:20:17 +01004301 // Without dedicated instructions we promote [b]f16 compares to f32.
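      // For example, without +fullfp16 an <8 x half> fcmp is costed as two
      // fpexts to <8 x float>, an f32 compare, and a truncate of the integer
      // result from <8 x i32> back to <8 x i16>.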
4302 auto *PromotedTy =
4303 VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy);
4304
4305 InstructionCost Cost = 0;
4306 // Promote operands to float vectors.
4307 Cost += 2 * getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
4308 TTI::CastContextHint::None, CostKind);
4309 // Compare float vectors.
4310 Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind,
4311 Op1Info, Op2Info);
4312 // During codegen we'll truncate the vector result from i32 to i16.
4313 Cost +=
4314 getCastInstrCost(Instruction::Trunc, VectorType::getInteger(ValVTy),
4315 VectorType::getInteger(PromotedTy),
4316 TTI::CastContextHint::None, CostKind);
4317 return Cost;
4318 }
Craig Topper9ad93802023-05-13 23:33:00 -07004319 }
4320
David Green5106b222023-07-01 21:59:54 +01004321 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
4322 // FIXME: This can apply to more conditions and add/sub if it can be shown to
4323 // be profitable.
4324 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
4325 ICmpInst::isEquality(VecPred) &&
4326 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4327 match(I->getOperand(1), m_Zero()) &&
4328 match(I->getOperand(0), m_And(m_Value(), m_Value())))
4329 return 0;
4330
David Sherwood2e080eb2021-01-19 15:38:03 +00004331 // The base case handles scalable vectors fine for now, since it treats the
4332 // cost as 1 * legalization cost.
Philip Reamesd2885742024-09-25 07:25:57 -07004333 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4334 Op1Info, Op2Info, I);
Tim Northover3b0846e2014-05-24 12:50:23 +00004335}
4336
Evandro Menezesa005c1a2019-08-05 18:09:14 +00004337AArch64TTIImpl::TTI::MemCmpExpansionOptions
4338AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4339 TTI::MemCmpExpansionOptions Options;
Eli Friedmane9ac7572020-04-06 15:17:02 -07004340 if (ST->requiresStrictAlign()) {
4341 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4342 // a bunch of instructions when strict align is enabled.
4343 return Options;
4344 }
4345 Options.AllowOverlappingLoads = true;
Evandro Menezesa005c1a2019-08-05 18:09:14 +00004346 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4347 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4348 // TODO: Though vector loads usually perform well on AArch64, in some targets
4349 // they may wake up the FP unit, which raises the power consumption. Perhaps
4350 // they could be used with no holds barred (-O3).
4351 Options.LoadSizes = {8, 4, 2, 1};
Igor Kirillov849f9632023-10-30 18:40:48 +00004352 Options.AllowedTailExpansions = {3, 5, 6};
Evandro Menezesa005c1a2019-08-05 18:09:14 +00004353 return Options;
4354}
4355
Tiehu Zhangb3291562022-06-17 18:24:23 +08004356bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4357 return ST->hasSVE();
4358}
4359
David Sherwooda458b782021-04-16 16:08:38 +01004360InstructionCost
4361AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4362 Align Alignment, unsigned AddressSpace,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004363 TTI::TargetCostKind CostKind) const {
Matthew Devereaue00f22c2021-08-19 11:42:20 +01004364 if (useNeonVector(Src))
David Sherwooda458b782021-04-16 16:08:38 +01004365 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4366 CostKind);
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004367 auto LT = getTypeLegalizationCost(Src);
Kerry McLaughlin5db52752021-06-08 10:49:22 +01004368 if (!LT.first.isValid())
4369 return InstructionCost::getInvalid();
Sander de Smaleneac16702021-07-14 09:43:30 +01004370
David Sherwood2dd41672024-06-25 15:04:24 +01004371 // Return an invalid cost for element types that we are unable to lower.
4372 auto *VT = cast<VectorType>(Src);
4373 if (VT->getElementType()->isIntegerTy(1))
4374 return InstructionCost::getInvalid();
4375
Sander de Smaleneac16702021-07-14 09:43:30 +01004376 // The code-generator is currently not able to handle scalable vectors
4377 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4378 // it. This change will be removed when code-generation for these types is
4379 // sufficiently reliable.
David Sherwood2dd41672024-06-25 15:04:24 +01004380 if (VT->getElementCount() == ElementCount::getScalable(1))
Sander de Smaleneac16702021-07-14 09:43:30 +01004381 return InstructionCost::getInvalid();
4382
liqinweng6efb45f2022-12-09 12:45:42 +08004383 return LT.first;
David Sherwooda458b782021-04-16 16:08:38 +01004384}
4385
Madhur Amilkanthwarb73771c2024-08-14 10:12:40 +05304386 // This function returns the gather/scatter overhead, either from a
4387 // user-provided override or from the specialized per-target value in \p ST.
4388static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4389 const AArch64Subtarget *ST) {
4390 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4391 "Should be called on only load or stores.");
4392 switch (Opcode) {
4393 case Instruction::Load:
4394 if (SVEGatherOverhead.getNumOccurrences() > 0)
4395 return SVEGatherOverhead;
4396 return ST->getGatherOverhead();
4397 break;
4398 case Instruction::Store:
4399 if (SVEScatterOverhead.getNumOccurrences() > 0)
4400 return SVEScatterOverhead;
4401 return ST->getScatterOverhead();
4402 break;
4403 default:
4404 llvm_unreachable("Shouldn't have reached here");
4405 }
David Sherwood8b0448c2021-12-06 11:02:29 +00004406}
4407
Sander de Smalenfd1f8a52021-01-22 21:25:50 +00004408InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
Caroline Concatto060cfd92020-12-17 16:15:28 +00004409 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004410 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
Antonio Frighetto138e6c12023-10-27 17:30:31 +02004411 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
Caroline Concatto01c190e2021-01-07 09:07:06 +00004412 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4413 Alignment, CostKind, I);
Caroline Concatto060cfd92020-12-17 16:15:28 +00004414 auto *VT = cast<VectorType>(DataTy);
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004415 auto LT = getTypeLegalizationCost(DataTy);
Kerry McLaughlin5db52752021-06-08 10:49:22 +01004416 if (!LT.first.isValid())
4417 return InstructionCost::getInvalid();
4418
David Sherwood2dd41672024-06-25 15:04:24 +01004419 // Return an invalid cost for element types that we are unable to lower.
Antonio Frighetto138e6c12023-10-27 17:30:31 +02004420 if (!LT.second.isVector() ||
David Sherwood2dd41672024-06-25 15:04:24 +01004421 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4422 VT->getElementType()->isIntegerTy(1))
Antonio Frighetto138e6c12023-10-27 17:30:31 +02004423 return InstructionCost::getInvalid();
4424
Sander de Smaleneac16702021-07-14 09:43:30 +01004425 // The code-generator is currently not able to handle scalable vectors
4426 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4427 // it. This change will be removed when code-generation for these types is
4428 // sufficiently reliable.
David Sherwood2dd41672024-06-25 15:04:24 +01004429 if (VT->getElementCount() == ElementCount::getScalable(1))
Sander de Smaleneac16702021-07-14 09:43:30 +01004430 return InstructionCost::getInvalid();
4431
Caroline Concatto060cfd92020-12-17 16:15:28 +00004432 ElementCount LegalVF = LT.second.getVectorElementCount();
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004433 InstructionCost MemOpCost =
Alexey Bataevd53e2452022-08-19 05:13:25 -07004434 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
Philip Reamesc9608d52022-08-22 15:26:36 -07004435 {TTI::OK_AnyValue, TTI::OP_None}, I);
David Sherwood8b0448c2021-12-06 11:02:29 +00004436 // Add on an overhead cost for using gathers/scatters.
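// Illustrative: the cost returned below is the scalar memory-op cost scaled
// by the per-subtarget overhead (typically 10) and by the maximum number of
// vector elements, so gathers/scatters are modelled as far more expensive
// than contiguous accesses.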
Madhur Amilkanthwarb73771c2024-08-14 10:12:40 +05304437 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
David Sherwood9448cdc2021-09-22 10:54:05 +01004438 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
Caroline Concatto060cfd92020-12-17 16:15:28 +00004439}
4440
Caroline Concatto37f4ccb2020-11-06 15:53:59 +00004441bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4442 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4443}
4444
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004445InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
Sergei Barannikov3334c352025-04-22 11:40:12 +03004446 Align Alignment,
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004447 unsigned AddressSpace,
4448 TTI::TargetCostKind CostKind,
Philip Reamesc9608d52022-08-22 15:26:36 -07004449 TTI::OperandValueInfo OpInfo,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004450 const Instruction *I) const {
Sjoerd Meijeree752132021-07-01 14:45:54 +01004451 EVT VT = TLI->getValueType(DL, Ty, true);
Sam Parker5b5e78ad2020-06-08 15:25:03 +01004452 // Type legalization can't handle structs
Sjoerd Meijeree752132021-07-01 14:45:54 +01004453 if (VT == MVT::Other)
Sam Parker5b5e78ad2020-06-08 15:25:03 +01004454 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4455 CostKind);
4456
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004457 auto LT = getTypeLegalizationCost(Ty);
Kerry McLaughlin5db52752021-06-08 10:49:22 +01004458 if (!LT.first.isValid())
4459 return InstructionCost::getInvalid();
Tim Northover3b0846e2014-05-24 12:50:23 +00004460
Sander de Smaleneac16702021-07-14 09:43:30 +01004461 // The code-generator is currently not able to handle scalable vectors
4462 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4463 // it. This change will be removed when code-generation for these types is
4464 // sufficiently reliable.
David Sherwood2dd41672024-06-25 15:04:24 +01004465 // We also only support full register predicate loads and stores.
Sander de Smaleneac16702021-07-14 09:43:30 +01004466 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
David Sherwood2dd41672024-06-25 15:04:24 +01004467 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4468 (VTy->getElementType()->isIntegerTy(1) &&
4469 !VTy->getElementCount().isKnownMultipleOf(
4470 ElementCount::getScalable(16))))
Sander de Smaleneac16702021-07-14 09:43:30 +01004471 return InstructionCost::getInvalid();
4472
Florian Hahnacd9cc72021-04-15 09:22:32 +01004473 // TODO: consider latency as well for TCK_SizeAndLatency.
4474 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4475 return LT.first;
4476
4477 if (CostKind != TTI::TCK_RecipThroughput)
4478 return 1;
4479
Matthew Simpson2c8de192016-12-15 18:36:59 +00004480 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
Sergei Barannikov3334c352025-04-22 11:40:12 +03004481 LT.second.is128BitVector() && Alignment < Align(16)) {
Evandro Menezes330e1b82017-01-10 23:42:21 +00004482 // Unaligned stores are extremely inefficient. We don't split all
4483 // unaligned 128-bit stores because of the negative impact that splitting has
4484 // shown in practice on inlined block copy code.
4485 // We make such stores expensive so that we will only vectorize if there
Tim Northover3b0846e2014-05-24 12:50:23 +00004486 // are 6 other instructions getting vectorized.
Evandro Menezes330e1b82017-01-10 23:42:21 +00004487 const int AmortizationCost = 6;
Tim Northover3b0846e2014-05-24 12:50:23 +00004488
4489 return LT.first * 2 * AmortizationCost;
4490 }
4491
Sjoerd Meijer5c94fab2022-12-16 09:20:37 +00004492 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4493 if (Ty->isPtrOrPtrVectorTy())
4494 return LT.first;
4495
Florian Hahne473daa2024-01-17 21:32:06 +00004496 if (useNeonVector(Ty)) {
4497 // Check truncating stores and extending loads.
4498 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4499 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4500 if (VT == MVT::v4i8)
4501 return 2;
4502 // Otherwise we need to scalarize.
4503 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4504 }
4505 EVT EltVT = VT.getVectorElementType();
4506 unsigned EltSize = EltVT.getScalarSizeInBits();
4507 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
Sergei Barannikov3334c352025-04-22 11:40:12 +03004508 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
Florian Hahne473daa2024-01-17 21:32:06 +00004509 return LT.first;
4510 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4511 // widening to v4i8, which produces suboptimal results.
4512 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4513 return LT.first;
4514
4515 // Check non-power-of-2 loads/stores for legal vector element types with
4516 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4517 // operations on smaller power-of-2 ops, including ld1/st1.
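// Illustrative: a v7i16 access is split as v4i16 + v3i16, and the v3i16
// piece again as v2i16 + v1i16, giving a total cost of 3 (one per
// power-of-2 piece).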
4518 LLVMContext &C = Ty->getContext();
4519 InstructionCost Cost(0);
4520 SmallVector<EVT> TypeWorklist;
4521 TypeWorklist.push_back(VT);
4522 while (!TypeWorklist.empty()) {
4523 EVT CurrVT = TypeWorklist.pop_back_val();
4524 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4525 if (isPowerOf2_32(CurrNumElements)) {
4526 Cost += 1;
4527 continue;
4528 }
4529
4530 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4531 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4532 TypeWorklist.push_back(
4533 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4534 }
4535 return Cost;
Tim Northover3b0846e2014-05-24 12:50:23 +00004536 }
4537
4538 return LT.first;
4539}
James Molloy2b8933c2014-08-05 12:30:34 +00004540
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004541InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
Guillaume Chateletfdc7c7f2020-06-26 11:00:53 +00004542 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4543 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004544 bool UseMaskForCond, bool UseMaskForGaps) const {
Hao Liu7ec8ee32015-06-26 02:32:07 +00004545 assert(Factor >= 2 && "Invalid interleave factor");
Graham Hunter95bfb192023-03-21 11:48:49 +00004546 auto *VecVTy = cast<VectorType>(VecTy);
4547
Philip Reamesb3c687b2024-10-15 07:37:46 -07004548 if (VecTy->isScalableTy() && !ST->hasSVE())
Graham Hunter95bfb192023-03-21 11:48:49 +00004549 return InstructionCost::getInvalid();
Hao Liu7ec8ee32015-06-26 02:32:07 +00004550
Igor Kirillov17bde322023-06-12 10:18:16 +00004551 // Vectorization for masked interleaved accesses is only enabled for scalable
4552 // VF.
4553 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4554 return InstructionCost::getInvalid();
4555
4556 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
Graham Hunter95bfb192023-03-21 11:48:49 +00004557 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
Christopher Tetreault616d8d92020-06-16 13:42:58 -07004558 auto *SubVecTy =
Graham Hunter95bfb192023-03-21 11:48:49 +00004559 VectorType::get(VecVTy->getElementType(),
4560 VecVTy->getElementCount().divideCoefficientBy(Factor));
Hao Liu7ec8ee32015-06-26 02:32:07 +00004561
4562 // ldN/stN only support legal vector types of size 64 or 128 in bits.
Matthew Simpsonaee97712017-03-02 15:15:35 +00004563 // Accesses having vector types that are a multiple of 128 bits can be
4564 // matched to more than one ldN/stN instruction.
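// Illustrative: an interleaved group of <16 x i32> with Factor == 4 maps to
// a single ld4/st4 (cost 4), while <32 x i32> with Factor == 4 needs two
// ld4/st4 sequences (cost 8).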
Bradley Smith13faa5f2021-10-18 12:29:26 +00004565 bool UseScalable;
Graham Hunter95bfb192023-03-21 11:48:49 +00004566 if (MinElts % Factor == 0 &&
Bradley Smith13faa5f2021-10-18 12:29:26 +00004567 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4568 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
Hao Liu7ec8ee32015-06-26 02:32:07 +00004569 }
4570
4571 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Sam Parker40574fe2020-04-28 14:11:27 +01004572 Alignment, AddressSpace, CostKind,
Dorit Nuzman34da6dd2018-10-31 09:57:56 +00004573 UseMaskForCond, UseMaskForGaps);
Hao Liu7ec8ee32015-06-26 02:32:07 +00004574}
4575
Daniil Fukalove1cb98b2021-05-20 12:09:16 +03004576InstructionCost
Sergei Barannikov0014b492025-04-22 06:27:29 +03004577AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004578 InstructionCost Cost = 0;
Sam Parker40574fe2020-04-28 14:11:27 +01004579 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
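// Illustrative: each 128-bit vector live across a call is modelled as a
// spill/reload pair, since AAPCS64 only preserves the low 64 bits of the
// callee-saved SIMD registers (v8-v15).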
James Molloy2b8933c2014-08-05 12:30:34 +00004580 for (auto *I : Tys) {
4581 if (!I->isVectorTy())
4582 continue;
Christopher Tetreaultab35ba52020-06-30 11:07:24 -07004583 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4584 128)
Sam Parker40574fe2020-04-28 14:11:27 +01004585 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4586 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
James Molloy2b8933c2014-08-05 12:30:34 +00004587 }
Daniil Fukalove1cb98b2021-05-20 12:09:16 +03004588 return Cost;
James Molloy2b8933c2014-08-05 12:30:34 +00004589}
James Molloya88896b2014-08-21 00:02:51 +00004590
Sergei Barannikov0014b492025-04-22 06:27:29 +03004591unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
Matthias Braun651cff42016-06-02 18:03:53 +00004592 return ST->getMaxInterleaveFactor();
James Molloya88896b2014-08-21 00:02:51 +00004593}
Kevin Qin72a799a2014-10-09 10:13:27 +00004594
Geoff Berry378374d2017-06-28 18:53:09 +00004595// For Falkor, we want to avoid having too many strided loads in a loop since
4596// that can exhaust the HW prefetcher resources. We adjust the unroller
4597// MaxCount preference below to attempt to ensure unrolling doesn't create too
4598// many strided loads.
4599static void
4600getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4601 TargetTransformInfo::UnrollingPreferences &UP) {
Geoff Berry0abd9802017-06-28 19:36:10 +00004602 enum { MaxStridedLoads = 7 };
Geoff Berry378374d2017-06-28 18:53:09 +00004603 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4604 int StridedLoads = 0;
4605 // FIXME? We could make this more precise by looking at the CFG and
4606 // e.g. not counting loads in each side of an if-then-else diamond.
4607 for (const auto BB : L->blocks()) {
4608 for (auto &I : *BB) {
4609 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4610 if (!LMemI)
4611 continue;
4612
4613 Value *PtrValue = LMemI->getPointerOperand();
4614 if (L->isLoopInvariant(PtrValue))
4615 continue;
4616
4617 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4618 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4619 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4620 continue;
4621
4622 // FIXME? We could take pairing of unrolled load copies into account
4623 // by looking at the AddRec, but we would probably have to limit this
4624 // to loops with no stores or other memory optimization barriers.
4625 ++StridedLoads;
4626 // We've seen enough strided loads that seeing more won't make a
4627 // difference.
4628 if (StridedLoads > MaxStridedLoads / 2)
4629 return StridedLoads;
4630 }
4631 }
4632 return StridedLoads;
4633 };
4634
4635 int StridedLoads = countStridedLoads(L, SE);
Nicola Zaghend34e60c2018-05-14 12:53:11 +00004636 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4637 << " strided loads\n");
Geoff Berry378374d2017-06-28 18:53:09 +00004638 // Pick the largest power of 2 unroll count that won't result in too many
4639 // strided loads.
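// Illustrative: with MaxStridedLoads == 7, one detected strided load caps
// MaxCount at 4, two or three cap it at 2, and four cap it at 1.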
4640 if (StridedLoads) {
4641 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
Nicola Zaghend34e60c2018-05-14 12:53:11 +00004642 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4643 << UP.MaxCount << '\n');
Geoff Berry378374d2017-06-28 18:53:09 +00004644 }
4645}
4646
David Sherwood712c2132025-04-09 10:34:27 +01004647// This function returns true if the loop:
4648// 1. Has a valid cost, and
4649// 2. Has a cost within the supplied budget.
4650// Otherwise it returns false.
Sergei Barannikov0014b492025-04-22 06:27:29 +03004651static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
David Sherwood712c2132025-04-09 10:34:27 +01004652 InstructionCost Budget,
4653 unsigned *FinalSize) {
4654 // Estimate the size of the loop.
4655 InstructionCost LoopCost = 0;
4656
4657 for (auto *BB : L->getBlocks()) {
4658 for (auto &I : *BB) {
4659 SmallVector<const Value *, 4> Operands(I.operand_values());
4660 InstructionCost Cost =
4661 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4662 // This can happen with intrinsics that don't currently have a cost model
4663 // or for some operations that require SVE.
4664 if (!Cost.isValid())
4665 return false;
4666
4667 LoopCost += Cost;
4668 if (LoopCost > Budget)
4669 return false;
4670 }
4671 }
4672
4673 if (FinalSize)
David Green98b6f8d2025-04-23 07:46:27 +01004674 *FinalSize = LoopCost.getValue();
David Sherwood712c2132025-04-09 10:34:27 +01004675 return true;
4676}
4677
4678static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004679 const AArch64TTIImpl &TTI) {
David Sherwood712c2132025-04-09 10:34:27 +01004680 // Only consider loops with unknown trip counts for which we can determine
4681 // a symbolic expression. Multi-exit loops with small known trip counts will
4682 // likely be unrolled anyway.
4683 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4684 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
4685 return false;
4686
4687 // It might not be worth unrolling loops with low max trip counts. Restrict
4688 // this to max trip counts > 32 for now.
4689 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4690 if (MaxTC > 0 && MaxTC <= 32)
4691 return false;
4692
4693 // Make sure the loop size is <= 5.
4694 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
4695 return false;
4696
4697 // Small search loops with multiple exits can be highly beneficial to unroll.
4698 // We only care about loops with exactly two exiting blocks, although each
4699 // block could jump to the same exit block.
4700 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4701 if (Blocks.size() != 2)
4702 return false;
4703
4704 if (any_of(Blocks, [](BasicBlock *BB) {
4705 return !isa<BranchInst>(BB->getTerminator());
4706 }))
4707 return false;
4708
4709 return true;
4710}
4711
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004712/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4713/// OOO engine's wide instruction window and various predictors.
4714static void
4715getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4716 TargetTransformInfo::UnrollingPreferences &UP,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004717 const AArch64TTIImpl &TTI) {
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004718 // Limit this to loops with structure that is highly likely to benefit from
Florian Hahn46a13a52025-02-27 14:42:45 +00004719 // runtime unrolling; that is, we exclude outer loops and loops with many
4720 // blocks (i.e. likely with complex control flow). Note that the heuristics
4721 // here may be overly conservative and we err on the side of avoiding runtime
4722 // unrolling rather than unrolling excessively. They are all subject to further refinement.
4723 if (!L->isInnermost() || L->getNumBlocks() > 8)
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004724 return;
4725
David Sherwood712c2132025-04-09 10:34:27 +01004726 // Loops with multiple exits are handled by common code.
4727 if (!L->getExitBlock())
4728 return;
4729
Florian Hahn46a13a52025-02-27 14:42:45 +00004730 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004731 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4732 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4733 SE.getSmallConstantMaxTripCount(L) <= 32))
4734 return;
David Sherwood712c2132025-04-09 10:34:27 +01004735
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004736 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4737 return;
4738
Florian Hahn46a13a52025-02-27 14:42:45 +00004739 if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
4740 return;
4741
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004742 // Limit to loops with trip counts that are cheap to expand.
4743 UP.SCEVExpansionBudget = 1;
4744
4745 // Try to unroll small, single block loops, if they have load/store
4746 // dependencies, to expose more parallel memory access streams.
Florian Hahnd486b762024-12-22 13:10:54 +00004747 BasicBlock *Header = L->getHeader();
4748 if (Header == L->getLoopLatch()) {
David Sherwood712c2132025-04-09 10:34:27 +01004749 // Estimate the size of the loop.
4750 unsigned Size;
4751 if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
Florian Hahnd486b762024-12-22 13:10:54 +00004752 return;
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004753
Florian Hahnd486b762024-12-22 13:10:54 +00004754 SmallPtrSet<Value *, 8> LoadedValues;
4755 SmallVector<StoreInst *> Stores;
4756 for (auto *BB : L->blocks()) {
4757 for (auto &I : *BB) {
4758 Value *Ptr = getLoadStorePointerOperand(&I);
4759 if (!Ptr)
4760 continue;
4761 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4762 if (SE.isLoopInvariant(PtrSCEV, L))
4763 continue;
4764 if (isa<LoadInst>(&I))
4765 LoadedValues.insert(&I);
4766 else
4767 Stores.push_back(cast<StoreInst>(&I));
4768 }
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004769 }
Florian Hahnd486b762024-12-22 13:10:54 +00004770
4771 // Try to find an unroll count that maximizes the use of the instruction
4772 // window, i.e. trying to fetch as many instructions per cycle as possible.
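// Illustrative: for a 6-instruction loop body the search below settles on an
// unroll count of 8 (48 instructions, a multiple of the assumed
// 16-instruction fetch budget); larger bodies get smaller counts so the
// unrolled body stays within 48 instructions.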
4773 unsigned MaxInstsPerLine = 16;
4774 unsigned UC = 1;
4775 unsigned BestUC = 1;
4776 unsigned SizeWithBestUC = BestUC * Size;
4777 while (UC <= 8) {
4778 unsigned SizeWithUC = UC * Size;
4779 if (SizeWithUC > 48)
4780 break;
4781 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4782 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4783 BestUC = UC;
4784 SizeWithBestUC = BestUC * Size;
4785 }
4786 UC++;
4787 }
4788
4789 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4790 return LoadedValues.contains(SI->getOperand(0));
4791 }))
4792 return;
4793
4794 UP.Runtime = true;
4795 UP.DefaultUnrollRuntimeCount = BestUC;
4796 return;
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004797 }
4798
Florian Hahnd486b762024-12-22 13:10:54 +00004799 // Try to runtime-unroll loops with early-continues depending on loop-varying
4800 // loads; this helps with branch-prediction for the early-continues.
4801 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4802 auto *Latch = L->getLoopLatch();
4803 SmallVector<BasicBlock *> Preds(predecessors(Latch));
4804 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4805 none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4806 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004807 return;
4808
Florian Hahnd486b762024-12-22 13:10:54 +00004809 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4810 [&](Instruction *I, unsigned Depth) -> bool {
4811 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4812 return false;
4813
4814 if (isa<LoadInst>(I))
4815 return true;
4816
4817 return any_of(I->operands(), [&](Value *V) {
4818 auto *I = dyn_cast<Instruction>(V);
4819 return I && DependsOnLoopLoad(I, Depth + 1);
4820 });
4821 };
4822 CmpPredicate Pred;
4823 Instruction *I;
4824 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4825 m_Value())) &&
4826 DependsOnLoopLoad(I, 0)) {
4827 UP.Runtime = true;
4828 }
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004829}
4830
Sergei Barannikov0014b492025-04-22 06:27:29 +03004831void AArch64TTIImpl::getUnrollingPreferences(
4832 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
4833 OptimizationRemarkEmitter *ORE) const {
Kevin Qinaef68412015-03-09 06:14:28 +00004834 // Enable partial unrolling and runtime unrolling.
Roman Lebedev6f6e9a82021-08-03 00:57:26 +03004835 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
Kevin Qinaef68412015-03-09 06:14:28 +00004836
Jingu Kang94c495292021-07-14 11:43:29 +01004837 UP.UpperBound = true;
4838
Kevin Qinaef68412015-03-09 06:14:28 +00004839 // For inner loop, it is more likely to be a hot one, and the runtime check
4840 // can be promoted out from LICM pass, so the overhead is less, let's try
4841 // a larger threshold to unroll more loops.
4842 if (L->getLoopDepth() > 1)
4843 UP.PartialThreshold *= 2;
4844
Kevin Qin72a799a2014-10-09 10:13:27 +00004845 // Disable partial & runtime unrolling on -Os.
4846 UP.PartialOptSizeThreshold = 0;
Geoff Berry378374d2017-06-28 18:53:09 +00004847
David Sherwood712c2132025-04-09 10:34:27 +01004848 // Scan the loop: don't unroll loops with calls as this could prevent
4849 // inlining. Don't unroll vector loops either, as they don't benefit much from
4850 // unrolling.
4851 for (auto *BB : L->getBlocks()) {
4852 for (auto &I : *BB) {
4853 // Don't unroll vectorised loops.
4854 if (I.getType()->isVectorTy())
4855 return;
4856
4857 if (isa<CallBase>(I)) {
4858 if (isa<CallInst>(I) || isa<InvokeInst>(I))
4859 if (const Function *F = cast<CallBase>(I).getCalledFunction())
4860 if (!isLoweredToCall(F))
4861 continue;
4862 return;
4863 }
4864 }
4865 }
4866
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004867 // Apply subtarget-specific unrolling preferences.
4868 switch (ST->getProcFamily()) {
4869 case AArch64Subtarget::AppleA14:
4870 case AArch64Subtarget::AppleA15:
4871 case AArch64Subtarget::AppleA16:
4872 case AArch64Subtarget::AppleM4:
4873 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4874 break;
4875 case AArch64Subtarget::Falkor:
4876 if (EnableFalkorHWPFUnrollFix)
4877 getFalkorUnrollingPreferences(L, SE, UP);
4878 break;
4879 default:
4880 break;
4881 }
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004882
David Sherwood712c2132025-04-09 10:34:27 +01004883 // If this is a small, multi-exit loop similar to std::find, then unrolling
4884 // typically improves performance.
4885 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
4886 UP.RuntimeUnrollMultiExit = true;
4887 UP.Runtime = true;
4888 // Limit unroll count.
4889 UP.DefaultUnrollRuntimeCount = 4;
4890 // Allow slightly more costly trip-count expansion to catch search loops
4891 // with pointer inductions.
4892 UP.SCEVExpansionBudget = 5;
4893 return;
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004894 }
4895
4896 // Enable runtime unrolling for in-order models.
4897 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so by
4898 // checking for that case, we can ensure that the default behaviour is
4899 // unchanged.
David Green6424abc2025-02-07 10:16:57 +00004900 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004901 !ST->getSchedModel().isOutOfOrder()) {
4902 UP.Runtime = true;
4903 UP.Partial = true;
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004904 UP.UnrollRemainder = true;
4905 UP.DefaultUnrollRuntimeCount = 4;
Nicholas Guy3043cbc2021-05-26 14:49:58 +01004906
4907 UP.UnrollAndJam = true;
4908 UP.UnrollAndJamInnerLoopThreshold = 60;
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004909 }
Kevin Qin72a799a2014-10-09 10:13:27 +00004910}
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004911
Sidharth Bavejae541e1b2020-07-10 18:38:08 +00004912void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004913 TTI::PeelingPreferences &PP) const {
Sidharth Bavejae541e1b2020-07-10 18:38:08 +00004914 BaseT::getPeelingPreferences(L, SE, PP);
4915}
4916
Sergei Barannikov0014b492025-04-22 06:27:29 +03004917Value *
4918AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
4919 Type *ExpectedType) const {
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004920 switch (Inst->getIntrinsicID()) {
4921 default:
4922 return nullptr;
4923 case Intrinsic::aarch64_neon_st2:
4924 case Intrinsic::aarch64_neon_st3:
4925 case Intrinsic::aarch64_neon_st4: {
4926 // Create a struct type
4927 StructType *ST = dyn_cast<StructType>(ExpectedType);
4928 if (!ST)
4929 return nullptr;
Kazu Hiratac1e32b32021-10-02 12:06:29 -07004930 unsigned NumElts = Inst->arg_size() - 1;
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004931 if (ST->getNumElements() != NumElts)
4932 return nullptr;
4933 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4934 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4935 return nullptr;
4936 }
Manuel Brito1e55d5b2022-11-21 18:41:01 +00004937 Value *Res = PoisonValue::get(ExpectedType);
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004938 IRBuilder<> Builder(Inst);
4939 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4940 Value *L = Inst->getArgOperand(i);
4941 Res = Builder.CreateInsertValue(Res, L, i);
4942 }
4943 return Res;
4944 }
4945 case Intrinsic::aarch64_neon_ld2:
4946 case Intrinsic::aarch64_neon_ld3:
4947 case Intrinsic::aarch64_neon_ld4:
4948 if (Inst->getType() == ExpectedType)
4949 return Inst;
4950 return nullptr;
4951 }
4952}
4953
Chandler Carruth705b1852015-01-31 03:43:40 +00004954bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004955 MemIntrinsicInfo &Info) const {
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004956 switch (Inst->getIntrinsicID()) {
4957 default:
4958 break;
4959 case Intrinsic::aarch64_neon_ld2:
4960 case Intrinsic::aarch64_neon_ld3:
4961 case Intrinsic::aarch64_neon_ld4:
4962 Info.ReadMem = true;
4963 Info.WriteMem = false;
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004964 Info.PtrVal = Inst->getArgOperand(0);
4965 break;
4966 case Intrinsic::aarch64_neon_st2:
4967 case Intrinsic::aarch64_neon_st3:
4968 case Intrinsic::aarch64_neon_st4:
4969 Info.ReadMem = false;
4970 Info.WriteMem = true;
Kazu Hiratac1e32b32021-10-02 12:06:29 -07004971 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004972 break;
4973 }
4974
4975 switch (Inst->getIntrinsicID()) {
4976 default:
4977 return false;
4978 case Intrinsic::aarch64_neon_ld2:
4979 case Intrinsic::aarch64_neon_st2:
4980 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4981 break;
4982 case Intrinsic::aarch64_neon_ld3:
4983 case Intrinsic::aarch64_neon_st3:
4984 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4985 break;
4986 case Intrinsic::aarch64_neon_ld4:
4987 case Intrinsic::aarch64_neon_st4:
4988 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4989 break;
4990 }
4991 return true;
4992}
Adam Nemet53e758f2016-03-18 00:27:29 +00004993
Jun Bum Limdee55652017-04-03 19:20:07 +00004994/// See if \p I should be considered for address type promotion. We check if \p
4995 /// I is a sext with the right type and used in memory accesses. If it is used
4996 /// in a "complex" getelementptr, we allow it to be promoted without finding
4997 /// other sext instructions that sign extended the same initial value. A
4998 /// getelementptr is considered "complex" if it has more than 2 operands.
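/// For example (illustrative IR), the sext feeding the three-operand GEP in
///   %idx = sext i32 %i to i64
///   %gep = getelementptr inbounds [64 x i32], ptr %base, i64 0, i64 %idx
/// may be promoted without a common header, since the extension is expected
/// to fold into the 64-bit address computation.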
4999bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
Sergei Barannikov0014b492025-04-22 06:27:29 +03005000 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
Jun Bum Limdee55652017-04-03 19:20:07 +00005001 bool Considerable = false;
5002 AllowPromotionWithoutCommonHeader = false;
5003 if (!isa<SExtInst>(&I))
5004 return false;
5005 Type *ConsideredSExtType =
5006 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5007 if (I.getType() != ConsideredSExtType)
5008 return false;
5009 // See if the sext is the one with the right type and used in at least one
5010 // GetElementPtrInst.
5011 for (const User *U : I.users()) {
5012 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5013 Considerable = true;
5014 // A getelementptr is considered as "complex" if it has more than 2
5015 // operands. We will promote a SExt used in such complex GEP as we
5016 // expect some computation to be merged if they are done on 64 bits.
5017 if (GEPInst->getNumOperands() > 2) {
5018 AllowPromotionWithoutCommonHeader = true;
5019 break;
5020 }
5021 }
5022 }
5023 return Considerable;
5024}
5025
Simon Pilgrim5e6bfb62021-06-11 10:19:37 +01005026bool AArch64TTIImpl::isLegalToVectorizeReduction(
5027 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
Kerry McLaughlinba1e1502021-02-16 10:43:42 +00005028 if (!VF.isScalable())
5029 return true;
5030
5031 Type *Ty = RdxDesc.getRecurrenceType();
Kerry McLaughlina7512402021-07-06 10:49:43 +01005032 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
Kerry McLaughlinba1e1502021-02-16 10:43:42 +00005033 return false;
5034
5035 switch (RdxDesc.getRecurrenceKind()) {
5036 case RecurKind::Add:
5037 case RecurKind::FAdd:
5038 case RecurKind::And:
5039 case RecurKind::Or:
5040 case RecurKind::Xor:
5041 case RecurKind::SMin:
5042 case RecurKind::SMax:
5043 case RecurKind::UMin:
5044 case RecurKind::UMax:
5045 case RecurKind::FMin:
5046 case RecurKind::FMax:
Rosie Sumpterc2441b62021-10-11 15:50:44 +01005047 case RecurKind::FMulAdd:
Mel Chen425e9e82023-07-19 02:51:15 -07005048 case RecurKind::IAnyOf:
5049 case RecurKind::FAnyOf:
Kerry McLaughlinba1e1502021-02-16 10:43:42 +00005050 return true;
5051 default:
5052 return false;
5053 }
5054}
5055
Sander de Smalen2285dfb2021-01-22 22:07:09 +00005056InstructionCost
David Green12025ce2023-07-04 15:02:30 +01005057AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5058 FastMathFlags FMF,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005059 TTI::TargetCostKind CostKind) const {
David Green0b745a12024-08-09 14:25:07 +01005060 // The code-generator is currently not able to handle scalable vectors
5061 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5062 // it. This change will be removed when code-generation for these types is
5063 // sufficiently reliable.
5064 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5065 if (VTy->getElementCount() == ElementCount::getScalable(1))
5066 return InstructionCost::getInvalid();
5067
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005068 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
David Green649cf452021-08-05 23:23:24 +01005069
5070 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
David Green12025ce2023-07-04 15:02:30 +01005071 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
David Green649cf452021-08-05 23:23:24 +01005072
Sander de Smalendb134e22021-01-22 21:44:23 +00005073 InstructionCost LegalizationCost = 0;
Caroline Concatto172f1f82020-12-21 15:04:29 +00005074 if (LT.first > 1) {
5075 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
David Green12025ce2023-07-04 15:02:30 +01005076 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
David Green649cf452021-08-05 23:23:24 +01005077 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
Caroline Concatto172f1f82020-12-21 15:04:29 +00005078 }
5079
5080 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5081}
5082
Sander de Smalenbd868242021-01-22 21:33:51 +00005083InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005084 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005085 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
Sander de Smalen4f42d872021-04-14 16:53:01 +01005086 InstructionCost LegalizationCost = 0;
Caroline Concatto172f1f82020-12-21 15:04:29 +00005087 if (LT.first > 1) {
5088 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5089 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5090 LegalizationCost *= LT.first - 1;
5091 }
5092
5093 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5094 assert(ISD && "Invalid opcode");
5095 // Add the final reduction cost for the legal horizontal reduction
5096 switch (ISD) {
5097 case ISD::ADD:
5098 case ISD::AND:
5099 case ISD::OR:
5100 case ISD::XOR:
5101 case ISD::FADD:
5102 return LegalizationCost + 2;
5103 default:
Sander de Smalenbd868242021-01-22 21:33:51 +00005104 return InstructionCost::getInvalid();
Caroline Concatto172f1f82020-12-21 15:04:29 +00005105 }
5106}
5107
Sander de Smalenbd868242021-01-22 21:33:51 +00005108InstructionCost
5109AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08005110 std::optional<FastMathFlags> FMF,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005111 TTI::TargetCostKind CostKind) const {
David Green0b745a12024-08-09 14:25:07 +01005112 // The code-generator is currently not able to handle scalable vectors
5113 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5114 // it. This change will be removed when code-generation for these types is
5115 // sufficiently reliable.
5116 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5117 if (VTy->getElementCount() == ElementCount::getScalable(1))
5118 return InstructionCost::getInvalid();
5119
David Sherwood0aff1792021-07-07 13:18:20 +01005120 if (TTI::requiresOrderedReduction(FMF)) {
David Sherwood219d4512021-08-18 09:40:21 +01005121 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5122 InstructionCost BaseCost =
5123 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5124 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5125 // end up vectorizing for more computationally intensive loops.
5126 return BaseCost + FixedVTy->getNumElements();
5127 }
David Sherwood0aff1792021-07-07 13:18:20 +01005128
5129 if (Opcode != Instruction::FAdd)
5130 return InstructionCost::getInvalid();
5131
5132 auto *VTy = cast<ScalableVectorType>(ValTy);
5133 InstructionCost Cost =
5134 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5135 Cost *= getMaxNumElements(VTy->getElementCount());
5136 return Cost;
5137 }
5138
Caroline Concatto172f1f82020-12-21 15:04:29 +00005139 if (isa<ScalableVectorType>(ValTy))
David Green38c9a402021-07-09 11:51:16 +01005140 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005141
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005142 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005143 MVT MTy = LT.second;
5144 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5145 assert(ISD && "Invalid opcode");
5146
5147 // Horizontal adds can use the 'addv' instruction. We model the cost of these
David Greenc9cebda2021-07-22 18:19:54 +01005148 // instructions as twice a normal vector add, plus 1 for each legalization
5149 // step (LT.first). This is the only arithmetic vector reduction operation for
5150 // which we have an instruction.
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005151 // OR, XOR and AND costs should match the codegen from:
5152 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5153 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5154 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005155 static const CostTblEntry CostTblNoPairwise[]{
David Greenc9cebda2021-07-22 18:19:54 +01005156 {ISD::ADD, MVT::v8i8, 2},
5157 {ISD::ADD, MVT::v16i8, 2},
5158 {ISD::ADD, MVT::v4i16, 2},
5159 {ISD::ADD, MVT::v8i16, 2},
5160 {ISD::ADD, MVT::v4i32, 2},
Vasileios Porpodasf6690302022-07-28 17:01:15 -07005161 {ISD::ADD, MVT::v2i64, 2},
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005162 {ISD::OR, MVT::v8i8, 15},
5163 {ISD::OR, MVT::v16i8, 17},
5164 {ISD::OR, MVT::v4i16, 7},
5165 {ISD::OR, MVT::v8i16, 9},
5166 {ISD::OR, MVT::v2i32, 3},
5167 {ISD::OR, MVT::v4i32, 5},
5168 {ISD::OR, MVT::v2i64, 3},
5169 {ISD::XOR, MVT::v8i8, 15},
5170 {ISD::XOR, MVT::v16i8, 17},
5171 {ISD::XOR, MVT::v4i16, 7},
5172 {ISD::XOR, MVT::v8i16, 9},
5173 {ISD::XOR, MVT::v2i32, 3},
5174 {ISD::XOR, MVT::v4i32, 5},
5175 {ISD::XOR, MVT::v2i64, 3},
5176 {ISD::AND, MVT::v8i8, 15},
5177 {ISD::AND, MVT::v16i8, 17},
5178 {ISD::AND, MVT::v4i16, 7},
5179 {ISD::AND, MVT::v8i16, 9},
5180 {ISD::AND, MVT::v2i32, 3},
5181 {ISD::AND, MVT::v4i32, 5},
5182 {ISD::AND, MVT::v2i64, 3},
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005183 };
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005184 switch (ISD) {
5185 default:
5186 break;
Sushant Gokhalec5672e22024-09-24 14:35:01 +05305187 case ISD::FADD:
5188 if (Type *EltTy = ValTy->getScalarType();
5189 // FIXME: For half types without fullfp16 support, this could extend and
5190 // use a fp32 faddp reduction but current codegen unrolls.
5191 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5192 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5193 const unsigned NElts = MTy.getVectorNumElements();
5194 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5195 isPowerOf2_32(NElts))
5196 // Reduction corresponding to series of fadd instructions is lowered to
5197 // series of faddp instructions. faddp has latency/throughput that
5198 // matches fadd instruction and hence, every faddp instruction can be
5199 // considered to have a relative cost = 1 with
5200 // CostKind = TCK_RecipThroughput.
5201 // An faddp will pairwise add vector elements, so the size of input
5202 // vector reduces by half every time, requiring
5203 // #(faddp instructions) = log2_32(NElts).
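// Illustrative: a <4 x float> fadd reduction costs 2 faddp instructions and
// an <8 x half> reduction (with FullFP16) costs 3.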
5204 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5205 }
5206 break;
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005207 case ISD::ADD:
5208 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
David Greenc9cebda2021-07-22 18:19:54 +01005209 return (LT.first - 1) + Entry->Cost;
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005210 break;
5211 case ISD::XOR:
5212 case ISD::AND:
5213 case ISD::OR:
5214 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5215 if (!Entry)
5216 break;
5217 auto *ValVTy = cast<FixedVectorType>(ValTy);
David Greene79fac22023-06-01 09:28:48 +01005218 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005219 isPowerOf2_32(ValVTy->getNumElements())) {
5220 InstructionCost ExtraCost = 0;
5221 if (LT.first != 1) {
5222 // Type needs to be split, so there is an extra cost of LT.first - 1
5223 // arithmetic ops.
5224 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5225 MTy.getVectorNumElements());
5226 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5227 ExtraCost *= LT.first - 1;
5228 }
David Greene79fac22023-06-01 09:28:48 +01005229 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5230 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5231 return Cost + ExtraCost;
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005232 }
5233 break;
5234 }
David Sherwood0aff1792021-07-07 13:18:20 +01005235 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005236}
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005237
David Greene5f40192025-02-15 20:33:03 +00005238InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5239 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
Sergei Barannikov0014b492025-04-22 06:27:29 +03005240 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
David Greene5f40192025-02-15 20:33:03 +00005241 EVT VecVT = TLI->getValueType(DL, VecTy);
5242 EVT ResVT = TLI->getValueType(DL, ResTy);
5243
5244 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5245 VecVT.getSizeInBits() >= 64) {
5246 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5247
5248 // The legal cases are:
5249 // UADDLV 8/16/32->32
5250 // UADDLP 32->64
5251 unsigned RevVTSize = ResVT.getSizeInBits();
5252 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5253 RevVTSize <= 32) ||
5254 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5255 RevVTSize <= 32) ||
5256 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5257 RevVTSize <= 64))
5258 return (LT.first - 1) * 2 + 2;
5259 }
5260
5261 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5262 CostKind);
5263}
5264
5265InstructionCost
5266AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
5267 VectorType *VecTy,
Sergei Barannikov0014b492025-04-22 06:27:29 +03005268 TTI::TargetCostKind CostKind) const {
David Greene5f40192025-02-15 20:33:03 +00005269 EVT VecVT = TLI->getValueType(DL, VecTy);
5270 EVT ResVT = TLI->getValueType(DL, ResTy);
5271
5272 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
5273 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5274
5275 // The legal cases with dotprod are
5276 // UDOT 8->32
5277 // Which requires an additional uaddv to sum the i32 values.
5278 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5279 ResVT == MVT::i32)
5280 return LT.first + 2;
5281 }
5282
5283 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
5284}
5285
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005286InstructionCost
5287AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5288 TTI::TargetCostKind CostKind) const {
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005289 static const CostTblEntry ShuffleTbl[] = {
5290 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5291 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5292 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5293 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5294 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5295 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5296 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5297 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5298 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5299 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5300 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5301 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5302 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5303 };
5304
Paul Walker3bb22872022-08-26 14:32:46 +01005305 // The code-generator is currently not able to handle scalable vectors
5306 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5307 // it. This change will be removed when code-generation for these types is
5308 // sufficiently reliable.
5309 if (Tp->getElementCount() == ElementCount::getScalable(1))
5310 return InstructionCost::getInvalid();
5311
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005312 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005313 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005314 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5315 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5316 : LT.second;
5317 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5318 InstructionCost LegalizationCost = 0;
5319 if (Index < 0) {
5320 LegalizationCost =
5321 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5322 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5323 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5324 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5325 }
5326
5327 // Predicated splices are promoted during lowering. See AArch64ISelLowering.cpp.
5328 // Cost performed on a promoted type.
5329 if (LT.second.getScalarType() == MVT::i1) {
5330 LegalizationCost +=
5331 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5332 TTI::CastContextHint::None, CostKind) +
5333 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5334 TTI::CastContextHint::None, CostKind);
5335 }
5336 const auto *Entry =
5337 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5338 assert(Entry && "Illegal Type for Splice");
5339 LegalizationCost += Entry->Cost;
5340 return LegalizationCost * LT.first;
5341}
5342
David Sherwooda733c1f2025-01-20 14:07:03 +00005343InstructionCost AArch64TTIImpl::getPartialReductionCost(
5344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5345 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5346 TTI::PartialReductionExtendKind OpBExtend,
5347 std::optional<unsigned> BinOp) const {
5348 InstructionCost Invalid = InstructionCost::getInvalid();
5349 InstructionCost Cost(TTI::TCC_Basic);
5350
Nicholas Guy9c89faa62025-02-13 10:35:45 +00005351 // Sub opcodes currently only occur in chained cases.
5352 // Independent partial reduction subtractions are still costed as an add.
5353 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
David Sherwooda733c1f2025-01-20 14:07:03 +00005354 return Invalid;
5355
5356 if (InputTypeA != InputTypeB)
5357 return Invalid;
5358
5359 EVT InputEVT = EVT::getEVT(InputTypeA);
5360 EVT AccumEVT = EVT::getEVT(AccumType);
5361
Sam Tebbsc7995a62025-02-05 13:34:43 +00005362 unsigned VFMinValue = VF.getKnownMinValue();
5363
5364 if (VF.isScalable()) {
5365 if (!ST->isSVEorStreamingSVEAvailable())
5366 return Invalid;
5367
5368 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
5369 // since we can't lower that type.
5370 unsigned Scale =
5371 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
5372 if (VFMinValue == Scale)
5373 return Invalid;
5374 }
David Sherwoodefc72342025-02-11 15:10:39 +00005375 if (VF.isFixed() &&
5376 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
David Sherwooda733c1f2025-01-20 14:07:03 +00005377 return Invalid;
5378
5379 if (InputEVT == MVT::i8) {
Sam Tebbsc7995a62025-02-05 13:34:43 +00005380 switch (VFMinValue) {
David Sherwooda733c1f2025-01-20 14:07:03 +00005381 default:
5382 return Invalid;
5383 case 8:
5384 if (AccumEVT == MVT::i32)
5385 Cost *= 2;
5386 else if (AccumEVT != MVT::i64)
5387 return Invalid;
5388 break;
5389 case 16:
5390 if (AccumEVT == MVT::i64)
5391 Cost *= 2;
5392 else if (AccumEVT != MVT::i32)
5393 return Invalid;
5394 break;
5395 }
5396 } else if (InputEVT == MVT::i16) {
5397 // FIXME: Allow i32 accumulator but increase cost, as we would extend
5398 // it to i64.
Sam Tebbsc7995a62025-02-05 13:34:43 +00005399 if (VFMinValue != 8 || AccumEVT != MVT::i64)
David Sherwooda733c1f2025-01-20 14:07:03 +00005400 return Invalid;
5401 } else
5402 return Invalid;
5403
Sam Tebbs2876dbc2025-05-01 16:06:37 +01005404 // AArch64 supports lowering mixed fixed-width extensions to a usdot but only
5405 // if the i8mm feature is available.
David Sherwooda733c1f2025-01-20 14:07:03 +00005406 if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None ||
Sam Tebbs2876dbc2025-05-01 16:06:37 +01005407 (OpAExtend != OpBExtend && !ST->hasMatMulInt8()))
David Sherwooda733c1f2025-01-20 14:07:03 +00005408 return Invalid;
5409
5410 if (!BinOp || *BinOp != Instruction::Mul)
5411 return Invalid;
5412
5413 return Cost;
5414}
5415
David Green4ac27212024-04-09 16:36:08 +01005416InstructionCost AArch64TTIImpl::getShuffleCost(
5417 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
5418 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005419 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005420 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
David Green4ac27212024-04-09 16:36:08 +01005421
David Green8e2a0e62022-04-27 13:51:50 +01005422 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5423 // into smaller vectors and sum the cost of each shuffle.
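// Illustrative: an <8 x i64> shuffle is legalized into four v2i64 pieces;
// each sub-mask that draws from at most two legalized source vectors is
// re-costed via getShuffleCost, and sub-masks needing more sources are
// costed pessimistically at one element move per result lane.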
David Green46cef9a2022-04-27 15:36:15 +01005424 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
David Green8e2a0e62022-04-27 13:51:50 +01005425 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
Alexey Bataev263a00f2023-10-02 06:44:01 -07005426 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
David Green4ac27212024-04-09 16:36:08 +01005427
David Green18bb1752024-04-21 13:53:22 +01005428 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5429 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5430 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5431 // cost than just the load.
5432 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5433 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
5434 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
5435 return std::max<InstructionCost>(1, LT.first / 4);
5436
David Green4ac27212024-04-09 16:36:08 +01005437 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5438 // store(interleaving-shuffle). The shuffle cost could potentially be free,
David Green18bb1752024-04-21 13:53:22 +01005439 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
David Green4ac27212024-04-09 16:36:08 +01005440 // cost than just the store.
5441 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5442 (ShuffleVectorInst::isInterleaveMask(
5443 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
5444 ShuffleVectorInst::isInterleaveMask(
5445 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
5446 return LT.first;
5447
Alexey Bataev263a00f2023-10-02 06:44:01 -07005448 unsigned TpNumElts = Mask.size();
David Green8e2a0e62022-04-27 13:51:50 +01005449 unsigned LTNumElts = LT.second.getVectorNumElements();
5450 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5451 VectorType *NTp =
5452 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
5453 InstructionCost Cost;
5454 for (unsigned N = 0; N < NumVecs; N++) {
5455 SmallVector<int> NMask;
5456 // Split the existing mask into chunks of size LTNumElts. Track the source
5457 // sub-vectors to ensure the result has at most 2 inputs.
5458 unsigned Source1, Source2;
5459 unsigned NumSources = 0;
5460 for (unsigned E = 0; E < LTNumElts; E++) {
5461 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
ManuelJBritod22edb92023-04-27 16:22:57 +01005462 : PoisonMaskElem;
David Green8e2a0e62022-04-27 13:51:50 +01005463 if (MaskElt < 0) {
ManuelJBritod22edb92023-04-27 16:22:57 +01005464 NMask.push_back(PoisonMaskElem);
David Green8e2a0e62022-04-27 13:51:50 +01005465 continue;
5466 }
5467
5468 // Calculate which source from the input this comes from and whether it
5469 // is new to us.
5470 unsigned Source = MaskElt / LTNumElts;
5471 if (NumSources == 0) {
5472 Source1 = Source;
5473 NumSources = 1;
5474 } else if (NumSources == 1 && Source != Source1) {
5475 Source2 = Source;
5476 NumSources = 2;
5477 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5478 NumSources++;
5479 }
5480
5481 // Add to the new mask. For the NumSources>2 case these are not correct,
5482 // but are only used for the modular lane number.
5483 if (Source == Source1)
5484 NMask.push_back(MaskElt % LTNumElts);
5485 else if (Source == Source2)
5486 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5487 else
5488 NMask.push_back(MaskElt % LTNumElts);
5489 }
5490 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
David Spickettd1f3a922024-07-29 11:24:39 +00005491 // getShuffleCost. If not then cost it using the worst case as the number
5492 // of element moves into a new vector.
David Green8e2a0e62022-04-27 13:51:50 +01005493 if (NumSources <= 2)
5494 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5495 : TTI::SK_PermuteTwoSrc,
David Green4ac27212024-04-09 16:36:08 +01005496 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
David Green8e2a0e62022-04-27 13:51:50 +01005497 else
5498 Cost += LTNumElts;
5499 }
5500 return Cost;
5501 }
5502
Alexey Bataev9a207572023-08-08 09:57:50 -07005503 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
Alexey Bataev7bc079c2024-02-12 07:09:49 -05005504 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
David Greenc05fc9b2025-01-09 12:10:43 +00005505 // A subvector extract can be implemented with an ext (or trivial extract, if
David Greena8dab1a2025-01-08 08:13:07 +00005506 // from lane 0). This currently only handles low or high extracts to prevent
5507 // SLP vectorizer regressions.
5508 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5509 if (LT.second.is128BitVector() &&
5510 cast<FixedVectorType>(SubTp)->getNumElements() ==
5511 LT.second.getVectorNumElements() / 2) {
5512 if (Index == 0)
5513 return 0;
David Green32bc0292025-01-08 08:59:15 +00005514 if (Index == (int)LT.second.getVectorNumElements() / 2)
David Greena8dab1a2025-01-08 08:13:07 +00005515 return 1;
5516 }
Alexey Bataev7bc079c2024-02-12 07:09:49 -05005517 Kind = TTI::SK_PermuteSingleSrc;
David Greena8dab1a2025-01-08 08:13:07 +00005518 }
David Greend6327052022-04-27 12:09:01 +01005519
Sjoerd Meijer775451b2023-03-13 13:05:34 +00005520 // Check for broadcast loads, which are supported by the LD1R instruction.
5521 // In terms of code-size, the shuffle vector is free when a load + dup get
5522 // folded into a LD1R. That's what we check and return here. For performance
5523 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5524 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5525 // that we model the load + dup sequence slightly higher because LD1R is a
5526 // high latency instruction.
5527 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
David Greend6327052022-04-27 12:09:01 +01005528 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
5529 if (IsLoad && LT.second.isVector() &&
5530 isLegalBroadcastLoad(Tp->getElementType(),
David Green8e2a0e62022-04-27 13:51:50 +01005531 LT.second.getVectorElementCount()))
Sjoerd Meijer775451b2023-03-13 13:05:34 +00005532 return 0;
David Greend6327052022-04-27 12:09:01 +01005533 }
5534
5535 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5536 // from the perfect shuffle tables.
5537 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
5538 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
5539 all_of(Mask, [](int E) { return E < 8; }))
5540 return getPerfectShuffleCost(Mask);
5541
David Greenf0e79d92024-04-09 17:16:14 +01005542 // Check for identity masks, which we can treat as free.
5543 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5544 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5545 all_of(enumerate(Mask), [](const auto &M) {
5546 return M.value() < 0 || M.value() == (int)M.index();
5547 }))
5548 return 0;
5549
David Greena5367432024-04-11 08:45:28 +01005550 // Check for other shuffles that are not SK_ kinds but we have native
5551 // instructions for, for example ZIP and UZP.
5552 unsigned Unused;
5553 if (LT.second.isFixedLengthVector() &&
5554 LT.second.getVectorNumElements() == Mask.size() &&
5555 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
David Green363ec6f2024-05-06 18:37:04 +01005556 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5557 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
David Green48397fe2025-02-25 10:32:45 +00005558 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5559 LT.second.getVectorNumElements(), 16) ||
5560 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5561 LT.second.getVectorNumElements(), 32) ||
5562 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5563 LT.second.getVectorNumElements(), 64) ||
David Green6c2cc822024-04-14 12:09:14 +01005564 // Check for non-zero lane splats
5565 all_of(drop_begin(Mask),
5566 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
David Greena5367432024-04-11 08:45:28 +01005567 return 1;
5568
Simon Pilgrim071e8222018-10-25 10:52:36 +00005569 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
Caroline Concattob52e6c52021-01-27 15:59:27 +00005570 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
David Green0cf9e472022-08-22 12:44:57 +01005571 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
Simon Pilgrim9c8f9372018-06-22 09:45:31 +00005572 static const CostTblEntry ShuffleTbl[] = {
liqinweng723245b2022-09-08 18:33:29 +08005573 // Broadcast shuffle kinds can be performed with 'dup'.
5574 {TTI::SK_Broadcast, MVT::v8i8, 1},
5575 {TTI::SK_Broadcast, MVT::v16i8, 1},
5576 {TTI::SK_Broadcast, MVT::v4i16, 1},
5577 {TTI::SK_Broadcast, MVT::v8i16, 1},
5578 {TTI::SK_Broadcast, MVT::v2i32, 1},
5579 {TTI::SK_Broadcast, MVT::v4i32, 1},
5580 {TTI::SK_Broadcast, MVT::v2i64, 1},
David Green180865a2023-03-14 21:25:18 +00005581 {TTI::SK_Broadcast, MVT::v4f16, 1},
5582 {TTI::SK_Broadcast, MVT::v8f16, 1},
liqinweng723245b2022-09-08 18:33:29 +08005583 {TTI::SK_Broadcast, MVT::v2f32, 1},
5584 {TTI::SK_Broadcast, MVT::v4f32, 1},
5585 {TTI::SK_Broadcast, MVT::v2f64, 1},
5586 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5587 // 'zip1/zip2' instructions.
5588 {TTI::SK_Transpose, MVT::v8i8, 1},
5589 {TTI::SK_Transpose, MVT::v16i8, 1},
5590 {TTI::SK_Transpose, MVT::v4i16, 1},
5591 {TTI::SK_Transpose, MVT::v8i16, 1},
5592 {TTI::SK_Transpose, MVT::v2i32, 1},
5593 {TTI::SK_Transpose, MVT::v4i32, 1},
5594 {TTI::SK_Transpose, MVT::v2i64, 1},
David Green180865a2023-03-14 21:25:18 +00005595 {TTI::SK_Transpose, MVT::v4f16, 1},
5596 {TTI::SK_Transpose, MVT::v8f16, 1},
liqinweng723245b2022-09-08 18:33:29 +08005597 {TTI::SK_Transpose, MVT::v2f32, 1},
5598 {TTI::SK_Transpose, MVT::v4f32, 1},
5599 {TTI::SK_Transpose, MVT::v2f64, 1},
5600 // Select shuffle kinds.
5601 // TODO: handle vXi8/vXi16.
5602 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
5603 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
5604 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
5605 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
5606 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
5607 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
5608 // PermuteSingleSrc shuffle kinds.
5609 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
5610 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
5611 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
5612 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
5613 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
5614 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
5615 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
5616 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
David Green3875c382022-09-08 19:54:12 +01005617 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // perfectshuffle worst case
5618 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
5619 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
liqinweng723245b2022-09-08 18:33:29 +08005620 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
5621 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
5622 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
5623 // Reverse can be lowered with `rev`.
5624 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
5625 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
5626 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
5627 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
5628 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
5629 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
5630 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
David Green5c6453d2025-02-24 08:37:15 +00005631 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
liqinweng723245b2022-09-08 18:33:29 +08005632 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
5633 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
5634 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
David Green5c6453d2025-02-24 08:37:15 +00005635 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
liqinweng723245b2022-09-08 18:33:29 +08005636 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
5637 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
5638 // Splice can all be lowered as `ext`.
5639 {TTI::SK_Splice, MVT::v2i32, 1},
5640 {TTI::SK_Splice, MVT::v4i32, 1},
5641 {TTI::SK_Splice, MVT::v2i64, 1},
5642 {TTI::SK_Splice, MVT::v2f32, 1},
5643 {TTI::SK_Splice, MVT::v4f32, 1},
5644 {TTI::SK_Splice, MVT::v2f64, 1},
5645 {TTI::SK_Splice, MVT::v8f16, 1},
5646 {TTI::SK_Splice, MVT::v8bf16, 1},
5647 {TTI::SK_Splice, MVT::v8i16, 1},
5648 {TTI::SK_Splice, MVT::v16i8, 1},
liqinweng723245b2022-09-08 18:33:29 +08005649 {TTI::SK_Splice, MVT::v4f16, 1},
David Green5c6453d2025-02-24 08:37:15 +00005650 {TTI::SK_Splice, MVT::v4bf16, 1},
liqinweng723245b2022-09-08 18:33:29 +08005651 {TTI::SK_Splice, MVT::v4i16, 1},
5652 {TTI::SK_Splice, MVT::v8i8, 1},
5653 // Broadcast shuffle kinds for scalable vectors
5654 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
5655 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
5656 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
5657 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
5658 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
5659 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
5660 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
5661 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
5662 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
5663 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
5664 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
5665 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
5666 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
5667 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
5668 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
5669 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
5670 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
5671 // Handle the cases for vector.reverse with scalable vectors
5672 {TTI::SK_Reverse, MVT::nxv16i8, 1},
5673 {TTI::SK_Reverse, MVT::nxv8i16, 1},
5674 {TTI::SK_Reverse, MVT::nxv4i32, 1},
5675 {TTI::SK_Reverse, MVT::nxv2i64, 1},
5676 {TTI::SK_Reverse, MVT::nxv2f16, 1},
5677 {TTI::SK_Reverse, MVT::nxv4f16, 1},
5678 {TTI::SK_Reverse, MVT::nxv8f16, 1},
5679 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
5680 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
5681 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
5682 {TTI::SK_Reverse, MVT::nxv2f32, 1},
5683 {TTI::SK_Reverse, MVT::nxv4f32, 1},
5684 {TTI::SK_Reverse, MVT::nxv2f64, 1},
5685 {TTI::SK_Reverse, MVT::nxv16i1, 1},
5686 {TTI::SK_Reverse, MVT::nxv8i1, 1},
5687 {TTI::SK_Reverse, MVT::nxv4i1, 1},
5688 {TTI::SK_Reverse, MVT::nxv2i1, 1},
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005689 };
Simon Pilgrim9c8f9372018-06-22 09:45:31 +00005690 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005691 return LT.first * Entry->Cost;
5692 }
David Greenfa784f62022-04-07 19:27:41 +01005693
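  // Scalable-vector splices are costed separately via getSpliceCost.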
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005694 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
David Green2ba455f2025-04-21 06:31:03 +01005695 return getSpliceCost(Tp, Index, CostKind);
David Greenfa784f62022-04-07 19:27:41 +01005696
5697 // Inserting a subvector can often be done with either a D, S or H register
5698 // move, so long as the inserted vector is "aligned".
5699 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5700 LT.second.getSizeInBits() <= 128 && SubTp) {
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005701 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
David Greenfa784f62022-04-07 19:27:41 +01005702 if (SubLT.second.isVector()) {
5703 int NumElts = LT.second.getVectorNumElements();
5704 int NumSubElts = SubLT.second.getVectorNumElements();
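      // The insert is "aligned" when it targets a multiple of the subvector
      // size and the subvector evenly divides the legalized vector.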
5705 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5706 return SubLT.first;
5707 }
5708 }
5709
Alexey Bataev7bc079c2024-02-12 07:09:49 -05005710 // Restore optimal kind.
5711 if (IsExtractSubvector)
5712 Kind = TTI::SK_ExtractSubvector;
David Green4ac27212024-04-09 16:36:08 +01005713 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
5714 CxtI);
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005715}
David Sherwoodf15b6b22022-07-12 12:03:39 +01005716
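/// Returns true if any load or store in \p TheLoop accesses memory through a
/// pointer whose stride is negative, i.e. the addresses decrease from one
/// iteration to the next.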
David Sherwood636efd22023-03-14 18:15:03 +00005717static bool containsDecreasingPointers(Loop *TheLoop,
5718 PredicatedScalarEvolution *PSE) {
Philip Reamese41dce42023-05-11 09:47:37 -07005719 const auto &Strides = DenseMap<Value *, const SCEV *>();
David Sherwood636efd22023-03-14 18:15:03 +00005720 for (BasicBlock *BB : TheLoop->blocks()) {
5721 // Scan the instructions in the block and look for addresses that are
5722 // consecutive and decreasing.
5723 for (Instruction &I : *BB) {
5724 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
5725 Value *Ptr = getLoadStorePointerOperand(&I);
5726 Type *AccessTy = getLoadStoreType(&I);
5727 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
5728 /*ShouldCheckWrap=*/false)
5729 .value_or(0) < 0)
5730 return true;
5731 }
5732 }
5733 }
5734 return false;
5735}
5736
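// An explicit -sve-prefer-fixed-over-scalable-if-equal setting overrides the
// subtarget's tuning.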
David Greenb35d3452024-12-31 11:07:42 +00005737bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
5738 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
5739 return SVEPreferFixedOverScalableIfEqualCost;
5740 return ST->useFixedOverScalableIfEqualCost();
5741}
5742
Sjoerd Meijer9bccf612024-11-20 09:33:39 +00005743unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
5744 return ST->getEpilogueVectorizationMinVF();
5745}
5746
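/// Returns true if tail-folding (predicated vectorisation) is preferred over
/// falling back to an unpredicated epilogue loop. This requires SVE, no
/// interleave groups, tail-folding options that cover any reductions,
/// recurrences and reversed accesses in the loop, and a loop body large
/// enough for tail-folding to beat interleaving.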
Sergei Barannikov0014b492025-04-22 06:27:29 +03005747bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
David Sherwood7beb2ca2023-04-20 12:34:55 +00005748 if (!ST->hasSVE())
David Sherwoodf15b6b22022-07-12 12:03:39 +01005749 return false;
5750
David Sherwood4ef9cb62022-07-18 10:36:11 +01005751 // We don't currently support vectorisation with interleaving for SVE - with
5752 // such loops we're better off not using tail-folding. This gives us a chance
5753 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
David Sherwoodb4089cf2023-04-04 13:58:58 +00005754 if (TFI->IAI->hasGroups())
David Sherwood4ef9cb62022-07-18 10:36:11 +01005755 return false;
5756
David Sherwood7beb2ca2023-04-20 12:34:55 +00005757 TailFoldingOpts Required = TailFoldingOpts::Disabled;
David Sherwoodb4089cf2023-04-04 13:58:58 +00005758 if (TFI->LVL->getReductionVars().size())
David Sherwood7beb2ca2023-04-20 12:34:55 +00005759 Required |= TailFoldingOpts::Reductions;
David Sherwoodb4089cf2023-04-04 13:58:58 +00005760 if (TFI->LVL->getFixedOrderRecurrences().size())
David Sherwood7beb2ca2023-04-20 12:34:55 +00005761 Required |= TailFoldingOpts::Recurrences;
David Sherwood636efd22023-03-14 18:15:03 +00005762
5763 // We call this to discover whether any load/store pointers in the loop have
5764 // negative strides. This will require extra work to reverse the loop
5765 // predicate, which may be expensive.
David Sherwoodb4089cf2023-04-04 13:58:58 +00005766 if (containsDecreasingPointers(TFI->LVL->getLoop(),
5767 TFI->LVL->getPredicatedScalarEvolution()))
David Sherwood7beb2ca2023-04-20 12:34:55 +00005768 Required |= TailFoldingOpts::Reverse;
5769 if (Required == TailFoldingOpts::Disabled)
5770 Required |= TailFoldingOpts::Simple;
David Sherwoodf15b6b22022-07-12 12:03:39 +01005771
David Sherwoodc7dbe322023-04-25 08:46:41 +00005772 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
5773 Required))
5774 return false;
5775
5776 // Don't tail-fold for tight loops where we would be better off interleaving
5777 // with an unpredicated loop.
5778 unsigned NumInsns = 0;
5779 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5780 NumInsns += BB->sizeWithoutDebug();
5781 }
5782
5783 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
5784 return NumInsns >= SVETailFoldInsnThreshold;
David Sherwoodf15b6b22022-07-12 12:03:39 +01005785}
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005786
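/// Return the extra cost of folding \p Scale and \p BaseOffset into the
/// addressing mode for an access of type \p Ty, or an invalid cost if the
/// resulting addressing mode is not legal.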
5787InstructionCost
5788AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
Graham Hunter2e8d8152024-05-10 11:22:11 +01005789 StackOffset BaseOffset, bool HasBaseReg,
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005790 int64_t Scale, unsigned AddrSpace) const {
5791 // Scaling factors are not free at all.
5792 // Operands                     | Rt Latency
5793 // -------------------------------------------
5794 // Rt, [Xn, Xm]                 | 4
5795 // -------------------------------------------
5796 // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
5797 // Rt, [Xn, Wm, <extend> #imm]  |
5798 TargetLoweringBase::AddrMode AM;
5799 AM.BaseGV = BaseGV;
Graham Hunter2e8d8152024-05-10 11:22:11 +01005800 AM.BaseOffs = BaseOffset.getFixed();
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005801 AM.HasBaseReg = HasBaseReg;
5802 AM.Scale = Scale;
Graham Hunter2e8d8152024-05-10 11:22:11 +01005803 AM.ScalableOffset = BaseOffset.getScalable();
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005804 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
5805 // Scale represents reg2 * scale, thus account for 1 if
5806 // it is not equal to 0 or 1.
5807 return AM.Scale != 0 && AM.Scale != 1;
Craig Topper39c454a2025-03-05 09:10:45 -08005808 return InstructionCost::getInvalid();
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005809}
David Greena2d68b42024-01-22 23:46:58 +00005810
Sergei Barannikov0014b492025-04-22 06:27:29 +03005811bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
5812 const Instruction *I) const {
Florian Hahn9a0f2512024-11-30 21:05:41 +00005813 if (EnableOrLikeSelectOpt) {
5814 // For the binary operators (e.g. or) we need to be more careful than
5815 // selects; here we only transform them if they are already at a natural
5816 // break point in the code - the end of a block with an unconditional
5817 // terminator.
5818 if (I->getOpcode() == Instruction::Or &&
5819 isa<BranchInst>(I->getNextNode()) &&
5820 cast<BranchInst>(I->getNextNode())->isUnconditional())
5821 return true;
5822
5823 if (I->getOpcode() == Instruction::Add ||
5824 I->getOpcode() == Instruction::Sub)
5825 return true;
5826 }
David Greena2d68b42024-01-22 23:46:58 +00005827 return BaseT::shouldTreatInstructionLikeSelect(I);
Sander de Smalen3abf55a2024-01-31 11:38:29 +00005828}
Graham Huntere16f2f52024-06-06 14:45:36 +01005829
Sergei Barannikov0014b492025-04-22 06:27:29 +03005830bool AArch64TTIImpl::isLSRCostLess(
5831 const TargetTransformInfo::LSRCost &C1,
5832 const TargetTransformInfo::LSRCost &C2) const {
Graham Huntere16f2f52024-06-06 14:45:36 +01005833 // What is AArch64-specific here is adding the number of instructions to the
5834 // comparison (though not as the first consideration, as some targets do)
5835 // along with changing the priority of the base additions.
5836 // TODO: Maybe a more nuanced tradeoff between instruction count
5837 // and number of registers? To be investigated at a later date.
5838 if (EnableLSRCostOpt)
5839 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
5840 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5841 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
5842 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5843
5844 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
Sander de Smalen738533c2024-06-24 11:06:16 +01005845}
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07005846
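/// Returns true if \p V is a shufflevector whose mask elements are all equal,
/// i.e. a splat of a single input lane.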
5847static bool isSplatShuffle(Value *V) {
5848 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5849 return all_equal(Shuf->getShuffleMask());
5850 return false;
5851}
5852
5853/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5854/// or upper half of the vector elements.
5855static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5856 bool AllowSplat = false) {
5857 // Scalable types can't be extract shuffle vectors.
5858 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5859 return false;
5860
5861 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5862 auto *FullTy = FullV->getType();
5863 auto *HalfTy = HalfV->getType();
5864 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5865 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5866 };
5867
5868 auto extractHalf = [](Value *FullV, Value *HalfV) {
5869 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5870 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5871 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5872 };
5873
5874 ArrayRef<int> M1, M2;
5875 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5876 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5877 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5878 return false;
5879
5880 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5881 // it is not checked as an extract below.
5882 if (AllowSplat && isSplatShuffle(Op1))
5883 S1Op1 = nullptr;
5884 if (AllowSplat && isSplatShuffle(Op2))
5885 S2Op1 = nullptr;
5886
5887 // Check that the operands are half as wide as the result and we extract
5888 // half of the elements of the input vectors.
5889 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5890 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5891 return false;
5892
5893 // Check that the mask extracts either the lower or upper half of the vector
5894 // elements.
5895 int M1Start = 0;
5896 int M2Start = 0;
5897 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5898 if ((S1Op1 &&
5899 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5900 (S2Op1 &&
5901 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5902 return false;
5903
5904 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5905 (M2Start != 0 && M2Start != (NumElements / 2)))
5906 return false;
5907 if (S1Op1 && S2Op1 && M1Start != M2Start)
5908 return false;
5909
5910 return true;
5911}
5912
5913/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5914/// of the vector elements.
5915static bool areExtractExts(Value *Ext1, Value *Ext2) {
5916 auto areExtDoubled = [](Instruction *Ext) {
5917 return Ext->getType()->getScalarSizeInBits() ==
5918 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5919 };
5920
5921 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5922 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5923 !areExtDoubled(cast<Instruction>(Ext1)) ||
5924 !areExtDoubled(cast<Instruction>(Ext2)))
5925 return false;
5926
5927 return true;
5928}
5929
5930/// Check if Op could be used with vmull_high_p64 intrinsic.
5931static bool isOperandOfVmullHighP64(Value *Op) {
5932 Value *VectorOperand = nullptr;
5933 ConstantInt *ElementIndex = nullptr;
5934 return match(Op, m_ExtractElt(m_Value(VectorOperand),
5935 m_ConstantInt(ElementIndex))) &&
5936 ElementIndex->getValue() == 1 &&
5937 isa<FixedVectorType>(VectorOperand->getType()) &&
5938 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5939}
5940
5941/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
5942static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5943 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
5944}
5945
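/// Returns true if the gather/scatter pointer \p Ptrs is a GEP of the form
/// scalar_base + vector_of_offsets (the shape CodeGenPrepare produces), and
/// queues in \p Ops any extends of the offsets that are worth sinking with it.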
5946static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
5947 // Restrict ourselves to the form CodeGenPrepare typically constructs.
5948 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5949 if (!GEP || GEP->getNumOperands() != 2)
5950 return false;
5951
5952 Value *Base = GEP->getOperand(0);
5953 Value *Offsets = GEP->getOperand(1);
5954
5955 // We only care about scalar_base+vector_offsets.
5956 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5957 return false;
5958
5959 // Sink extends that would allow us to use 32-bit offset vectors.
5960 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5961 auto *OffsetsInst = cast<Instruction>(Offsets);
5962 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5963 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5964 Ops.push_back(&GEP->getOperandUse(1));
5965 }
5966
5967 // Sink the GEP.
5968 return true;
5969}
5970
5971/// We want to sink the following cases:
5972/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5973/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
5974static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
5975 if (match(Op, m_VScale()))
5976 return true;
5977 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5978 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
5979 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5980 return true;
5981 }
5982 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5983 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
5984 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5985 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5986 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5987 return true;
5988 }
5989 return false;
5990}
5991
5992/// Check if sinking \p I's operands to I's basic block is profitable, because
5993/// the operands can be folded into a target instruction, e.g.
5994/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
5995bool AArch64TTIImpl::isProfitableToSinkOperands(
5996 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5997 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
5998 switch (II->getIntrinsicID()) {
5999 case Intrinsic::aarch64_neon_smull:
6000 case Intrinsic::aarch64_neon_umull:
6001 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6002 /*AllowSplat=*/true)) {
6003 Ops.push_back(&II->getOperandUse(0));
6004 Ops.push_back(&II->getOperandUse(1));
6005 return true;
6006 }
6007 [[fallthrough]];
6008
6009 case Intrinsic::fma:
6010 case Intrinsic::fmuladd:
6011 if (isa<VectorType>(I->getType()) &&
6012 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6013 !ST->hasFullFP16())
6014 return false;
6015 [[fallthrough]];
6016 case Intrinsic::aarch64_neon_sqdmull:
6017 case Intrinsic::aarch64_neon_sqdmulh:
6018 case Intrinsic::aarch64_neon_sqrdmulh:
6019 // Sink splats for index lane variants
6020 if (isSplatShuffle(II->getOperand(0)))
6021 Ops.push_back(&II->getOperandUse(0));
6022 if (isSplatShuffle(II->getOperand(1)))
6023 Ops.push_back(&II->getOperandUse(1));
6024 return !Ops.empty();
6025 case Intrinsic::aarch64_neon_fmlal:
6026 case Intrinsic::aarch64_neon_fmlal2:
6027 case Intrinsic::aarch64_neon_fmlsl:
6028 case Intrinsic::aarch64_neon_fmlsl2:
6029 // Sink splats for index lane variants
6030 if (isSplatShuffle(II->getOperand(1)))
6031 Ops.push_back(&II->getOperandUse(1));
6032 if (isSplatShuffle(II->getOperand(2)))
6033 Ops.push_back(&II->getOperandUse(2));
6034 return !Ops.empty();
6035 case Intrinsic::aarch64_sve_ptest_first:
6036 case Intrinsic::aarch64_sve_ptest_last:
6037 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6038 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6039 Ops.push_back(&II->getOperandUse(0));
6040 return !Ops.empty();
6041 case Intrinsic::aarch64_sme_write_horiz:
6042 case Intrinsic::aarch64_sme_write_vert:
6043 case Intrinsic::aarch64_sme_writeq_horiz:
6044 case Intrinsic::aarch64_sme_writeq_vert: {
6045 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6046 if (!Idx || Idx->getOpcode() != Instruction::Add)
6047 return false;
6048 Ops.push_back(&II->getOperandUse(1));
6049 return true;
6050 }
6051 case Intrinsic::aarch64_sme_read_horiz:
6052 case Intrinsic::aarch64_sme_read_vert:
6053 case Intrinsic::aarch64_sme_readq_horiz:
6054 case Intrinsic::aarch64_sme_readq_vert:
6055 case Intrinsic::aarch64_sme_ld1b_vert:
6056 case Intrinsic::aarch64_sme_ld1h_vert:
6057 case Intrinsic::aarch64_sme_ld1w_vert:
6058 case Intrinsic::aarch64_sme_ld1d_vert:
6059 case Intrinsic::aarch64_sme_ld1q_vert:
6060 case Intrinsic::aarch64_sme_st1b_vert:
6061 case Intrinsic::aarch64_sme_st1h_vert:
6062 case Intrinsic::aarch64_sme_st1w_vert:
6063 case Intrinsic::aarch64_sme_st1d_vert:
6064 case Intrinsic::aarch64_sme_st1q_vert:
6065 case Intrinsic::aarch64_sme_ld1b_horiz:
6066 case Intrinsic::aarch64_sme_ld1h_horiz:
6067 case Intrinsic::aarch64_sme_ld1w_horiz:
6068 case Intrinsic::aarch64_sme_ld1d_horiz:
6069 case Intrinsic::aarch64_sme_ld1q_horiz:
6070 case Intrinsic::aarch64_sme_st1b_horiz:
6071 case Intrinsic::aarch64_sme_st1h_horiz:
6072 case Intrinsic::aarch64_sme_st1w_horiz:
6073 case Intrinsic::aarch64_sme_st1d_horiz:
6074 case Intrinsic::aarch64_sme_st1q_horiz: {
6075 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6076 if (!Idx || Idx->getOpcode() != Instruction::Add)
6077 return false;
6078 Ops.push_back(&II->getOperandUse(3));
6079 return true;
6080 }
6081 case Intrinsic::aarch64_neon_pmull:
6082 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6083 return false;
6084 Ops.push_back(&II->getOperandUse(0));
6085 Ops.push_back(&II->getOperandUse(1));
6086 return true;
6087 case Intrinsic::aarch64_neon_pmull64:
6088 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6089 II->getArgOperand(1)))
6090 return false;
6091 Ops.push_back(&II->getArgOperandUse(0));
6092 Ops.push_back(&II->getArgOperandUse(1));
6093 return true;
6094 case Intrinsic::masked_gather:
6095 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6096 return false;
6097 Ops.push_back(&II->getArgOperandUse(0));
6098 return true;
6099 case Intrinsic::masked_scatter:
6100 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6101 return false;
6102 Ops.push_back(&II->getArgOperandUse(1));
6103 return true;
6104 default:
6105 return false;
6106 }
6107 }
6108
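  // A condition is only worth sinking if it is an or-reduction of a scalable
  // vector; this is what the select and branch cases below check for.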
David Sherwood346185c2025-01-06 13:17:14 +00006109 auto ShouldSinkCondition = [](Value *Cond) -> bool {
6110 auto *II = dyn_cast<IntrinsicInst>(Cond);
6111 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
6112 isa<ScalableVectorType>(II->getOperand(0)->getType());
6113 };
6114
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006115 switch (I->getOpcode()) {
6116 case Instruction::GetElementPtr:
6117 case Instruction::Add:
6118 case Instruction::Sub:
David Sherwood346185c2025-01-06 13:17:14 +00006119 // Sink vscales closer to uses for better isel
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006120 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6121 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6122 Ops.push_back(&I->getOperandUse(Op));
6123 return true;
6124 }
6125 }
6126 break;
David Sherwood346185c2025-01-06 13:17:14 +00006127 case Instruction::Select: {
6128 if (!ShouldSinkCondition(I->getOperand(0)))
6129 return false;
6130
6131 Ops.push_back(&I->getOperandUse(0));
6132 return true;
6133 }
6134 case Instruction::Br: {
6135 if (cast<BranchInst>(I)->isUnconditional())
6136 return false;
6137
6138 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
6139 return false;
6140
6141 Ops.push_back(&I->getOperandUse(0));
6142 return true;
6143 }
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006144 default:
6145 break;
6146 }
6147
6148 if (!I->getType()->isVectorTy())
6149 return false;
6150
6151 switch (I->getOpcode()) {
6152 case Instruction::Sub:
6153 case Instruction::Add: {
6154 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6155 return false;
6156
6157 // If the exts' operands extract either the lower or upper elements, we
6158 // can sink them too.
6159 auto Ext1 = cast<Instruction>(I->getOperand(0));
6160 auto Ext2 = cast<Instruction>(I->getOperand(1));
6161 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6162 Ops.push_back(&Ext1->getOperandUse(0));
6163 Ops.push_back(&Ext2->getOperandUse(0));
6164 }
6165
6166 Ops.push_back(&I->getOperandUse(0));
6167 Ops.push_back(&I->getOperandUse(1));
6168
6169 return true;
6170 }
6171 case Instruction::Or: {
6172 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6173 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6174 if (ST->hasNEON()) {
6175 Instruction *OtherAnd, *IA, *IB;
6176 Value *MaskValue;
6177 // MainAnd refers to the And instruction that has 'Not' as one of its operands
6178 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6179 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6180 m_Instruction(IA)))))) {
6181 if (match(OtherAnd,
6182 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6183 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6184 ? cast<Instruction>(I->getOperand(1))
6185 : cast<Instruction>(I->getOperand(0));
6186
6187 // Both Ands should be in the same basic block as the Or
6188 if (I->getParent() != MainAnd->getParent() ||
6189 I->getParent() != OtherAnd->getParent())
6190 return false;
6191
6192 // Non-mask operands of both Ands should also be in the same basic block
6193 if (I->getParent() != IA->getParent() ||
6194 I->getParent() != IB->getParent())
6195 return false;
6196
6197 Ops.push_back(
6198 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6199 Ops.push_back(&I->getOperandUse(0));
6200 Ops.push_back(&I->getOperandUse(1));
6201
6202 return true;
6203 }
6204 }
6205 }
6206
6207 return false;
6208 }
6209 case Instruction::Mul: {
Hari Limaye8bc95512024-12-06 12:45:18 +00006210 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6211 auto *Ty = cast<VectorType>(V->getType());
6212 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6213 if (Ty->isScalableTy())
6214 return false;
6215
6216 // Indexed variants of Mul exist for i16 and i32 element types only.
6217 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6218 };
6219
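    // Count sign- and zero-extended operands; sinking is profitable when both
    // operands of the mul are extended the same way, as this allows an
    // smull/umull to be formed.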
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006220 int NumZExts = 0, NumSExts = 0;
6221 for (auto &Op : I->operands()) {
6222 // Make sure we are not already sinking this operand
6223 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6224 continue;
6225
Hari Limaye8bc95512024-12-06 12:45:18 +00006226 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6227 auto *Ext = cast<Instruction>(Op);
6228 auto *ExtOp = Ext->getOperand(0);
6229 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6230 Ops.push_back(&Ext->getOperandUse(0));
6231 Ops.push_back(&Op);
6232
6233 if (isa<SExtInst>(Ext))
6234 NumSExts++;
6235 else
6236 NumZExts++;
6237
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006238 continue;
6239 }
6240
6241 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
Hari Limaye8bc95512024-12-06 12:45:18 +00006242 if (!Shuffle)
6243 continue;
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006244
6245 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6246 // operand and the s/zext can help create indexed s/umull. This is
6247 // especially useful to prevent an i64 mul from being scalarized.
Hari Limaye8bc95512024-12-06 12:45:18 +00006248 if (isSplatShuffle(Shuffle) &&
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006249 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6250 Ops.push_back(&Shuffle->getOperandUse(0));
6251 Ops.push_back(&Op);
6252 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6253 NumSExts++;
6254 else
6255 NumZExts++;
6256 continue;
6257 }
6258
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006259 Value *ShuffleOperand = Shuffle->getOperand(0);
6260 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6261 if (!Insert)
6262 continue;
6263
6264 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6265 if (!OperandInstr)
6266 continue;
6267
6268 ConstantInt *ElementConstant =
6269 dyn_cast<ConstantInt>(Insert->getOperand(2));
6270 // Check that the insertelement is inserting into element 0
6271 if (!ElementConstant || !ElementConstant->isZero())
6272 continue;
6273
6274 unsigned Opcode = OperandInstr->getOpcode();
6275 if (Opcode == Instruction::SExt)
6276 NumSExts++;
6277 else if (Opcode == Instruction::ZExt)
6278 NumZExts++;
6279 else {
6280 // If we find that the top bits are known 0, then we can sink and allow
6281 // the backend to generate a umull.
6282 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6283 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6284 const DataLayout &DL = I->getDataLayout();
6285 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6286 continue;
6287 NumZExts++;
6288 }
6289
David Green5a069ea2025-01-10 11:54:46 +00006290 // And(Load) is excluded to prevent CGP from getting stuck in a loop of
6291 // sinking the And, just to hoist it back to the load again.
6292 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6293 Ops.push_back(&Insert->getOperandUse(1));
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006294 Ops.push_back(&Shuffle->getOperandUse(0));
6295 Ops.push_back(&Op);
6296 }
6297
Hari Limaye8bc95512024-12-06 12:45:18 +00006298 // It is profitable to sink if we found two of the same type of extends.
6299 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6300 return true;
6301
6302 // Otherwise, see if we should sink splats for indexed variants.
6303 if (!ShouldSinkSplatForIndexedVariant(I))
6304 return false;
6305
6306 Ops.clear();
6307 if (isSplatShuffle(I->getOperand(0)))
6308 Ops.push_back(&I->getOperandUse(0));
6309 if (isSplatShuffle(I->getOperand(1)))
6310 Ops.push_back(&I->getOperandUse(1));
6311
6312 return !Ops.empty();
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006313 }
Hari Limaye4f0403f2024-11-19 12:59:22 +00006314 case Instruction::FMul: {
6315 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6316 if (I->getType()->isScalableTy())
6317 return false;
6318
6319 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6320 !ST->hasFullFP16())
6321 return false;
6322
6323 // Sink splats for index lane variants
6324 if (isSplatShuffle(I->getOperand(0)))
6325 Ops.push_back(&I->getOperandUse(0));
6326 if (isSplatShuffle(I->getOperand(1)))
6327 Ops.push_back(&I->getOperandUse(1));
6328 return !Ops.empty();
6329 }
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006330 default:
6331 return false;
6332 }
6333 return false;
6334}