//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

static cl::opt<unsigned> DMBLookaheadThreshold(
    "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
    cl::desc("The number of instructions to search for a redundant dmb"));

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

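// Returns true if the call targets one of the SME ABI support routines
// (e.g. __arm_tpidr2_save) that manage streaming-mode and ZA state.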
static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

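// Compute a priority mask from the function's feature string ("fmv-features"
// for multiversioned functions, otherwise "target-features"), as used when
// ordering function multiversioning candidates.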
uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
  StringRef AttributeStr =
      isMultiversionedFunction(F) ? "fmv-features" : "target-features";
  StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
  SmallVector<StringRef, 8> Features;
  FeatureStr.split(Features, ",");
  return AArch64::getFMVPriority(Features);
}

bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
  return F.hasFnAttribute("fmv-features");
}

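// Target features whose bits are inverted before the inline compatibility
// check in areInlineCompatible(). These features act as restrictions rather
// than capabilities: an execute-only callee can be inlined into a caller
// without execute-only, but not the other way around.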
const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
    AArch64::FeatureExecuteOnly,
};

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
      CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();
  // Adjust the feature bitsets by inverting some of the bits. This is needed
  // for target features that represent restrictions rather than capabilities,
  // for example a "+execute-only" callee can be inlined into a caller without
  // "+execute-only", but not vice versa.
  FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
  FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;

  return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //     call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function)
  //
  // (2) F:
  //     call from F -> G (the call here is not Call)
  //  G:
  //     call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is benefit to do the streaming-mode
  // change only once and avoid inlining of G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost
AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

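// Calculate the cost of materializing the immediate Imm when it appears as
// operand Idx of an instruction with the given opcode, returning TCC_Free for
// operand positions where constant hoisting should leave the immediate alone.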
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

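// An "unpacked" SVE vector type is a scalable vector whose known minimum size
// is smaller than a full 128-bit SVE vector register.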
static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}

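// Estimate the cost of lowering an llvm.experimental.vector.histogram.add
// intrinsic via the SVE2 HISTCNT instruction, returning an invalid cost when
// the element type or vector type cannot be handled this way.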
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements
  unsigned TotalHistCnts = 1;

  unsigned EltSize = EltTy->getScalarSizeInBits();
  // Only allow (up to 64b) integers or pointers
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
    return InstructionCost::getInvalid();

  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
    unsigned EC = VTy->getElementCount().getKnownMinValue();
    if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
      return InstructionCost::getInvalid();

    // HistCnt only supports 32b and 64b element types
    unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;

    if (EC == 2 || (LegalEltSize == 32 && EC == 4))
      return InstructionCost(BaseHistCntCost);

    unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
    TotalHistCnts = EC / NaturalVectorWidth;
  }

  return InstructionCost(BaseHistCntCost * TotalHistCnts);
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 will be
      // converted to, plus 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
| 721 | // 32-bit or 64-bit ctpop without NEON is 12 instructions. |
| 722 | return getTypeLegalizationCost(RetTy).first * 12; |
| 723 | } |
Rosie Sumpter | d7c219a | 2021-06-09 10:00:16 +0100 | [diff] [blame] | 724 | static const CostTblEntry CtpopCostTbl[] = { |
| 725 | {ISD::CTPOP, MVT::v2i64, 4}, |
| 726 | {ISD::CTPOP, MVT::v4i32, 3}, |
| 727 | {ISD::CTPOP, MVT::v8i16, 2}, |
| 728 | {ISD::CTPOP, MVT::v16i8, 1}, |
| 729 | {ISD::CTPOP, MVT::i64, 4}, |
| 730 | {ISD::CTPOP, MVT::v2i32, 3}, |
| 731 | {ISD::CTPOP, MVT::v4i16, 2}, |
| 732 | {ISD::CTPOP, MVT::v8i8, 1}, |
| 733 | {ISD::CTPOP, MVT::i32, 5}, |
| 734 | }; |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 735 | auto LT = getTypeLegalizationCost(RetTy); |
Rosie Sumpter | d7c219a | 2021-06-09 10:00:16 +0100 | [diff] [blame] | 736 | MVT MTy = LT.second; |
| 737 | if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { |
| 738 | // Extra cost of +1 when illegal vector types are legalized by promoting |
| 739 | // the integer type. |
| 740 | int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != |
| 741 | RetTy->getScalarSizeInBits() |
| 742 | ? 1 |
| 743 | : 0; |
| 744 | return LT.first * Entry->Cost + ExtraCost; |
| 745 | } |
| 746 | break; |
| 747 | } |
David Green | bc615e4 | 2022-01-07 16:20:23 +0000 | [diff] [blame] | 748 | case Intrinsic::sadd_with_overflow: |
| 749 | case Intrinsic::uadd_with_overflow: |
| 750 | case Intrinsic::ssub_with_overflow: |
| 751 | case Intrinsic::usub_with_overflow: |
David Green | c65270c | 2022-01-06 17:22:47 +0000 | [diff] [blame] | 752 | case Intrinsic::smul_with_overflow: |
| 753 | case Intrinsic::umul_with_overflow: { |
| 754 | static const CostTblEntry WithOverflowCostTbl[] = { |
David Green | bc615e4 | 2022-01-07 16:20:23 +0000 | [diff] [blame] | 755 | {Intrinsic::sadd_with_overflow, MVT::i8, 3}, |
| 756 | {Intrinsic::uadd_with_overflow, MVT::i8, 3}, |
| 757 | {Intrinsic::sadd_with_overflow, MVT::i16, 3}, |
| 758 | {Intrinsic::uadd_with_overflow, MVT::i16, 3}, |
| 759 | {Intrinsic::sadd_with_overflow, MVT::i32, 1}, |
| 760 | {Intrinsic::uadd_with_overflow, MVT::i32, 1}, |
| 761 | {Intrinsic::sadd_with_overflow, MVT::i64, 1}, |
| 762 | {Intrinsic::uadd_with_overflow, MVT::i64, 1}, |
| 763 | {Intrinsic::ssub_with_overflow, MVT::i8, 3}, |
| 764 | {Intrinsic::usub_with_overflow, MVT::i8, 3}, |
| 765 | {Intrinsic::ssub_with_overflow, MVT::i16, 3}, |
| 766 | {Intrinsic::usub_with_overflow, MVT::i16, 3}, |
| 767 | {Intrinsic::ssub_with_overflow, MVT::i32, 1}, |
| 768 | {Intrinsic::usub_with_overflow, MVT::i32, 1}, |
| 769 | {Intrinsic::ssub_with_overflow, MVT::i64, 1}, |
| 770 | {Intrinsic::usub_with_overflow, MVT::i64, 1}, |
David Green | c65270c | 2022-01-06 17:22:47 +0000 | [diff] [blame] | 771 | {Intrinsic::smul_with_overflow, MVT::i8, 5}, |
| 772 | {Intrinsic::umul_with_overflow, MVT::i8, 4}, |
| 773 | {Intrinsic::smul_with_overflow, MVT::i16, 5}, |
| 774 | {Intrinsic::umul_with_overflow, MVT::i16, 4}, |
| 775 | {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst |
| 776 | {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw |
| 777 | {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp |
| 778 | {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr |
| 779 | }; |
| 780 | EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); |
| 781 | if (MTy.isSimple()) |
| 782 | if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), |
| 783 | MTy.getSimpleVT())) |
| 784 | return Entry->Cost; |
| 785 | break; |
| 786 | } |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 787 | case Intrinsic::fptosi_sat: |
| 788 | case Intrinsic::fptoui_sat: { |
| 789 | if (ICA.getArgTypes().empty()) |
| 790 | break; |
| 791 | bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 792 | auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 793 | EVT MTy = TLI->getValueType(DL, RetTy); |
| 794 | // Check for the legal types, which are where the size of the input and the |
| 795 | // output are the same, or we are using cvt f64->i32 or f32->i64. |
| 796 | if ((LT.second == MVT::f32 || LT.second == MVT::f64 || |
| 797 | LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || |
David Green | 6907ab4 | 2024-07-28 10:47:40 +0100 | [diff] [blame] | 798 | LT.second == MVT::v2f64)) { |
| 799 | if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || |
| 800 | (LT.second == MVT::f64 && MTy == MVT::i32) || |
| 801 | (LT.second == MVT::f32 && MTy == MVT::i64))) |
| 802 | return LT.first; |
| 803 | // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2 |
| 804 | if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() && |
| 805 | MTy.getScalarSizeInBits() == 64) |
| 806 | return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2); |
| 807 | } |
| 808 | // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to |
| 809 | // f32. |
| 810 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
| 811 | return LT.first + getIntrinsicInstrCost( |
| 812 | {ICA.getID(), |
| 813 | RetTy, |
| 814 | {ICA.getArgTypes()[0]->getWithNewType( |
| 815 | Type::getFloatTy(RetTy->getContext()))}}, |
| 816 | CostKind); |
| 817 | if ((LT.second == MVT::f16 && MTy == MVT::i32) || |
| 818 | (LT.second == MVT::f16 && MTy == MVT::i64) || |
| 819 | ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && |
| 820 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))) |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 821 | return LT.first; |
David Green | 6907ab4 | 2024-07-28 10:47:40 +0100 | [diff] [blame] | 822 | // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2 |
| 823 | if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && |
| 824 | MTy.getScalarSizeInBits() == 32) |
| 825 | return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2); |
| 826 | // Extending vector types v8f16->v8i32. These current scalarize but the |
| 827 | // codegen could be better. |
| 828 | if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && |
| 829 | MTy.getScalarSizeInBits() == 64) |
| 830 | return MTy.getVectorNumElements() * 3; |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 831 | |
David Green | 6907ab4 | 2024-07-28 10:47:40 +0100 | [diff] [blame] | 832 | // If we can we use a legal convert followed by a min+max |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 833 | if ((LT.second.getScalarType() == MVT::f32 || |
| 834 | LT.second.getScalarType() == MVT::f64 || |
David Green | 6907ab4 | 2024-07-28 10:47:40 +0100 | [diff] [blame] | 835 | LT.second.getScalarType() == MVT::f16) && |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 836 | LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { |
| 837 | Type *LegalTy = |
| 838 | Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); |
| 839 | if (LT.second.isVector()) |
| 840 | LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); |
| 841 | InstructionCost Cost = 1; |
| 842 | IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, |
| 843 | LegalTy, {LegalTy, LegalTy}); |
| 844 | Cost += getIntrinsicInstrCost(Attrs1, CostKind); |
| 845 | IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, |
| 846 | LegalTy, {LegalTy, LegalTy}); |
| 847 | Cost += getIntrinsicInstrCost(Attrs2, CostKind); |
David Green | 6907ab4 | 2024-07-28 10:47:40 +0100 | [diff] [blame] | 848 | return LT.first * Cost + |
| 849 | ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0 |
| 850 | : 1); |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 851 | } |
David Green | 6907ab4 | 2024-07-28 10:47:40 +0100 | [diff] [blame] | 852 | // Otherwise we need to follow the default expansion that clamps the value |
| 853 | // using a float min/max with a fcmp+sel for nan handling when signed. |
| 854 | Type *FPTy = ICA.getArgTypes()[0]->getScalarType(); |
| 855 | RetTy = RetTy->getScalarType(); |
| 856 | if (LT.second.isVector()) { |
| 857 | FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount()); |
| 858 | RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount()); |
| 859 | } |
| 860 | IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy}); |
| 861 | InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind); |
| 862 | IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy}); |
| 863 | Cost += getIntrinsicInstrCost(Attrs2, CostKind); |
| 864 | Cost += |
| 865 | getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI, |
| 866 | RetTy, FPTy, TTI::CastContextHint::None, CostKind); |
| 867 | if (IsSigned) { |
| 868 | Type *CondTy = RetTy->getWithNewBitWidth(1); |
| 869 | Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy, |
| 870 | CmpInst::FCMP_UNO, CostKind); |
| 871 | Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
| 872 | CmpInst::FCMP_UNO, CostKind); |
| 873 | } |
| 874 | return LT.first * Cost; |
David Green | 2dcb2d8 | 2022-05-02 11:36:05 +0100 | [diff] [blame] | 875 | } |
Zain Jaffal | 3d3d8fe | 2023-04-20 18:20:01 +0100 | [diff] [blame] | 876 | case Intrinsic::fshl: |
| 877 | case Intrinsic::fshr: { |
| 878 | if (ICA.getArgs().empty()) |
| 879 | break; |
| 880 | |
| 881 | // TODO: Add handling for fshl where third argument is not a constant. |
| 882 | const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]); |
| 883 | if (!OpInfoZ.isConstant()) |
| 884 | break; |
| 885 | |
| 886 | const auto LegalisationCost = getTypeLegalizationCost(RetTy); |
| 887 | if (OpInfoZ.isUniform()) { |
Zain Jaffal | 3d3d8fe | 2023-04-20 18:20:01 +0100 | [diff] [blame] | 888 | static const CostTblEntry FshlTbl[] = { |
David Green | e44e24d | 2025-03-09 18:01:45 +0000 | [diff] [blame] | 889 | {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra |
| 890 | {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2}, |
| 891 | {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2}, |
| 892 | {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}}; |
Zain Jaffal | 3d3d8fe | 2023-04-20 18:20:01 +0100 | [diff] [blame] | 893 | // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl |
| 894 | // to avoid having to duplicate the costs. |
| 895 | const auto *Entry = |
| 896 | CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second); |
| 897 | if (Entry) |
| 898 | return LegalisationCost.first * Entry->Cost; |
| 899 | } |
| 900 | |
| 901 | auto TyL = getTypeLegalizationCost(RetTy); |
| 902 | if (!RetTy->isIntegerTy()) |
| 903 | break; |
| 904 | |
| 905 | // Estimate cost manually, as types like i8 and i16 will get promoted to |
| 906 | // i32 and CostTableLookup will ignore the extra conversion cost. |
| 907 | bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && |
| 908 | RetTy->getScalarSizeInBits() < 64) || |
| 909 | (RetTy->getScalarSizeInBits() % 64 != 0); |
| 910 | unsigned ExtraCost = HigherCost ? 1 : 0; |
| 911 | if (RetTy->getScalarSizeInBits() == 32 || |
| 912 | RetTy->getScalarSizeInBits() == 64) |
| 913 | ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
| 914 | // extr instruction. |
| 915 | else if (HigherCost) |
| 916 | ExtraCost = 1; |
| 917 | else |
| 918 | break; |
| 919 | return TyL.first + ExtraCost; |
| 920 | } |
David Sherwood | 96b2e35 | 2024-04-24 14:31:06 +0100 | [diff] [blame] | 921 | case Intrinsic::get_active_lane_mask: { |
| 922 | auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType()); |
| 923 | if (RetTy) { |
| 924 | EVT RetVT = getTLI()->getValueType(DL, RetTy); |
| 925 | EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); |
| 926 | if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && |
| 927 | !getTLI()->isTypeLegal(RetVT)) { |
| 928 | // We don't have enough context at this point to determine if the mask |
| 929 | // is going to be kept live after the block, which will force the vXi1 |
| 930 | // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. |
| 931 | // For now, we just assume the vectorizer created this intrinsic and |
| 932 | // the result will be the input for a PHI. In this case the cost will |
| 933 | // be extremely high for fixed-width vectors. |
| 934 | // NOTE: getScalarizationOverhead returns a cost that's far too |
| 935 | // pessimistic for the actual generated codegen. In reality there are |
| 936 | // two instructions generated per lane. |
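| | // For example, a fixed-width <8 x i1> result is costed here as 8 * 2 = 16.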
| 937 | return RetTy->getNumElements() * 2; |
| 938 | } |
| 939 | } |
| 940 | break; |
| 941 | } |
Ricardo Jesus | 2fe30bc | 2024-12-11 07:51:11 +0000 | [diff] [blame] | 942 | case Intrinsic::experimental_vector_match: { |
| 943 | auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]); |
| 944 | EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); |
| 945 | unsigned SearchSize = NeedleTy->getNumElements(); |
| 946 | if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) { |
| 947 | // Base cost for MATCH instructions. At least on the Neoverse V2 and |
| 948 | // Neoverse V3, these are cheap operations with the same latency as a |
| 949 | // vector ADD. In most cases, however, we also need to do an extra DUP. |
| 950 | // For fixed-length vectors we currently need an extra five to six
| 951 | // instructions besides the MATCH. |
| 952 | InstructionCost Cost = 4; |
| 953 | if (isa<FixedVectorType>(RetTy)) |
| 954 | Cost += 10; |
| 955 | return Cost; |
| 956 | } |
| 957 | break; |
| 958 | } |
David Sherwood | de5d588 | 2025-02-04 09:41:53 +0000 | [diff] [blame] | 959 | case Intrinsic::experimental_cttz_elts: { |
| 960 | EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); |
| 961 | if (!getTLI()->shouldExpandCttzElements(ArgVT)) { |
| 962 | // This will consist of an SVE brkb and a cntp instruction. These
| 963 | // typically have the same latency and half the throughput as a vector |
| 964 | // add instruction. |
| 965 | return 4; |
| 966 | } |
| 967 | break; |
| 968 | } |
Florian Hahn | 0fcc6f7 | 2020-10-23 09:00:20 +0100 | [diff] [blame] | 969 | default: |
| 970 | break; |
| 971 | } |
| 972 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
| 973 | } |
| 974 | |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 975 | /// The function will remove redundant reinterpret casts in the presence
| 976 | /// of control flow.
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 977 | static std::optional<Instruction *> processPhiNode(InstCombiner &IC, |
| 978 | IntrinsicInst &II) { |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 979 | SmallVector<Instruction *, 32> Worklist; |
| 980 | auto RequiredType = II.getType(); |
| 981 | |
| 982 | auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); |
| 983 | assert(PN && "Expected Phi Node!"); |
| 984 | |
| 985 | // Don't create a new Phi unless we can remove the old one. |
| 986 | if (!PN->hasOneUse()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 987 | return std::nullopt; |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 988 | |
| 989 | for (Value *IncValPhi : PN->incoming_values()) { |
| 990 | auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); |
| 991 | if (!Reinterpret || |
| 992 | Reinterpret->getIntrinsicID() != |
| 993 | Intrinsic::aarch64_sve_convert_to_svbool || |
| 994 | RequiredType != Reinterpret->getArgOperand(0)->getType()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 995 | return std::nullopt; |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 996 | } |
| 997 | |
| 998 | // Create the new Phi |
Nikita Popov | f9f8517 | 2023-06-16 14:58:33 +0200 | [diff] [blame] | 999 | IC.Builder.SetInsertPoint(PN); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1000 | PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 1001 | Worklist.push_back(PN); |
| 1002 | |
| 1003 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { |
| 1004 | auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); |
| 1005 | NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); |
| 1006 | Worklist.push_back(Reinterpret); |
| 1007 | } |
| 1008 | |
| 1009 | // Clean up the Phi Node and reinterprets
| 1010 | return IC.replaceInstUsesWith(II, NPN); |
| 1011 | } |
| 1012 | |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1013 | // A collection of properties common to SVE intrinsics that allow for combines |
| 1014 | // to be written without needing to know the specific intrinsic. |
| 1015 | struct SVEIntrinsicInfo { |
| 1016 | // |
| 1017 | // Helper routines for common intrinsic definitions. |
| 1018 | // |
| 1019 | |
| 1020 | // e.g. llvm.aarch64.sve.add pg, op1, op2 |
| 1021 | // with IID ==> llvm.aarch64.sve.add_u |
| 1022 | static SVEIntrinsicInfo |
| 1023 | defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) { |
| 1024 | return SVEIntrinsicInfo() |
| 1025 | .setGoverningPredicateOperandIdx(0) |
| 1026 | .setOperandIdxInactiveLanesTakenFrom(1) |
| 1027 | .setMatchingUndefIntrinsic(IID); |
| 1028 | } |
| 1029 | |
| 1030 | // e.g. llvm.aarch64.sve.neg inactive, pg, op |
| 1031 | static SVEIntrinsicInfo defaultMergingUnaryOp() { |
| 1032 | return SVEIntrinsicInfo() |
| 1033 | .setGoverningPredicateOperandIdx(1) |
| 1034 | .setOperandIdxInactiveLanesTakenFrom(0) |
| 1035 | .setOperandIdxWithNoActiveLanes(0); |
| 1036 | } |
| 1037 | |
| 1038 | // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op |
| 1039 | static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() { |
| 1040 | return SVEIntrinsicInfo() |
| 1041 | .setGoverningPredicateOperandIdx(1) |
| 1042 | .setOperandIdxInactiveLanesTakenFrom(0); |
| 1043 | } |
| 1044 | |
| 1045 | // e.g. llvm.aarch64.sve.add_u pg, op1, op2 |
| 1046 | static SVEIntrinsicInfo defaultUndefOp() { |
| 1047 | return SVEIntrinsicInfo() |
| 1048 | .setGoverningPredicateOperandIdx(0) |
| 1049 | .setInactiveLanesAreNotDefined(); |
| 1050 | } |
| 1051 | |
| 1052 | // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0) |
| 1053 | // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1) |
| 1054 | static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) { |
| 1055 | return SVEIntrinsicInfo() |
| 1056 | .setGoverningPredicateOperandIdx(GPIndex) |
| 1057 | .setInactiveLanesAreUnused(); |
| 1058 | } |
| 1059 | |
| 1060 | // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2 |
| 1061 | // llvm.aarch64.sve.ld1 pg, ptr |
| 1062 | static SVEIntrinsicInfo defaultZeroingOp() { |
| 1063 | return SVEIntrinsicInfo() |
| 1064 | .setGoverningPredicateOperandIdx(0) |
| 1065 | .setInactiveLanesAreUnused() |
| 1066 | .setResultIsZeroInitialized(); |
| 1067 | } |
| 1068 | |
| 1069 | // All properties relate to predication and thus having a governing predicate
| 1070 | // is the minimum requirement to say there is intrinsic info to act on. |
| 1071 | explicit operator bool() const { return hasGoverningPredicate(); } |
| 1072 | |
| 1073 | // |
| 1074 | // Properties relating to the governing predicate. |
| 1075 | // |
| 1076 | |
| 1077 | bool hasGoverningPredicate() const { |
| 1078 | return GoverningPredicateIdx != std::numeric_limits<unsigned>::max(); |
| 1079 | } |
| 1080 | |
| 1081 | unsigned getGoverningPredicateOperandIdx() const { |
| 1082 | assert(hasGoverningPredicate() && "Property not set!");
| 1083 | return GoverningPredicateIdx; |
| 1084 | } |
| 1085 | |
| 1086 | SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) { |
| 1087 | assert(!hasGoverningPredicate() && "Cannot set property twice!"); |
| 1088 | GoverningPredicateIdx = Index; |
| 1089 | return *this; |
| 1090 | } |
| 1091 | |
| 1092 | // |
| 1093 | // Properties relating to operations the intrinsic could be transformed into. |
| 1094 | // NOTE: This does not mean such a transformation is always possible, but the |
| 1095 | // knowledge makes it possible to reuse existing optimisations without needing |
| 1096 | // to embed specific handling for each intrinsic. For example, instruction |
| 1097 | // simplification can be used to optimise an intrinsic's active lanes. |
| 1098 | // |
| 1099 | |
| 1100 | bool hasMatchingUndefIntrinsic() const { |
| 1101 | return UndefIntrinsic != Intrinsic::not_intrinsic; |
| 1102 | } |
| 1103 | |
| 1104 | Intrinsic::ID getMatchingUndefIntrinsic() const { |
| 1105 | assert(hasMatchingUndefIntrinsic() && "Property not set!");
| 1106 | return UndefIntrinsic; |
| 1107 | } |
| 1108 | |
| 1109 | SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) { |
| 1110 | assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!"); |
| 1111 | UndefIntrinsic = IID; |
| 1112 | return *this; |
| 1113 | } |
| 1114 | |
Paul Walker | 1997073 | 2025-04-08 11:38:27 +0100 | [diff] [blame] | 1115 | bool hasMatchingIROpode() const { return IROpcode != 0; } |
| 1116 | |
| 1117 | unsigned getMatchingIROpode() const { |
| 1118 | assert(hasMatchingIROpode() && "Property not set!");
| 1119 | return IROpcode; |
| 1120 | } |
| 1121 | |
| 1122 | SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) { |
| 1123 | assert(!hasMatchingIROpode() && "Cannot set property twice!"); |
| 1124 | IROpcode = Opcode; |
| 1125 | return *this; |
| 1126 | } |
| 1127 | |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1128 | // |
| 1129 | // Properties relating to the result of inactive lanes. |
| 1130 | // |
| 1131 | |
| 1132 | bool inactiveLanesTakenFromOperand() const { |
| 1133 | return ResultLanes == InactiveLanesTakenFromOperand; |
| 1134 | } |
| 1135 | |
| 1136 | unsigned getOperandIdxInactiveLanesTakenFrom() const { |
| 1137 | assert(inactiveLanesTakenFromOperand() && "Property not set!");
| 1138 | return OperandIdxForInactiveLanes; |
| 1139 | } |
| 1140 | |
| 1141 | SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) { |
| 1142 | assert(ResultLanes == Uninitialized && "Cannot set property twice!"); |
| 1143 | ResultLanes = InactiveLanesTakenFromOperand; |
| 1144 | OperandIdxForInactiveLanes = Index; |
| 1145 | return *this; |
| 1146 | } |
| 1147 | |
| 1148 | bool inactiveLanesAreNotDefined() const { |
| 1149 | return ResultLanes == InactiveLanesAreNotDefined; |
| 1150 | } |
| 1151 | |
| 1152 | SVEIntrinsicInfo &setInactiveLanesAreNotDefined() { |
| 1153 | assert(ResultLanes == Uninitialized && "Cannot set property twice!"); |
| 1154 | ResultLanes = InactiveLanesAreNotDefined; |
| 1155 | return *this; |
| 1156 | } |
| 1157 | |
| 1158 | bool inactiveLanesAreUnused() const { |
| 1159 | return ResultLanes == InactiveLanesAreUnused; |
| 1160 | } |
| 1161 | |
| 1162 | SVEIntrinsicInfo &setInactiveLanesAreUnused() { |
| 1163 | assert(ResultLanes == Uninitialized && "Cannot set property twice!"); |
| 1164 | ResultLanes = InactiveLanesAreUnused; |
| 1165 | return *this; |
| 1166 | } |
| 1167 | |
| 1168 | // NOTE: Whilst not limited to only inactive lanes, the common use case is: |
| 1169 | // inactiveLanesAreZeroed =
| 1170 | // resultIsZeroInitialized() && inactiveLanesAreUnused() |
| 1171 | bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; } |
| 1172 | |
| 1173 | SVEIntrinsicInfo &setResultIsZeroInitialized() { |
| 1174 | ResultIsZeroInitialized = true; |
| 1175 | return *this; |
| 1176 | } |
| 1177 | |
| 1178 | // |
| 1179 | // The first operand of unary merging operations is typically only used to |
| 1180 | // set the result for inactive lanes. Knowing this allows us to deadcode the |
| 1181 | // operand when we can prove there are no inactive lanes. |
| 1182 | // |
| 1183 | |
| 1184 | bool hasOperandWithNoActiveLanes() const { |
| 1185 | return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max(); |
| 1186 | } |
| 1187 | |
| 1188 | unsigned getOperandIdxWithNoActiveLanes() const { |
| 1189 | assert(hasOperandWithNoActiveLanes() && "Property not set!");
| 1190 | return OperandIdxWithNoActiveLanes; |
| 1191 | } |
| 1192 | |
| 1193 | SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) { |
| 1194 | assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!"); |
| 1195 | OperandIdxWithNoActiveLanes = Index; |
| 1196 | return *this; |
| 1197 | } |
| 1198 | |
| 1199 | private: |
| 1200 | unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max(); |
| 1201 | |
| 1202 | Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic; |
Paul Walker | 1997073 | 2025-04-08 11:38:27 +0100 | [diff] [blame] | 1203 | unsigned IROpcode = 0; |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1204 | |
| 1205 | enum PredicationStyle { |
| 1206 | Uninitialized, |
| 1207 | InactiveLanesTakenFromOperand, |
| 1208 | InactiveLanesAreNotDefined, |
| 1209 | InactiveLanesAreUnused |
| 1210 | } ResultLanes = Uninitialized; |
| 1211 | |
| 1212 | bool ResultIsZeroInitialized = false; |
| 1213 | unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max(); |
| 1214 | unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max(); |
| 1215 | }; |
| 1216 | |
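| | // constructSVEIntrinsicInfo() below composes the helpers above. For example,
| | // a merging add with a known IR equivalent is described as:
| | //   SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
| | //       .setMatchingIROpcode(Instruction::Add);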
| 1217 | static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { |
| 1218 | // Some SVE intrinsics do not use scalable vector types; such intrinsics are
| 1219 | // not relevant from an SVEIntrinsicInfo perspective and are ignored here.
| 1220 | if (!isa<ScalableVectorType>(II.getType()) && |
| 1221 | all_of(II.args(), [&](const Value *V) { |
| 1222 | return !isa<ScalableVectorType>(V->getType()); |
| 1223 | })) |
| 1224 | return SVEIntrinsicInfo(); |
| 1225 | |
| 1226 | Intrinsic::ID IID = II.getIntrinsicID(); |
| 1227 | switch (IID) { |
| 1228 | default: |
| 1229 | break; |
| 1230 | case Intrinsic::aarch64_sve_fcvt_bf16f32_v2: |
| 1231 | case Intrinsic::aarch64_sve_fcvt_f16f32: |
| 1232 | case Intrinsic::aarch64_sve_fcvt_f16f64: |
| 1233 | case Intrinsic::aarch64_sve_fcvt_f32f16: |
| 1234 | case Intrinsic::aarch64_sve_fcvt_f32f64: |
| 1235 | case Intrinsic::aarch64_sve_fcvt_f64f16: |
| 1236 | case Intrinsic::aarch64_sve_fcvt_f64f32: |
| 1237 | case Intrinsic::aarch64_sve_fcvtlt_f32f16: |
| 1238 | case Intrinsic::aarch64_sve_fcvtlt_f64f32: |
| 1239 | case Intrinsic::aarch64_sve_fcvtx_f32f64: |
| 1240 | case Intrinsic::aarch64_sve_fcvtzs: |
| 1241 | case Intrinsic::aarch64_sve_fcvtzs_i32f16: |
| 1242 | case Intrinsic::aarch64_sve_fcvtzs_i32f64: |
| 1243 | case Intrinsic::aarch64_sve_fcvtzs_i64f16: |
| 1244 | case Intrinsic::aarch64_sve_fcvtzs_i64f32: |
| 1245 | case Intrinsic::aarch64_sve_fcvtzu: |
| 1246 | case Intrinsic::aarch64_sve_fcvtzu_i32f16: |
| 1247 | case Intrinsic::aarch64_sve_fcvtzu_i32f64: |
| 1248 | case Intrinsic::aarch64_sve_fcvtzu_i64f16: |
| 1249 | case Intrinsic::aarch64_sve_fcvtzu_i64f32: |
| 1250 | case Intrinsic::aarch64_sve_scvtf: |
| 1251 | case Intrinsic::aarch64_sve_scvtf_f16i32: |
| 1252 | case Intrinsic::aarch64_sve_scvtf_f16i64: |
| 1253 | case Intrinsic::aarch64_sve_scvtf_f32i64: |
| 1254 | case Intrinsic::aarch64_sve_scvtf_f64i32: |
| 1255 | case Intrinsic::aarch64_sve_ucvtf: |
| 1256 | case Intrinsic::aarch64_sve_ucvtf_f16i32: |
| 1257 | case Intrinsic::aarch64_sve_ucvtf_f16i64: |
| 1258 | case Intrinsic::aarch64_sve_ucvtf_f32i64: |
| 1259 | case Intrinsic::aarch64_sve_ucvtf_f64i32: |
| 1260 | return SVEIntrinsicInfo::defaultMergingUnaryOp(); |
| 1261 | |
| 1262 | case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2: |
| 1263 | case Intrinsic::aarch64_sve_fcvtnt_f16f32: |
| 1264 | case Intrinsic::aarch64_sve_fcvtnt_f32f64: |
| 1265 | case Intrinsic::aarch64_sve_fcvtxnt_f32f64: |
| 1266 | return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp(); |
| 1267 | |
| 1268 | case Intrinsic::aarch64_sve_fabd: |
| 1269 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u); |
| 1270 | case Intrinsic::aarch64_sve_fadd: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1271 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u) |
| 1272 | .setMatchingIROpcode(Instruction::FAdd); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1273 | case Intrinsic::aarch64_sve_fdiv: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1274 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u) |
| 1275 | .setMatchingIROpcode(Instruction::FDiv); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1276 | case Intrinsic::aarch64_sve_fmax: |
| 1277 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u); |
| 1278 | case Intrinsic::aarch64_sve_fmaxnm: |
| 1279 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u); |
| 1280 | case Intrinsic::aarch64_sve_fmin: |
| 1281 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u); |
| 1282 | case Intrinsic::aarch64_sve_fminnm: |
| 1283 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u); |
| 1284 | case Intrinsic::aarch64_sve_fmla: |
| 1285 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u); |
| 1286 | case Intrinsic::aarch64_sve_fmls: |
| 1287 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u); |
| 1288 | case Intrinsic::aarch64_sve_fmul: |
Paul Walker | 1997073 | 2025-04-08 11:38:27 +0100 | [diff] [blame] | 1289 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u) |
| 1290 | .setMatchingIROpcode(Instruction::FMul); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1291 | case Intrinsic::aarch64_sve_fmulx: |
| 1292 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u); |
| 1293 | case Intrinsic::aarch64_sve_fnmla: |
| 1294 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u); |
| 1295 | case Intrinsic::aarch64_sve_fnmls: |
| 1296 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u); |
| 1297 | case Intrinsic::aarch64_sve_fsub: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1298 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u) |
| 1299 | .setMatchingIROpcode(Instruction::FSub); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1300 | case Intrinsic::aarch64_sve_add: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1301 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u) |
| 1302 | .setMatchingIROpcode(Instruction::Add); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1303 | case Intrinsic::aarch64_sve_mla: |
| 1304 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u); |
| 1305 | case Intrinsic::aarch64_sve_mls: |
| 1306 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u); |
| 1307 | case Intrinsic::aarch64_sve_mul: |
Paul Walker | 1997073 | 2025-04-08 11:38:27 +0100 | [diff] [blame] | 1308 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u) |
| 1309 | .setMatchingIROpcode(Instruction::Mul); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1310 | case Intrinsic::aarch64_sve_sabd: |
| 1311 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u); |
Paul Walker | 149d795 | 2025-05-01 13:20:05 +0100 | [diff] [blame] | 1312 | case Intrinsic::aarch64_sve_sdiv: |
| 1313 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u) |
| 1314 | .setMatchingIROpcode(Instruction::SDiv); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1315 | case Intrinsic::aarch64_sve_smax: |
| 1316 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u); |
| 1317 | case Intrinsic::aarch64_sve_smin: |
| 1318 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u); |
| 1319 | case Intrinsic::aarch64_sve_smulh: |
| 1320 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u); |
| 1321 | case Intrinsic::aarch64_sve_sub: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1322 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u) |
| 1323 | .setMatchingIROpcode(Instruction::Sub); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1324 | case Intrinsic::aarch64_sve_uabd: |
| 1325 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u); |
Paul Walker | 149d795 | 2025-05-01 13:20:05 +0100 | [diff] [blame] | 1326 | case Intrinsic::aarch64_sve_udiv: |
| 1327 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u) |
| 1328 | .setMatchingIROpcode(Instruction::UDiv); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1329 | case Intrinsic::aarch64_sve_umax: |
| 1330 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u); |
| 1331 | case Intrinsic::aarch64_sve_umin: |
| 1332 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u); |
| 1333 | case Intrinsic::aarch64_sve_umulh: |
| 1334 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u); |
| 1335 | case Intrinsic::aarch64_sve_asr: |
Paul Walker | 8dc89e3 | 2025-04-30 13:21:46 +0100 | [diff] [blame] | 1336 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u) |
| 1337 | .setMatchingIROpcode(Instruction::AShr); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1338 | case Intrinsic::aarch64_sve_lsl: |
Paul Walker | 8dc89e3 | 2025-04-30 13:21:46 +0100 | [diff] [blame] | 1339 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u) |
| 1340 | .setMatchingIROpcode(Instruction::Shl); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1341 | case Intrinsic::aarch64_sve_lsr: |
Paul Walker | 8dc89e3 | 2025-04-30 13:21:46 +0100 | [diff] [blame] | 1342 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u) |
| 1343 | .setMatchingIROpcode(Instruction::LShr); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1344 | case Intrinsic::aarch64_sve_and: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1345 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u) |
| 1346 | .setMatchingIROpcode(Instruction::And); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1347 | case Intrinsic::aarch64_sve_bic: |
| 1348 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u); |
| 1349 | case Intrinsic::aarch64_sve_eor: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1350 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u) |
| 1351 | .setMatchingIROpcode(Instruction::Xor); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1352 | case Intrinsic::aarch64_sve_orr: |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1353 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u) |
| 1354 | .setMatchingIROpcode(Instruction::Or); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1355 | case Intrinsic::aarch64_sve_sqsub: |
| 1356 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u); |
| 1357 | case Intrinsic::aarch64_sve_uqsub: |
| 1358 | return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u); |
| 1359 | |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1360 | case Intrinsic::aarch64_sve_add_u: |
| 1361 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1362 | Instruction::Add); |
| 1363 | case Intrinsic::aarch64_sve_and_u: |
| 1364 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1365 | Instruction::And); |
Paul Walker | 8dc89e3 | 2025-04-30 13:21:46 +0100 | [diff] [blame] | 1366 | case Intrinsic::aarch64_sve_asr_u: |
| 1367 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1368 | Instruction::AShr); |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1369 | case Intrinsic::aarch64_sve_eor_u: |
| 1370 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1371 | Instruction::Xor); |
| 1372 | case Intrinsic::aarch64_sve_fadd_u: |
| 1373 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1374 | Instruction::FAdd); |
| 1375 | case Intrinsic::aarch64_sve_fdiv_u: |
| 1376 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1377 | Instruction::FDiv); |
Paul Walker | 1997073 | 2025-04-08 11:38:27 +0100 | [diff] [blame] | 1378 | case Intrinsic::aarch64_sve_fmul_u: |
| 1379 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1380 | Instruction::FMul); |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1381 | case Intrinsic::aarch64_sve_fsub_u: |
| 1382 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1383 | Instruction::FSub); |
Paul Walker | 8dc89e3 | 2025-04-30 13:21:46 +0100 | [diff] [blame] | 1384 | case Intrinsic::aarch64_sve_lsl_u: |
| 1385 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1386 | Instruction::Shl); |
| 1387 | case Intrinsic::aarch64_sve_lsr_u: |
| 1388 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1389 | Instruction::LShr); |
Paul Walker | 1997073 | 2025-04-08 11:38:27 +0100 | [diff] [blame] | 1390 | case Intrinsic::aarch64_sve_mul_u: |
| 1391 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1392 | Instruction::Mul); |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1393 | case Intrinsic::aarch64_sve_orr_u: |
| 1394 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1395 | Instruction::Or); |
Paul Walker | 149d795 | 2025-05-01 13:20:05 +0100 | [diff] [blame] | 1396 | case Intrinsic::aarch64_sve_sdiv_u: |
| 1397 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1398 | Instruction::SDiv); |
Paul Walker | 96ec17d | 2025-04-25 11:30:03 +0100 | [diff] [blame] | 1399 | case Intrinsic::aarch64_sve_sub_u: |
| 1400 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1401 | Instruction::Sub); |
Paul Walker | 149d795 | 2025-05-01 13:20:05 +0100 | [diff] [blame] | 1402 | case Intrinsic::aarch64_sve_udiv_u: |
| 1403 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1404 | Instruction::UDiv); |
Paul Walker | 1997073 | 2025-04-08 11:38:27 +0100 | [diff] [blame] | 1405 | |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1406 | case Intrinsic::aarch64_sve_addqv: |
| 1407 | case Intrinsic::aarch64_sve_and_z: |
| 1408 | case Intrinsic::aarch64_sve_bic_z: |
| 1409 | case Intrinsic::aarch64_sve_brka_z: |
| 1410 | case Intrinsic::aarch64_sve_brkb_z: |
| 1411 | case Intrinsic::aarch64_sve_brkn_z: |
| 1412 | case Intrinsic::aarch64_sve_brkpa_z: |
| 1413 | case Intrinsic::aarch64_sve_brkpb_z: |
| 1414 | case Intrinsic::aarch64_sve_cntp: |
| 1415 | case Intrinsic::aarch64_sve_compact: |
| 1416 | case Intrinsic::aarch64_sve_eor_z: |
| 1417 | case Intrinsic::aarch64_sve_eorv: |
| 1418 | case Intrinsic::aarch64_sve_eorqv: |
| 1419 | case Intrinsic::aarch64_sve_nand_z: |
| 1420 | case Intrinsic::aarch64_sve_nor_z: |
| 1421 | case Intrinsic::aarch64_sve_orn_z: |
| 1422 | case Intrinsic::aarch64_sve_orr_z: |
| 1423 | case Intrinsic::aarch64_sve_orv: |
| 1424 | case Intrinsic::aarch64_sve_orqv: |
| 1425 | case Intrinsic::aarch64_sve_pnext: |
| 1426 | case Intrinsic::aarch64_sve_rdffr_z: |
| 1427 | case Intrinsic::aarch64_sve_saddv: |
| 1428 | case Intrinsic::aarch64_sve_uaddv: |
| 1429 | case Intrinsic::aarch64_sve_umaxv: |
| 1430 | case Intrinsic::aarch64_sve_umaxqv: |
| 1431 | case Intrinsic::aarch64_sve_cmpeq: |
| 1432 | case Intrinsic::aarch64_sve_cmpeq_wide: |
| 1433 | case Intrinsic::aarch64_sve_cmpge: |
| 1434 | case Intrinsic::aarch64_sve_cmpge_wide: |
| 1435 | case Intrinsic::aarch64_sve_cmpgt: |
| 1436 | case Intrinsic::aarch64_sve_cmpgt_wide: |
| 1437 | case Intrinsic::aarch64_sve_cmphi: |
| 1438 | case Intrinsic::aarch64_sve_cmphi_wide: |
| 1439 | case Intrinsic::aarch64_sve_cmphs: |
| 1440 | case Intrinsic::aarch64_sve_cmphs_wide: |
| 1441 | case Intrinsic::aarch64_sve_cmple_wide: |
| 1442 | case Intrinsic::aarch64_sve_cmplo_wide: |
| 1443 | case Intrinsic::aarch64_sve_cmpls_wide: |
| 1444 | case Intrinsic::aarch64_sve_cmplt_wide: |
| 1445 | case Intrinsic::aarch64_sve_cmpne: |
| 1446 | case Intrinsic::aarch64_sve_cmpne_wide: |
| 1447 | case Intrinsic::aarch64_sve_facge: |
| 1448 | case Intrinsic::aarch64_sve_facgt: |
| 1449 | case Intrinsic::aarch64_sve_fcmpeq: |
| 1450 | case Intrinsic::aarch64_sve_fcmpge: |
| 1451 | case Intrinsic::aarch64_sve_fcmpgt: |
| 1452 | case Intrinsic::aarch64_sve_fcmpne: |
| 1453 | case Intrinsic::aarch64_sve_fcmpuo: |
| 1454 | case Intrinsic::aarch64_sve_ld1: |
| 1455 | case Intrinsic::aarch64_sve_ld1_gather: |
| 1456 | case Intrinsic::aarch64_sve_ld1_gather_index: |
| 1457 | case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: |
| 1458 | case Intrinsic::aarch64_sve_ld1_gather_sxtw: |
| 1459 | case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: |
| 1460 | case Intrinsic::aarch64_sve_ld1_gather_uxtw: |
| 1461 | case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: |
| 1462 | case Intrinsic::aarch64_sve_ld1q_gather_index: |
| 1463 | case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: |
| 1464 | case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: |
| 1465 | case Intrinsic::aarch64_sve_ld1ro: |
| 1466 | case Intrinsic::aarch64_sve_ld1rq: |
| 1467 | case Intrinsic::aarch64_sve_ld1udq: |
| 1468 | case Intrinsic::aarch64_sve_ld1uwq: |
| 1469 | case Intrinsic::aarch64_sve_ld2_sret: |
| 1470 | case Intrinsic::aarch64_sve_ld2q_sret: |
| 1471 | case Intrinsic::aarch64_sve_ld3_sret: |
| 1472 | case Intrinsic::aarch64_sve_ld3q_sret: |
| 1473 | case Intrinsic::aarch64_sve_ld4_sret: |
| 1474 | case Intrinsic::aarch64_sve_ld4q_sret: |
| 1475 | case Intrinsic::aarch64_sve_ldff1: |
| 1476 | case Intrinsic::aarch64_sve_ldff1_gather: |
| 1477 | case Intrinsic::aarch64_sve_ldff1_gather_index: |
| 1478 | case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: |
| 1479 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw: |
| 1480 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: |
| 1481 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw: |
| 1482 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: |
| 1483 | case Intrinsic::aarch64_sve_ldnf1: |
| 1484 | case Intrinsic::aarch64_sve_ldnt1: |
| 1485 | case Intrinsic::aarch64_sve_ldnt1_gather: |
| 1486 | case Intrinsic::aarch64_sve_ldnt1_gather_index: |
| 1487 | case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: |
| 1488 | case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: |
| 1489 | return SVEIntrinsicInfo::defaultZeroingOp(); |
| 1490 | |
| 1491 | case Intrinsic::aarch64_sve_prf: |
| 1492 | case Intrinsic::aarch64_sve_prfb_gather_index: |
| 1493 | case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: |
| 1494 | case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: |
| 1495 | case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: |
| 1496 | case Intrinsic::aarch64_sve_prfd_gather_index: |
| 1497 | case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: |
| 1498 | case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: |
| 1499 | case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: |
| 1500 | case Intrinsic::aarch64_sve_prfh_gather_index: |
| 1501 | case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: |
| 1502 | case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: |
| 1503 | case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: |
| 1504 | case Intrinsic::aarch64_sve_prfw_gather_index: |
| 1505 | case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: |
| 1506 | case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: |
| 1507 | case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: |
| 1508 | return SVEIntrinsicInfo::defaultVoidOp(0); |
| 1509 | |
| 1510 | case Intrinsic::aarch64_sve_st1_scatter: |
| 1511 | case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: |
| 1512 | case Intrinsic::aarch64_sve_st1_scatter_sxtw: |
| 1513 | case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: |
| 1514 | case Intrinsic::aarch64_sve_st1_scatter_uxtw: |
| 1515 | case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: |
| 1516 | case Intrinsic::aarch64_sve_st1dq: |
| 1517 | case Intrinsic::aarch64_sve_st1q_scatter_index: |
| 1518 | case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: |
| 1519 | case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: |
| 1520 | case Intrinsic::aarch64_sve_st1wq: |
| 1521 | case Intrinsic::aarch64_sve_stnt1: |
| 1522 | case Intrinsic::aarch64_sve_stnt1_scatter: |
| 1523 | case Intrinsic::aarch64_sve_stnt1_scatter_index: |
| 1524 | case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: |
| 1525 | case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: |
| 1526 | return SVEIntrinsicInfo::defaultVoidOp(1); |
| 1527 | case Intrinsic::aarch64_sve_st2: |
| 1528 | case Intrinsic::aarch64_sve_st2q: |
| 1529 | return SVEIntrinsicInfo::defaultVoidOp(2); |
| 1530 | case Intrinsic::aarch64_sve_st3: |
| 1531 | case Intrinsic::aarch64_sve_st3q: |
| 1532 | return SVEIntrinsicInfo::defaultVoidOp(3); |
| 1533 | case Intrinsic::aarch64_sve_st4: |
| 1534 | case Intrinsic::aarch64_sve_st4q: |
| 1535 | return SVEIntrinsicInfo::defaultVoidOp(4); |
| 1536 | } |
| 1537 | |
| 1538 | return SVEIntrinsicInfo(); |
| 1539 | } |
| 1540 | |
| 1541 | static bool isAllActivePredicate(Value *Pred) { |
| 1542 | // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
| 1543 | Value *UncastedPred; |
| 1544 | if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( |
| 1545 | m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( |
| 1546 | m_Value(UncastedPred))))) |
| 1547 | // If the predicate has the same or fewer lanes than the uncasted
| 1548 | // predicate then we know the casting has no effect. |
| 1549 | if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= |
| 1550 | cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) |
| 1551 | Pred = UncastedPred; |
Matthew Devereau | 91a2056 | 2025-04-13 20:40:51 +0100 | [diff] [blame] | 1552 | auto *C = dyn_cast<Constant>(Pred); |
| 1553 | return (C && C->isAllOnesValue()); |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1554 | } |
| 1555 | |
Paul Walker | a7999f3 | 2025-04-17 15:58:39 +0100 | [diff] [blame] | 1556 | // Simplify `V` by only considering the operations that affect active lanes. |
| 1557 | // This function should only return existing Values or newly created Constants. |
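| | // For example (names are illustrative), given
| | //   %v = @llvm.aarch64.sve.dup(%inactive, %pg, 2.0)
| | // stripInactiveLanes(%v, %pg) returns a constant splat of 2.0, because only
| | // the lanes governed by %pg matter to the caller.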
| 1558 | static Value *stripInactiveLanes(Value *V, const Value *Pg) { |
| 1559 | auto *Dup = dyn_cast<IntrinsicInst>(V); |
| 1560 | if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup && |
| 1561 | Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2))) |
| 1562 | return ConstantVector::getSplat( |
| 1563 | cast<VectorType>(V->getType())->getElementCount(), |
| 1564 | cast<Constant>(Dup->getOperand(2))); |
| 1565 | |
| 1566 | return V; |
| 1567 | } |
| 1568 | |
| 1569 | static std::optional<Instruction *> |
| 1570 | simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, |
| 1571 | const SVEIntrinsicInfo &IInfo) { |
| 1572 | const unsigned Opc = IInfo.getMatchingIROpode(); |
| 1573 | assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!"); |
| 1574 | |
| 1575 | Value *Pg = II.getOperand(0); |
| 1576 | Value *Op1 = II.getOperand(1); |
| 1577 | Value *Op2 = II.getOperand(2); |
| 1578 | const DataLayout &DL = II.getDataLayout(); |
| 1579 | |
| 1580 | // Canonicalise constants to the RHS. |
| 1581 | if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() && |
| 1582 | isa<Constant>(Op1) && !isa<Constant>(Op2)) { |
| 1583 | IC.replaceOperand(II, 1, Op2); |
| 1584 | IC.replaceOperand(II, 2, Op1); |
| 1585 | return &II; |
| 1586 | } |
| 1587 | |
| 1588 | // Only active lanes matter when simplifying the operation. |
| 1589 | Op1 = stripInactiveLanes(Op1, Pg); |
| 1590 | Op2 = stripInactiveLanes(Op2, Pg); |
| 1591 | |
| 1592 | Value *SimpleII; |
| 1593 | if (auto FII = dyn_cast<FPMathOperator>(&II)) |
| 1594 | SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL); |
| 1595 | else |
| 1596 | SimpleII = simplifyBinOp(Opc, Op1, Op2, DL); |
| 1597 | |
Paul Walker | 8dc89e3 | 2025-04-30 13:21:46 +0100 | [diff] [blame] | 1598 | // An SVE intrinsic's result is always defined. However, this is not the case |
| 1599 | // for its equivalent IR instruction (e.g. when shifting by an amount more |
| 1600 | // than the data's bitwidth). Simplifications to an undefined result must be |
| 1601 | // ignored to preserve the intrinsic's expected behaviour. |
| 1602 | if (!SimpleII || isa<UndefValue>(SimpleII)) |
Paul Walker | a7999f3 | 2025-04-17 15:58:39 +0100 | [diff] [blame] | 1603 | return std::nullopt; |
| 1604 | |
| 1605 | if (IInfo.inactiveLanesAreNotDefined()) |
| 1606 | return IC.replaceInstUsesWith(II, SimpleII); |
| 1607 | |
| 1608 | Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()); |
| 1609 | |
| 1610 | // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)). |
| 1611 | if (SimpleII == Inactive) |
| 1612 | return IC.replaceInstUsesWith(II, SimpleII); |
| 1613 | |
| 1614 | // Inactive lanes must be preserved. |
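| | // For instance, sve.sub(%pg, %a, %a) folds to zero for the active lanes, so
| | // the result becomes select(%pg, splat(0), %a), keeping %a where %pg is false.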
| 1615 | SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive); |
| 1616 | return IC.replaceInstUsesWith(II, SimpleII); |
| 1617 | } |
| 1618 | |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1619 | // Use SVE intrinsic info to eliminate redundant operands and/or canonicalise |
| 1620 | // to operations with less strict inactive lane requirements. |
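| | // For example, with an all-active governing predicate a call such as
| | //   @llvm.aarch64.sve.fmul(%pg, %a, %b)
| | // is redirected to the matching @llvm.aarch64.sve.fmul_u declaration, whose
| | // inactive-lane requirements are less strict.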
| 1621 | static std::optional<Instruction *> |
| 1622 | simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
| 1623 | const SVEIntrinsicInfo &IInfo) { |
| 1624 | if (!IInfo.hasGoverningPredicate()) |
| 1625 | return std::nullopt; |
| 1626 | |
| 1627 | auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx()); |
| 1628 | |
| 1629 | // If there are no active lanes. |
| 1630 | if (match(OpPredicate, m_ZeroInt())) { |
| 1631 | if (IInfo.inactiveLanesTakenFromOperand()) |
| 1632 | return IC.replaceInstUsesWith( |
| 1633 | II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom())); |
| 1634 | |
| 1635 | if (IInfo.inactiveLanesAreUnused()) { |
| 1636 | if (IInfo.resultIsZeroInitialized()) |
| 1637 | IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); |
| 1638 | |
| 1639 | return IC.eraseInstFromFunction(II); |
| 1640 | } |
| 1641 | } |
| 1642 | |
| 1643 | // If there are no inactive lanes. |
| 1644 | if (isAllActivePredicate(OpPredicate)) { |
| 1645 | if (IInfo.hasOperandWithNoActiveLanes()) { |
| 1646 | unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes(); |
| 1647 | if (!isa<UndefValue>(II.getOperand(OpIdx))) |
| 1648 | return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType())); |
| 1649 | } |
| 1650 | |
| 1651 | if (IInfo.hasMatchingUndefIntrinsic()) { |
| 1652 | auto *NewDecl = Intrinsic::getOrInsertDeclaration( |
| 1653 | II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()}); |
| 1654 | II.setCalledFunction(NewDecl); |
| 1655 | return &II; |
| 1656 | } |
| 1657 | } |
| 1658 | |
Paul Walker | a7999f3 | 2025-04-17 15:58:39 +0100 | [diff] [blame] | 1659 | // Operation specific simplifications. |
| 1660 | if (IInfo.hasMatchingIROpode() && |
| 1661 | Instruction::isBinaryOp(IInfo.getMatchingIROpode())) |
| 1662 | return simplifySVEIntrinsicBinOp(IC, II, IInfo); |
| 1663 | |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 1664 | return std::nullopt; |
| 1665 | } |
| 1666 | |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1667 | // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
| 1668 | // => (binop (pred) (from_svbool _) (from_svbool _)) |
| 1669 | // |
| 1670 | // The above transformation eliminates a `to_svbool` in the predicate |
| 1671 | // operand of bitwise operation `binop` by narrowing the vector width of |
| 1672 | // the operation. For example, it would convert a `<vscale x 16 x i1> |
| 1673 | // and` into a `<vscale x 4 x i1> and`. This is profitable because |
| 1674 | // to_svbool must zero the new lanes during widening, whereas |
| 1675 | // from_svbool is free. |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1676 | static std::optional<Instruction *> |
| 1677 | tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1678 | auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); |
| 1679 | if (!BinOp) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1680 | return std::nullopt; |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1681 | |
| 1682 | auto IntrinsicID = BinOp->getIntrinsicID(); |
| 1683 | switch (IntrinsicID) { |
| 1684 | case Intrinsic::aarch64_sve_and_z: |
| 1685 | case Intrinsic::aarch64_sve_bic_z: |
| 1686 | case Intrinsic::aarch64_sve_eor_z: |
| 1687 | case Intrinsic::aarch64_sve_nand_z: |
| 1688 | case Intrinsic::aarch64_sve_nor_z: |
| 1689 | case Intrinsic::aarch64_sve_orn_z: |
| 1690 | case Intrinsic::aarch64_sve_orr_z: |
| 1691 | break; |
| 1692 | default: |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1693 | return std::nullopt; |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1694 | } |
| 1695 | |
| 1696 | auto BinOpPred = BinOp->getOperand(0); |
| 1697 | auto BinOpOp1 = BinOp->getOperand(1); |
| 1698 | auto BinOpOp2 = BinOp->getOperand(2); |
| 1699 | |
| 1700 | auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); |
| 1701 | if (!PredIntr || |
| 1702 | PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1703 | return std::nullopt; |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1704 | |
| 1705 | auto PredOp = PredIntr->getOperand(0); |
| 1706 | auto PredOpTy = cast<VectorType>(PredOp->getType()); |
| 1707 | if (PredOpTy != II.getType()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1708 | return std::nullopt; |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1709 | |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1710 | SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1711 | auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic( |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1712 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); |
| 1713 | NarrowedBinOpArgs.push_back(NarrowBinOpOp1); |
| 1714 | if (BinOpOp1 == BinOpOp2) |
| 1715 | NarrowedBinOpArgs.push_back(NarrowBinOpOp1); |
| 1716 | else |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1717 | NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic( |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1718 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); |
| 1719 | |
| 1720 | auto NarrowedBinOp = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1721 | IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1722 | return IC.replaceInstUsesWith(II, NarrowedBinOp); |
| 1723 | } |
| 1724 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1725 | static std::optional<Instruction *> |
| 1726 | instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 1727 | // If the reinterpret instruction operand is a PHI Node |
| 1728 | if (isa<PHINode>(II.getArgOperand(0))) |
| 1729 | return processPhiNode(IC, II); |
| 1730 | |
Matt Devereau | cee8b25 | 2022-01-05 13:42:01 +0000 | [diff] [blame] | 1731 | if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) |
| 1732 | return BinOpCombine; |
| 1733 | |
Sander de Smalen | 11926e6 | 2023-05-22 13:52:18 +0000 | [diff] [blame] | 1734 | // Ignore converts to/from svcount_t. |
| 1735 | if (isa<TargetExtType>(II.getArgOperand(0)->getType()) || |
| 1736 | isa<TargetExtType>(II.getType())) |
| 1737 | return std::nullopt; |
| 1738 | |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 1739 | SmallVector<Instruction *, 32> CandidatesForRemoval; |
| 1740 | Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; |
| 1741 | |
| 1742 | const auto *IVTy = cast<VectorType>(II.getType()); |
| 1743 | |
| 1744 | // Walk the chain of conversions. |
| 1745 | while (Cursor) { |
| 1746 | // If the type of the cursor has fewer lanes than the final result, zeroing |
| 1747 | // must take place, which breaks the equivalence chain. |
| 1748 | const auto *CursorVTy = cast<VectorType>(Cursor->getType()); |
| 1749 | if (CursorVTy->getElementCount().getKnownMinValue() < |
| 1750 | IVTy->getElementCount().getKnownMinValue()) |
| 1751 | break; |
| 1752 | |
| 1753 | // If the cursor has the same type as I, it is a viable replacement. |
| 1754 | if (Cursor->getType() == IVTy) |
| 1755 | EarliestReplacement = Cursor; |
| 1756 | |
| 1757 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); |
| 1758 | |
| 1759 | // If this is not an SVE conversion intrinsic, this is the end of the chain. |
| 1760 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == |
| 1761 | Intrinsic::aarch64_sve_convert_to_svbool || |
| 1762 | IntrinsicCursor->getIntrinsicID() == |
| 1763 | Intrinsic::aarch64_sve_convert_from_svbool)) |
| 1764 | break; |
| 1765 | |
| 1766 | CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); |
| 1767 | Cursor = IntrinsicCursor->getOperand(0); |
| 1768 | } |
| 1769 | |
| 1770 | // If no viable replacement in the conversion chain was found, there is |
| 1771 | // nothing to do. |
| 1772 | if (!EarliestReplacement) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1773 | return std::nullopt; |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 1774 | |
| 1775 | return IC.replaceInstUsesWith(II, EarliestReplacement); |
| 1776 | } |
| 1777 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1778 | static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, |
| 1779 | IntrinsicInst &II) { |
zhongyunde 00443407 | bf90ffb | 2023-09-27 22:42:43 -0400 | [diff] [blame] | 1780 | // svsel(ptrue, x, y) => x |
| 1781 | auto *OpPredicate = II.getOperand(0); |
| 1782 | if (isAllActivePredicate(OpPredicate)) |
| 1783 | return IC.replaceInstUsesWith(II, II.getOperand(1)); |
| 1784 | |
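| | // Otherwise lower to a plain IR select: svsel(p, x, y) => select p, x, y.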
| 1785 | auto Select = |
| 1786 | IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2)); |
Matt Devereau | a9e08bc | 2022-03-16 11:41:14 +0000 | [diff] [blame] | 1787 | return IC.replaceInstUsesWith(II, Select); |
| 1788 | } |
| 1789 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1790 | static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, |
| 1791 | IntrinsicInst &II) { |
Bradley Smith | 89085bc | 2021-04-23 13:55:42 +0100 | [diff] [blame] | 1792 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); |
| 1793 | if (!Pg) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1794 | return std::nullopt; |
Bradley Smith | 89085bc | 2021-04-23 13:55:42 +0100 | [diff] [blame] | 1795 | |
| 1796 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1797 | return std::nullopt; |
Bradley Smith | 89085bc | 2021-04-23 13:55:42 +0100 | [diff] [blame] | 1798 | |
| 1799 | const auto PTruePattern = |
| 1800 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); |
| 1801 | if (PTruePattern != AArch64SVEPredPattern::vl1) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1802 | return std::nullopt; |
Bradley Smith | 89085bc | 2021-04-23 13:55:42 +0100 | [diff] [blame] | 1803 | |
| 1804 | // The intrinsic is inserting into lane zero so use an insert instead. |
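| | // e.g. sve.dup(%v, ptrue(vl1), %x) becomes insertelement %v, %x, i64 0.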
| 1805 | auto *IdxTy = Type::getInt64Ty(II.getContext()); |
| 1806 | auto *Insert = InsertElementInst::Create( |
| 1807 | II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); |
Jeremy Morse | 8e70273 | 2025-01-24 10:53:11 +0000 | [diff] [blame] | 1808 | Insert->insertBefore(II.getIterator()); |
Bradley Smith | 89085bc | 2021-04-23 13:55:42 +0100 | [diff] [blame] | 1809 | Insert->takeName(&II); |
| 1810 | |
| 1811 | return IC.replaceInstUsesWith(II, Insert); |
| 1812 | } |
| 1813 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1814 | static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, |
| 1815 | IntrinsicInst &II) { |
Usman Nadeem | ab111e9 | 2021-09-10 17:57:29 -0700 | [diff] [blame] | 1816 | // Replace DupX with a regular IR splat. |
Usman Nadeem | ab111e9 | 2021-09-10 17:57:29 -0700 | [diff] [blame] | 1817 | auto *RetTy = cast<ScalableVectorType>(II.getType()); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1818 | Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(), |
| 1819 | II.getArgOperand(0)); |
Usman Nadeem | ab111e9 | 2021-09-10 17:57:29 -0700 | [diff] [blame] | 1820 | Splat->takeName(&II); |
| 1821 | return IC.replaceInstUsesWith(II, Splat); |
| 1822 | } |
| 1823 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1824 | static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, |
| 1825 | IntrinsicInst &II) { |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1826 | LLVMContext &Ctx = II.getContext(); |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1827 | |
Matthew Devereau | 91a2056 | 2025-04-13 20:40:51 +0100 | [diff] [blame] | 1828 | if (!isAllActivePredicate(II.getArgOperand(0))) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1829 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1830 | |
| 1831 | // Check that we have a compare of zero.. |
Usman Nadeem | ab111e9 | 2021-09-10 17:57:29 -0700 | [diff] [blame] | 1832 | auto *SplatValue = |
| 1833 | dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); |
| 1834 | if (!SplatValue || !SplatValue->isZero()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1835 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1836 | |
| 1837 | // ..against a dupq |
| 1838 | auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); |
| 1839 | if (!DupQLane || |
| 1840 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1841 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1842 | |
| 1843 | // Where the dupq is a lane 0 replicate of a vector insert |
cceerczw | 67a9093 | 2024-08-23 22:30:51 +0800 | [diff] [blame] | 1844 | auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1)); |
| 1845 | if (!DupQLaneIdx || !DupQLaneIdx->isZero()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1846 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1847 | |
| 1848 | auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); |
Bradley Smith | a83aa33 | 2022-06-16 14:45:28 +0000 | [diff] [blame] | 1849 | if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1850 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1851 | |
| 1852 | // Where the vector insert is a fixed constant vector insert into undef at |
| 1853 | // index zero |
| 1854 | if (!isa<UndefValue>(VecIns->getArgOperand(0))) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1855 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1856 | |
| 1857 | if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1858 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1859 | |
| 1860 | auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); |
| 1861 | if (!ConstVec) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1862 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1863 | |
| 1864 | auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); |
| 1865 | auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); |
| 1866 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1867 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1868 | |
| 1869 | unsigned NumElts = VecTy->getNumElements(); |
| 1870 | unsigned PredicateBits = 0; |
| 1871 | |
| 1872 | // Expand intrinsic operands to a 16-bit byte level predicate |
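| | // For example, with a 4-element constant vector each non-zero element I sets
| | // bit (I * 4) of the 16-bit predicate.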
| 1873 | for (unsigned I = 0; I < NumElts; ++I) { |
| 1874 | auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); |
| 1875 | if (!Arg) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1876 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1877 | if (!Arg->isZero()) |
| 1878 | PredicateBits |= 1 << (I * (16 / NumElts)); |
| 1879 | } |
| 1880 | |
| 1881 | // If all bits are zero bail early with an empty predicate |
| 1882 | if (PredicateBits == 0) { |
| 1883 | auto *PFalse = Constant::getNullValue(II.getType()); |
| 1884 | PFalse->takeName(&II); |
| 1885 | return IC.replaceInstUsesWith(II, PFalse); |
| 1886 | } |
| 1887 | |
| 1888 | // Calculate largest predicate type used (where byte predicate is largest) |
| 1889 | unsigned Mask = 8; |
| 1890 | for (unsigned I = 0; I < 16; ++I) |
| 1891 | if ((PredicateBits & (1 << I)) != 0) |
| 1892 | Mask |= (I % 8); |
| 1893 | |
| 1894 | unsigned PredSize = Mask & -Mask; |
| 1895 | auto *PredType = ScalableVectorType::get( |
| 1896 | Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); |
| 1897 | |
| 1898 | // Ensure all relevant bits are set |
| 1899 | for (unsigned I = 0; I < 16; I += PredSize) |
| 1900 | if ((PredicateBits & (1 << I)) == 0) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1901 | return std::nullopt; |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1902 | |
| 1903 | auto *PTruePat = |
| 1904 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1905 | auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, |
| 1906 | {PredType}, {PTruePat}); |
| 1907 | auto *ConvertToSVBool = IC.Builder.CreateIntrinsic( |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1908 | Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); |
| 1909 | auto *ConvertFromSVBool = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1910 | IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, |
| 1911 | {II.getType()}, {ConvertToSVBool}); |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 1912 | |
| 1913 | ConvertFromSVBool->takeName(&II); |
| 1914 | return IC.replaceInstUsesWith(II, ConvertFromSVBool); |
| 1915 | } |
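// Illustrative sketch of the tail above (the element types and constant are
// assumptions for the example, not taken from a specific test): when every
// element of the dupq'd constant is non-zero, the compare folds to an
// all-active predicate materialised roughly as
//   %pt  = ptrue.nxv16i1(i32 31)   ; AArch64SVEPredPattern::all
//   %res = convert.from.svbool(convert.to.svbool(%pt))
// with %res taking over the uses of the original compare.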
| 1916 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1917 | static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, |
| 1918 | IntrinsicInst &II) { |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1919 | Value *Pg = II.getArgOperand(0); |
| 1920 | Value *Vec = II.getArgOperand(1); |
Usman Nadeem | 85bbc05 | 2021-07-27 21:02:32 -0700 | [diff] [blame] | 1921 | auto IntrinsicID = II.getIntrinsicID(); |
| 1922 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1923 | |
Sander de Smalen | eb1a512 | 2021-07-19 10:48:42 +0100 | [diff] [blame] | 1924 | // lastX(splat(X)) --> X |
| 1925 | if (auto *SplatVal = getSplatValue(Vec)) |
| 1926 | return IC.replaceInstUsesWith(II, SplatVal); |
| 1927 | |
Usman Nadeem | 85bbc05 | 2021-07-27 21:02:32 -0700 | [diff] [blame] | 1928 | // If x and/or y is a splat value then: |
| 1929 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) |
| 1930 | Value *LHS, *RHS; |
| 1931 | if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { |
| 1932 | if (isSplatValue(LHS) || isSplatValue(RHS)) { |
| 1933 | auto *OldBinOp = cast<BinaryOperator>(Vec); |
| 1934 | auto OpC = OldBinOp->getOpcode(); |
| 1935 | auto *NewLHS = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1936 | IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); |
Usman Nadeem | 85bbc05 | 2021-07-27 21:02:32 -0700 | [diff] [blame] | 1937 | auto *NewRHS = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 1938 | IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); |
Usman Nadeem | 85bbc05 | 2021-07-27 21:02:32 -0700 | [diff] [blame] | 1939 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( |
Jeremy Morse | b9d83ef | 2024-03-19 16:36:29 +0000 | [diff] [blame] | 1940 | OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator()); |
Usman Nadeem | 85bbc05 | 2021-07-27 21:02:32 -0700 | [diff] [blame] | 1941 | return IC.replaceInstUsesWith(II, NewBinOp); |
| 1942 | } |
| 1943 | } |
| 1944 | |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1945 | auto *C = dyn_cast<Constant>(Pg); |
| 1946 | if (IsAfter && C && C->isNullValue()) { |
| 1947 | // The intrinsic is extracting lane 0 so use an extract instead. |
| 1948 | auto *IdxTy = Type::getInt64Ty(II.getContext()); |
| 1949 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); |
Jeremy Morse | 8e70273 | 2025-01-24 10:53:11 +0000 | [diff] [blame] | 1950 | Extract->insertBefore(II.getIterator()); |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1951 | Extract->takeName(&II); |
| 1952 | return IC.replaceInstUsesWith(II, Extract); |
| 1953 | } |
| 1954 | |
| 1955 | auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); |
| 1956 | if (!IntrPG) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1957 | return std::nullopt; |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1958 | |
| 1959 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1960 | return std::nullopt; |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1961 | |
| 1962 | const auto PTruePattern = |
| 1963 | cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); |
| 1964 | |
| 1965 | // Can the intrinsic's predicate be converted to a known constant index? |
Jun Ma | 8c47103 | 2021-08-25 17:25:39 +0800 | [diff] [blame] | 1966 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); |
| 1967 | if (!MinNumElts) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1968 | return std::nullopt; |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1969 | |
Jun Ma | 8c47103 | 2021-08-25 17:25:39 +0800 | [diff] [blame] | 1970 | unsigned Idx = MinNumElts - 1; |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1971 | // Increment the index if extracting the element after the last active |
| 1972 | // predicate element. |
| 1973 | if (IsAfter) |
| 1974 | ++Idx; |
| 1975 | |
| 1976 | // Ignore extracts whose index is larger than the known minimum vector |
| 1977 | // length. NOTE: This is an artificial constraint where we prefer to |
| 1978 | // maintain what the user asked for until an alternative is proven faster. |
| 1979 | auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); |
| 1980 | if (Idx >= PgVTy->getMinNumElements()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1981 | return std::nullopt; |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1982 | |
| 1983 | // The intrinsic is extracting a fixed lane so use an extract instead. |
| 1984 | auto *IdxTy = Type::getInt64Ty(II.getContext()); |
| 1985 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); |
Jeremy Morse | 8e70273 | 2025-01-24 10:53:11 +0000 | [diff] [blame] | 1986 | Extract->insertBefore(II.getIterator()); |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 1987 | Extract->takeName(&II); |
| 1988 | return IC.replaceInstUsesWith(II, Extract); |
| 1989 | } |
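// For illustration, assuming <vscale x 4 x i32> operands (a sketch of the
// cases handled above, not an exhaustive list):
//   lastb(%pg, splat(%x))        --> %x
//   lasta(zeroinitializer, %v)   --> extractelement %v, i64 0
//   lastb(ptrue(vl4), %v)        --> extractelement %v, i64 3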
| 1990 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 1991 | static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, |
| 1992 | IntrinsicInst &II) { |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 1993 | // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar |
| 1994 | // integer variant across a variety of micro-architectures. Replace scalar |
| 1995 | // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple |
| 1996 | // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more |
| 1997 | // depending on the micro-architecture, but has been observed as generally |
| 1998 | // being faster, particularly when the CLAST[AB] op is a loop-carried |
| 1999 | // dependency. |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2000 | Value *Pg = II.getArgOperand(0); |
| 2001 | Value *Fallback = II.getArgOperand(1); |
| 2002 | Value *Vec = II.getArgOperand(2); |
| 2003 | Type *Ty = II.getType(); |
| 2004 | |
| 2005 | if (!Ty->isIntegerTy()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2006 | return std::nullopt; |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2007 | |
| 2008 | Type *FPTy; |
| 2009 | switch (cast<IntegerType>(Ty)->getBitWidth()) { |
| 2010 | default: |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2011 | return std::nullopt; |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2012 | case 16: |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2013 | FPTy = IC.Builder.getHalfTy(); |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2014 | break; |
| 2015 | case 32: |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2016 | FPTy = IC.Builder.getFloatTy(); |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2017 | break; |
| 2018 | case 64: |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2019 | FPTy = IC.Builder.getDoubleTy(); |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2020 | break; |
| 2021 | } |
| 2022 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2023 | Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy); |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2024 | auto *FPVTy = VectorType::get( |
| 2025 | FPTy, cast<VectorType>(Vec->getType())->getElementCount()); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2026 | Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy); |
| 2027 | auto *FPII = IC.Builder.CreateIntrinsic( |
| 2028 | II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec}); |
| 2029 | Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType()); |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2030 | return IC.replaceInstUsesWith(II, FPIItoInt); |
| 2031 | } |
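// Rough shape of the rewrite above for a 32-bit integer CLASTB (the FP type
// is chosen purely by bit width):
//   %r = clastb.n(%pg, i32 %fallback, <vscale x 4 x i32> %v)
// becomes
//   %ff = bitcast i32 %fallback to float
//   %fv = bitcast <vscale x 4 x i32> %v to <vscale x 4 x float>
//   %fr = clastb.n(%pg, float %ff, <vscale x 4 x float> %fv)
//   %r  = bitcast float %fr to i32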
| 2032 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2033 | static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, |
| 2034 | IntrinsicInst &II) { |
Peter Waller | 2d574a1 | 2021-05-12 14:47:22 +0000 | [diff] [blame] | 2035 | LLVMContext &Ctx = II.getContext(); |
Peter Waller | 2d574a1 | 2021-05-12 14:47:22 +0000 | [diff] [blame] | 2036 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr |
| 2037 | // can work with RDFFR_PP for ptest elimination. |
| 2038 | auto *AllPat = |
| 2039 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2040 | auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, |
| 2041 | {II.getType()}, {AllPat}); |
Peter Waller | 2d574a1 | 2021-05-12 14:47:22 +0000 | [diff] [blame] | 2042 | auto *RDFFR = |
Rahul Joshi | 74b7abf | 2025-03-31 08:10:34 -0700 | [diff] [blame] | 2043 | IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue}); |
Peter Waller | 2d574a1 | 2021-05-12 14:47:22 +0000 | [diff] [blame] | 2044 | RDFFR->takeName(&II); |
| 2045 | return IC.replaceInstUsesWith(II, RDFFR); |
| 2046 | } |
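// Sketch of the rewrite above:
//   %ffr = rdffr()
// becomes
//   %pt  = ptrue.nxv16i1(i32 31)   ; AArch64SVEPredPattern::all
//   %ffr = rdffr.z(%pt)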
| 2047 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2048 | static std::optional<Instruction *> |
Jun Ma | ae543394 | 2021-06-18 11:55:01 +0800 | [diff] [blame] | 2049 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { |
| 2050 | const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); |
| 2051 | |
| 2052 | if (Pattern == AArch64SVEPredPattern::all) { |
Jun Ma | ae543394 | 2021-06-18 11:55:01 +0800 | [diff] [blame] | 2053 | Constant *StepVal = ConstantInt::get(II.getType(), NumElts); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2054 | auto *VScale = IC.Builder.CreateVScale(StepVal); |
Jun Ma | ae543394 | 2021-06-18 11:55:01 +0800 | [diff] [blame] | 2055 | VScale->takeName(&II); |
| 2056 | return IC.replaceInstUsesWith(II, VScale); |
| 2057 | } |
| 2058 | |
Jun Ma | 8c47103 | 2021-08-25 17:25:39 +0800 | [diff] [blame] | 2059 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); |
Jun Ma | ae543394 | 2021-06-18 11:55:01 +0800 | [diff] [blame] | 2060 | |
Jun Ma | 8c47103 | 2021-08-25 17:25:39 +0800 | [diff] [blame] | 2061 | return MinNumElts && NumElts >= MinNumElts |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2062 | ? std::optional<Instruction *>(IC.replaceInstUsesWith( |
Jun Ma | ae543394 | 2021-06-18 11:55:01 +0800 | [diff] [blame] | 2063 | II, ConstantInt::get(II.getType(), MinNumElts))) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2064 | : std::nullopt; |
Jun Ma | ae543394 | 2021-06-18 11:55:01 +0800 | [diff] [blame] | 2065 | } |
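// Examples of the folds above (NumElts is 2/4/8/16 for cntd/cntw/cnth/cntb):
//   cntw(all) --> vscale * 4
//   cntd(vl2) --> 2   ; fixed pattern no larger than the known minimum count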
| 2066 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2067 | static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, |
| 2068 | IntrinsicInst &II) { |
Bradley Smith | daf1a1f | 2022-11-11 15:24:57 +0000 | [diff] [blame] | 2069 | Value *PgVal = II.getArgOperand(0); |
| 2070 | Value *OpVal = II.getArgOperand(1); |
| 2071 | |
Bradley Smith | daf1a1f | 2022-11-11 15:24:57 +0000 | [diff] [blame] | 2072 | // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). |
| 2073 | // Later optimizations prefer this form. |
| 2074 | if (PgVal == OpVal && |
| 2075 | (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || |
| 2076 | II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { |
| 2077 | Value *Ops[] = {PgVal, OpVal}; |
| 2078 | Type *Tys[] = {PgVal->getType()}; |
| 2079 | |
| 2080 | auto *PTest = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2081 | IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops); |
Bradley Smith | daf1a1f | 2022-11-11 15:24:57 +0000 | [diff] [blame] | 2082 | PTest->takeName(&II); |
| 2083 | |
| 2084 | return IC.replaceInstUsesWith(II, PTest); |
| 2085 | } |
| 2086 | |
| 2087 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal); |
| 2088 | IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal); |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2089 | |
Cullen Rhodes | 5062116 | 2022-11-04 08:40:18 +0000 | [diff] [blame] | 2090 | if (!Pg || !Op) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2091 | return std::nullopt; |
Cullen Rhodes | 388cacb | 2022-10-12 08:36:03 +0000 | [diff] [blame] | 2092 | |
Cullen Rhodes | 5062116 | 2022-11-04 08:40:18 +0000 | [diff] [blame] | 2093 | Intrinsic::ID OpIID = Op->getIntrinsicID(); |
| 2094 | |
Cullen Rhodes | 5062116 | 2022-11-04 08:40:18 +0000 | [diff] [blame] | 2095 | if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && |
| 2096 | OpIID == Intrinsic::aarch64_sve_convert_to_svbool && |
| 2097 | Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) { |
| 2098 | Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)}; |
| 2099 | Type *Tys[] = {Pg->getArgOperand(0)->getType()}; |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2100 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2101 | auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2102 | |
| 2103 | PTest->takeName(&II); |
| 2104 | return IC.replaceInstUsesWith(II, PTest); |
| 2105 | } |
| 2106 | |
Cullen Rhodes | 388cacb | 2022-10-12 08:36:03 +0000 | [diff] [blame] | 2107 | // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)). |
| 2108 | // Later optimizations may rewrite sequence to use the flag-setting variant |
| 2109 | // of instruction X to remove PTEST. |
Cullen Rhodes | 5062116 | 2022-11-04 08:40:18 +0000 | [diff] [blame] | 2110 | if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && |
| 2111 | ((OpIID == Intrinsic::aarch64_sve_brka_z) || |
| 2112 | (OpIID == Intrinsic::aarch64_sve_brkb_z) || |
| 2113 | (OpIID == Intrinsic::aarch64_sve_brkpa_z) || |
| 2114 | (OpIID == Intrinsic::aarch64_sve_brkpb_z) || |
| 2115 | (OpIID == Intrinsic::aarch64_sve_rdffr_z) || |
| 2116 | (OpIID == Intrinsic::aarch64_sve_and_z) || |
| 2117 | (OpIID == Intrinsic::aarch64_sve_bic_z) || |
| 2118 | (OpIID == Intrinsic::aarch64_sve_eor_z) || |
| 2119 | (OpIID == Intrinsic::aarch64_sve_nand_z) || |
| 2120 | (OpIID == Intrinsic::aarch64_sve_nor_z) || |
| 2121 | (OpIID == Intrinsic::aarch64_sve_orn_z) || |
| 2122 | (OpIID == Intrinsic::aarch64_sve_orr_z))) { |
| 2123 | Value *Ops[] = {Pg->getArgOperand(0), Pg}; |
| 2124 | Type *Tys[] = {Pg->getType()}; |
Cullen Rhodes | 388cacb | 2022-10-12 08:36:03 +0000 | [diff] [blame] | 2125 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2126 | auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); |
Cullen Rhodes | 388cacb | 2022-10-12 08:36:03 +0000 | [diff] [blame] | 2127 | PTest->takeName(&II); |
| 2128 | |
| 2129 | return IC.replaceInstUsesWith(II, PTest); |
| 2130 | } |
| 2131 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2132 | return std::nullopt; |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2133 | } |
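// Two of the rewrites above, sketched with predicate types abbreviated:
//   ptest.any(to_svbool(%a), to_svbool(%b)) --> ptest.any(%a, %b)
//   %x = brka.z(%pg, %p); ptest.any(%x, %x) --> ptest.any(%pg, %x)
// The second form lets later passes fold the PTEST into the flag-setting
// variant of the producing instruction.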
| 2134 | |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2135 | template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2136 | static std::optional<Instruction *> |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2137 | instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, |
| 2138 | bool MergeIntoAddendOp) { |
Matt | 4a59694 | 2021-11-03 11:31:41 +0000 | [diff] [blame] | 2139 | Value *P = II.getOperand(0); |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2140 | Value *MulOp0, *MulOp1, *AddendOp, *Mul; |
| 2141 | if (MergeIntoAddendOp) { |
| 2142 | AddendOp = II.getOperand(1); |
| 2143 | Mul = II.getOperand(2); |
| 2144 | } else { |
| 2145 | AddendOp = II.getOperand(2); |
| 2146 | Mul = II.getOperand(1); |
| 2147 | } |
| 2148 | |
| 2149 | if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0), |
| 2150 | m_Value(MulOp1)))) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2151 | return std::nullopt; |
Matt | 4a59694 | 2021-11-03 11:31:41 +0000 | [diff] [blame] | 2152 | |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2153 | if (!Mul->hasOneUse()) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2154 | return std::nullopt; |
Matt | 4a59694 | 2021-11-03 11:31:41 +0000 | [diff] [blame] | 2155 | |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2156 | Instruction *FMFSource = nullptr; |
| 2157 | if (II.getType()->isFPOrFPVectorTy()) { |
| 2158 | llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); |
| 2159 | // Stop the combine when the flags on the inputs differ in case dropping |
| 2160 | // flags would lead to us missing out on more beneficial optimizations. |
| 2161 | if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags()) |
| 2162 | return std::nullopt; |
| 2163 | if (!FAddFlags.allowContract()) |
| 2164 | return std::nullopt; |
| 2165 | FMFSource = &II; |
| 2166 | } |
Matt | 4a59694 | 2021-11-03 11:31:41 +0000 | [diff] [blame] | 2167 | |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2168 | CallInst *Res; |
| 2169 | if (MergeIntoAddendOp) |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2170 | Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()}, |
| 2171 | {P, AddendOp, MulOp0, MulOp1}, FMFSource); |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2172 | else |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2173 | Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()}, |
| 2174 | {P, MulOp0, MulOp1, AddendOp}, FMFSource); |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2175 | |
| 2176 | return IC.replaceInstUsesWith(II, Res); |
Matt | 4a59694 | 2021-11-03 11:31:41 +0000 | [diff] [blame] | 2177 | } |
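// Shape of the fusions this helper performs, shown for the FP variants (the
// multiply must have a single use and, for FP, matching fast-math flags that
// allow contraction):
//   fadd(%pg, %acc, fmul(%pg, %a, %b)) --> fmla(%pg, %acc, %a, %b)
//   fadd(%pg, fmul(%pg, %a, %b), %acc) --> fmad(%pg, %a, %b, %acc)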
| 2178 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2179 | static std::optional<Instruction *> |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2180 | instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2181 | Value *Pred = II.getOperand(0); |
| 2182 | Value *PtrOp = II.getOperand(1); |
| 2183 | Type *VecTy = II.getType(); |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2184 | |
Paul Walker | 01bc67e | 2021-12-03 14:36:54 +0000 | [diff] [blame] | 2185 | if (isAllActivePredicate(Pred)) { |
Youngsuk Kim | f69b9b7 | 2023-07-08 13:05:58 -0400 | [diff] [blame] | 2186 | LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp); |
Sander de Smalen | 0b41238 | 2022-02-11 07:53:20 +0000 | [diff] [blame] | 2187 | Load->copyMetadata(II); |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2188 | return IC.replaceInstUsesWith(II, Load); |
| 2189 | } |
| 2190 | |
| 2191 | CallInst *MaskedLoad = |
Youngsuk Kim | f69b9b7 | 2023-07-08 13:05:58 -0400 | [diff] [blame] | 2192 | IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL), |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2193 | Pred, ConstantAggregateZero::get(VecTy)); |
Sander de Smalen | 0b41238 | 2022-02-11 07:53:20 +0000 | [diff] [blame] | 2194 | MaskedLoad->copyMetadata(II); |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2195 | return IC.replaceInstUsesWith(II, MaskedLoad); |
| 2196 | } |
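// Sketch of the rewrite above for <vscale x 4 x i32>:
//   ld1(ptrue(all), %ptr) --> load <vscale x 4 x i32>, ptr %ptr
//   ld1(%pg, %ptr)        --> masked.load(%ptr, align, %pg, zeroinitializer)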
| 2197 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2198 | static std::optional<Instruction *> |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2199 | instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2200 | Value *VecOp = II.getOperand(0); |
| 2201 | Value *Pred = II.getOperand(1); |
| 2202 | Value *PtrOp = II.getOperand(2); |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2203 | |
Paul Walker | 01bc67e | 2021-12-03 14:36:54 +0000 | [diff] [blame] | 2204 | if (isAllActivePredicate(Pred)) { |
Youngsuk Kim | f69b9b7 | 2023-07-08 13:05:58 -0400 | [diff] [blame] | 2205 | StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp); |
Sander de Smalen | 0b41238 | 2022-02-11 07:53:20 +0000 | [diff] [blame] | 2206 | Store->copyMetadata(II); |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2207 | return IC.eraseInstFromFunction(II); |
| 2208 | } |
| 2209 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2210 | CallInst *MaskedStore = IC.Builder.CreateMaskedStore( |
Youngsuk Kim | f69b9b7 | 2023-07-08 13:05:58 -0400 | [diff] [blame] | 2211 | VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred); |
Sander de Smalen | 0b41238 | 2022-02-11 07:53:20 +0000 | [diff] [blame] | 2212 | MaskedStore->copyMetadata(II); |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2213 | return IC.eraseInstFromFunction(II); |
| 2214 | } |
| 2215 | |
Matthew Devereau | f085a9d | 2021-09-01 16:41:42 +0100 | [diff] [blame] | 2216 | static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { |
| 2217 | switch (Intrinsic) { |
Jolanta Jensen | dc63b35 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2218 | case Intrinsic::aarch64_sve_fmul_u: |
Matthew Devereau | f085a9d | 2021-09-01 16:41:42 +0100 | [diff] [blame] | 2219 | return Instruction::BinaryOps::FMul; |
Jolanta Jensen | dc63b35 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2220 | case Intrinsic::aarch64_sve_fadd_u: |
Matthew Devereau | f085a9d | 2021-09-01 16:41:42 +0100 | [diff] [blame] | 2221 | return Instruction::BinaryOps::FAdd; |
Jolanta Jensen | dc63b35 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2222 | case Intrinsic::aarch64_sve_fsub_u: |
Matthew Devereau | f085a9d | 2021-09-01 16:41:42 +0100 | [diff] [blame] | 2223 | return Instruction::BinaryOps::FSub; |
| 2224 | default: |
| 2225 | return Instruction::BinaryOpsEnd; |
| 2226 | } |
| 2227 | } |
| 2228 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2229 | static std::optional<Instruction *> |
| 2230 | instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { |
Paul Walker | 65031c1 | 2023-04-04 12:51:25 +0000 | [diff] [blame] | 2231 | // Bail due to missing support for ISD::STRICT_ scalable vector operations. |
| 2232 | if (II.isStrictFP()) |
| 2233 | return std::nullopt; |
| 2234 | |
Matthew Devereau | 2ac1999 | 2021-10-04 16:56:56 +0100 | [diff] [blame] | 2235 | auto *OpPredicate = II.getOperand(0); |
Matthew Devereau | f085a9d | 2021-09-01 16:41:42 +0100 | [diff] [blame] | 2236 | auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); |
| 2237 | if (BinOpCode == Instruction::BinaryOpsEnd || |
Matthew Devereau | 91a2056 | 2025-04-13 20:40:51 +0100 | [diff] [blame] | 2238 | !isAllActivePredicate(OpPredicate)) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2239 | return std::nullopt; |
Yingwei Zheng | a77346b | 2025-01-06 14:37:04 +0800 | [diff] [blame] | 2240 | auto BinOp = IC.Builder.CreateBinOpFMF( |
| 2241 | BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags()); |
Matthew Devereau | 2ac1999 | 2021-10-04 16:56:56 +0100 | [diff] [blame] | 2242 | return IC.replaceInstUsesWith(II, BinOp); |
Matthew Devereau | f085a9d | 2021-09-01 16:41:42 +0100 | [diff] [blame] | 2243 | } |
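// Example of the fold above: with an all-active predicate the undef-variant
// intrinsic becomes a plain IR instruction, carrying over its fast-math
// flags, e.g.
//   fmul_u(ptrue(all), %a, %b) --> fmul %a, %b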
| 2244 | |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2245 | static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, |
| 2246 | IntrinsicInst &II) { |
Paul Walker | c7c71aa | 2023-06-17 16:48:09 +0100 | [diff] [blame] | 2247 | if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
| 2248 | Intrinsic::aarch64_sve_mla>( |
| 2249 | IC, II, true)) |
| 2250 | return MLA; |
| 2251 | if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
| 2252 | Intrinsic::aarch64_sve_mad>( |
| 2253 | IC, II, false)) |
| 2254 | return MAD; |
| 2255 | return std::nullopt; |
| 2256 | } |
| 2257 | |
| 2258 | static std::optional<Instruction *> |
| 2259 | instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2260 | if (auto FMLA = |
| 2261 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2262 | Intrinsic::aarch64_sve_fmla>(IC, II, |
| 2263 | true)) |
Matt | 4a59694 | 2021-11-03 11:31:41 +0000 | [diff] [blame] | 2264 | return FMLA; |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2265 | if (auto FMAD = |
| 2266 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2267 | Intrinsic::aarch64_sve_fmad>(IC, II, |
| 2268 | false)) |
| 2269 | return FMAD; |
Paul Walker | b7287a8 | 2023-06-17 17:51:49 +0100 | [diff] [blame] | 2270 | if (auto FMLA = |
Jolanta Jensen | dc63b35 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2271 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
Paul Walker | b7287a8 | 2023-06-17 17:51:49 +0100 | [diff] [blame] | 2272 | Intrinsic::aarch64_sve_fmla>(IC, II, |
| 2273 | true)) |
| 2274 | return FMLA; |
Jolanta Jensen | 5cd16e2 | 2023-06-20 12:51:41 +0000 | [diff] [blame] | 2275 | return std::nullopt; |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2276 | } |
| 2277 | |
Paul Walker | c7c71aa | 2023-06-17 16:48:09 +0100 | [diff] [blame] | 2278 | static std::optional<Instruction *> |
| 2279 | instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { |
| 2280 | if (auto FMLA = |
| 2281 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2282 | Intrinsic::aarch64_sve_fmla>(IC, II, |
| 2283 | true)) |
| 2284 | return FMLA; |
| 2285 | if (auto FMAD = |
| 2286 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2287 | Intrinsic::aarch64_sve_fmad>(IC, II, |
| 2288 | false)) |
| 2289 | return FMAD; |
| 2290 | if (auto FMLA_U = |
| 2291 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
| 2292 | Intrinsic::aarch64_sve_fmla_u>( |
| 2293 | IC, II, true)) |
| 2294 | return FMLA_U; |
| 2295 | return instCombineSVEVectorBinOp(IC, II); |
| 2296 | } |
| 2297 | |
| 2298 | static std::optional<Instruction *> |
| 2299 | instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2300 | if (auto FMLS = |
| 2301 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2302 | Intrinsic::aarch64_sve_fmls>(IC, II, |
| 2303 | true)) |
| 2304 | return FMLS; |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2305 | if (auto FMSB = |
| 2306 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2307 | Intrinsic::aarch64_sve_fnmsb>( |
| 2308 | IC, II, false)) |
| 2309 | return FMSB; |
Paul Walker | b7287a8 | 2023-06-17 17:51:49 +0100 | [diff] [blame] | 2310 | if (auto FMLS = |
Jolanta Jensen | dc63b35 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2311 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
Paul Walker | b7287a8 | 2023-06-17 17:51:49 +0100 | [diff] [blame] | 2312 | Intrinsic::aarch64_sve_fmls>(IC, II, |
| 2313 | true)) |
| 2314 | return FMLS; |
Jolanta Jensen | 5cd16e2 | 2023-06-20 12:51:41 +0000 | [diff] [blame] | 2315 | return std::nullopt; |
Matt | 4a59694 | 2021-11-03 11:31:41 +0000 | [diff] [blame] | 2316 | } |
| 2317 | |
Paul Walker | c7c71aa | 2023-06-17 16:48:09 +0100 | [diff] [blame] | 2318 | static std::optional<Instruction *> |
| 2319 | instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { |
| 2320 | if (auto FMLS = |
| 2321 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2322 | Intrinsic::aarch64_sve_fmls>(IC, II, |
| 2323 | true)) |
| 2324 | return FMLS; |
| 2325 | if (auto FMSB = |
| 2326 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2327 | Intrinsic::aarch64_sve_fnmsb>( |
| 2328 | IC, II, false)) |
| 2329 | return FMSB; |
| 2330 | if (auto FMLS_U = |
| 2331 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
| 2332 | Intrinsic::aarch64_sve_fmls_u>( |
| 2333 | IC, II, true)) |
| 2334 | return FMLS_U; |
| 2335 | return instCombineSVEVectorBinOp(IC, II); |
| 2336 | } |
| 2337 | |
| 2338 | static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, |
| 2339 | IntrinsicInst &II) { |
| 2340 | if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
| 2341 | Intrinsic::aarch64_sve_mls>( |
| 2342 | IC, II, true)) |
| 2343 | return MLS; |
| 2344 | return std::nullopt; |
| 2345 | } |
| 2346 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2347 | static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, |
| 2348 | IntrinsicInst &II) { |
Usman Nadeem | 5420fc4 | 2021-08-05 17:23:01 -0700 | [diff] [blame] | 2349 | Value *UnpackArg = II.getArgOperand(0); |
| 2350 | auto *RetTy = cast<ScalableVectorType>(II.getType()); |
| 2351 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || |
| 2352 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; |
| 2353 | |
| 2354 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) |
| 2355 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) |
| 2356 | if (auto *ScalarArg = getSplatValue(UnpackArg)) { |
| 2357 | ScalarArg = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2358 | IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); |
Usman Nadeem | 5420fc4 | 2021-08-05 17:23:01 -0700 | [diff] [blame] | 2359 | Value *NewVal = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2360 | IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); |
Usman Nadeem | 5420fc4 | 2021-08-05 17:23:01 -0700 | [diff] [blame] | 2361 | NewVal->takeName(&II); |
| 2362 | return IC.replaceInstUsesWith(II, NewVal); |
| 2363 | } |
| 2364 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2365 | return std::nullopt; |
Usman Nadeem | 5420fc4 | 2021-08-05 17:23:01 -0700 | [diff] [blame] | 2366 | } |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2367 | static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, |
| 2368 | IntrinsicInst &II) { |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2369 | auto *OpVal = II.getOperand(0); |
| 2370 | auto *OpIndices = II.getOperand(1); |
| 2371 | VectorType *VTy = cast<VectorType>(II.getType()); |
| 2372 | |
Usman Nadeem | ab111e9 | 2021-09-10 17:57:29 -0700 | [diff] [blame] | 2373 | // Check whether OpIndices is a constant splat value smaller than the minimum
| 2374 | // element count of the result.
| 2375 | auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2376 | if (!SplatValue || |
| 2377 | SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2378 | return std::nullopt; |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2379 | |
| 2380 | // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to |
| 2381 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2382 | auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue); |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2383 | auto *VectorSplat = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2384 | IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract); |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2385 | |
| 2386 | VectorSplat->takeName(&II); |
| 2387 | return IC.replaceInstUsesWith(II, VectorSplat); |
| 2388 | } |
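// Sketch of the fold above for <vscale x 4 x i32>, where the splatted index
// is a constant smaller than the minimum element count:
//   tbl(%v, dup.x(i32 2)) --> splat(extractelement %v, i64 2)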
| 2389 | |
Usman Nadeem | 267d6b5 | 2024-02-15 10:40:09 -0800 | [diff] [blame] | 2390 | static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC, |
| 2391 | IntrinsicInst &II) { |
| 2392 | Value *A, *B; |
| 2393 | Type *RetTy = II.getType(); |
| 2394 | constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool; |
| 2395 | constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool; |
| 2396 | |
| 2397 | // uzp1(to_svbool(A), to_svbool(B)) --> <A, B> |
| 2398 | // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B> |
| 2399 | if ((match(II.getArgOperand(0), |
| 2400 | m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) && |
| 2401 | match(II.getArgOperand(1), |
| 2402 | m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) || |
| 2403 | (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) && |
| 2404 | match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) { |
| 2405 | auto *TyA = cast<ScalableVectorType>(A->getType()); |
| 2406 | if (TyA == B->getType() && |
| 2407 | RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) { |
| 2408 | auto *SubVec = IC.Builder.CreateInsertVector( |
Craig Topper | 123758b | 2025-05-02 16:10:18 -0700 | [diff] [blame] | 2409 | RetTy, PoisonValue::get(RetTy), A, uint64_t(0)); |
| 2410 | auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B, |
| 2411 | TyA->getMinNumElements()); |
Usman Nadeem | 267d6b5 | 2024-02-15 10:40:09 -0800 | [diff] [blame] | 2412 | ConcatVec->takeName(&II); |
| 2413 | return IC.replaceInstUsesWith(II, ConcatVec); |
| 2414 | } |
| 2415 | } |
| 2416 | |
| 2417 | return std::nullopt; |
| 2418 | } |
| 2419 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2420 | static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, |
| 2421 | IntrinsicInst &II) { |
Usman Nadeem | 757384a | 2021-09-12 15:53:26 -0700 | [diff] [blame] | 2422 | // zip1(uzp1(A, B), uzp2(A, B)) --> A |
| 2423 | // zip2(uzp1(A, B), uzp2(A, B)) --> B |
| 2424 | Value *A, *B; |
| 2425 | if (match(II.getArgOperand(0), |
| 2426 | m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && |
| 2427 | match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( |
| 2428 | m_Specific(A), m_Specific(B)))) |
| 2429 | return IC.replaceInstUsesWith( |
| 2430 | II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); |
| 2431 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2432 | return std::nullopt; |
Usman Nadeem | 757384a | 2021-09-12 15:53:26 -0700 | [diff] [blame] | 2433 | } |
| 2434 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2435 | static std::optional<Instruction *> |
| 2436 | instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2437 | Value *Mask = II.getOperand(0); |
| 2438 | Value *BasePtr = II.getOperand(1); |
| 2439 | Value *Index = II.getOperand(2); |
| 2440 | Type *Ty = II.getType(); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2441 | Value *PassThru = ConstantAggregateZero::get(Ty); |
| 2442 | |
| 2443 | // Contiguous gather => masked load. |
| 2444 | // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) |
| 2445 | // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) |
| 2446 | Value *IndexBase; |
| 2447 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( |
| 2448 | m_Value(IndexBase), m_SpecificInt(1)))) { |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2449 | Align Alignment = |
Nikita Popov | 2d209d9 | 2024-06-27 16:38:15 +0200 | [diff] [blame] | 2450 | BasePtr->getPointerAlignment(II.getDataLayout()); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2451 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2452 | Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), |
| 2453 | BasePtr, IndexBase); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2454 | CallInst *MaskedLoad = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2455 | IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2456 | MaskedLoad->takeName(&II); |
| 2457 | return IC.replaceInstUsesWith(II, MaskedLoad); |
| 2458 | } |
| 2459 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2460 | return std::nullopt; |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2461 | } |
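// Sketch of the gather-to-contiguous-load rewrite above:
//   ld1.gather.index(%pg, %base, index(%i, 1))
//     --> masked.load(gep(%base, %i), align, %pg, zeroinitializer)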
| 2462 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2463 | static std::optional<Instruction *> |
| 2464 | instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2465 | Value *Val = II.getOperand(0); |
| 2466 | Value *Mask = II.getOperand(1); |
| 2467 | Value *BasePtr = II.getOperand(2); |
| 2468 | Value *Index = II.getOperand(3); |
| 2469 | Type *Ty = Val->getType(); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2470 | |
| 2471 | // Contiguous scatter => masked store. |
Nikita Popov | 3196ef8 | 2022-02-08 15:16:16 +0100 | [diff] [blame] | 2472 | // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2473 | // => (masked.store Value (gep BasePtr IndexBase) Align Mask) |
| 2474 | Value *IndexBase; |
| 2475 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( |
| 2476 | m_Value(IndexBase), m_SpecificInt(1)))) { |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2477 | Align Alignment = |
Nikita Popov | 2d209d9 | 2024-06-27 16:38:15 +0200 | [diff] [blame] | 2478 | BasePtr->getPointerAlignment(II.getDataLayout()); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2479 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2480 | Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), |
| 2481 | BasePtr, IndexBase); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2482 | (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2483 | |
| 2484 | return IC.eraseInstFromFunction(II); |
| 2485 | } |
| 2486 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2487 | return std::nullopt; |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2488 | } |
| 2489 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2490 | static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, |
| 2491 | IntrinsicInst &II) { |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2492 | Type *Int32Ty = IC.Builder.getInt32Ty(); |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2493 | Value *Pred = II.getOperand(0); |
| 2494 | Value *Vec = II.getOperand(1); |
| 2495 | Value *DivVec = II.getOperand(2); |
| 2496 | |
| 2497 | Value *SplatValue = getSplatValue(DivVec); |
| 2498 | ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); |
| 2499 | if (!SplatConstantInt) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2500 | return std::nullopt; |
Matthew Devereau | 1808fc1 | 2024-09-20 13:53:02 +0100 | [diff] [blame] | 2501 | |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2502 | APInt Divisor = SplatConstantInt->getValue(); |
Matthew Devereau | 1808fc1 | 2024-09-20 13:53:02 +0100 | [diff] [blame] | 2503 | const int64_t DivisorValue = Divisor.getSExtValue(); |
| 2504 | if (DivisorValue == -1) |
| 2505 | return std::nullopt; |
| 2506 | if (DivisorValue == 1) |
| 2507 | IC.replaceInstUsesWith(II, Vec); |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2508 | |
| 2509 | if (Divisor.isPowerOf2()) { |
| 2510 | Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2511 | auto ASRD = IC.Builder.CreateIntrinsic( |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2512 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); |
| 2513 | return IC.replaceInstUsesWith(II, ASRD); |
| 2514 | } |
| 2515 | if (Divisor.isNegatedPowerOf2()) { |
| 2516 | Divisor.negate(); |
| 2517 | Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2518 | auto ASRD = IC.Builder.CreateIntrinsic( |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2519 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2520 | auto NEG = IC.Builder.CreateIntrinsic( |
| 2521 | Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD}); |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2522 | return IC.replaceInstUsesWith(II, NEG); |
| 2523 | } |
| 2524 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2525 | return std::nullopt; |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2526 | } |
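// Examples of the folds above for a splatted constant divisor:
//   sdiv(%pg, %x, splat(8))  --> asrd(%pg, %x, 3)
//   sdiv(%pg, %x, splat(-8)) --> neg applied to asrd(%pg, %x, 3)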
| 2527 | |
Matt Devereau | 48df06f | 2023-01-16 14:21:18 +0000 | [diff] [blame] | 2528 | bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2529 | size_t VecSize = Vec.size(); |
| 2530 | if (VecSize == 1) |
| 2531 | return true; |
| 2532 | if (!isPowerOf2_64(VecSize)) |
| 2533 | return false; |
| 2534 | size_t HalfVecSize = VecSize / 2; |
| 2535 | |
| 2536 | for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; |
| 2537 | RHS != Vec.end(); LHS++, RHS++) { |
Matt Devereau | 48df06f | 2023-01-16 14:21:18 +0000 | [diff] [blame] | 2538 | if (*LHS != nullptr && *RHS != nullptr) { |
| 2539 | if (*LHS == *RHS) |
| 2540 | continue; |
| 2541 | else |
| 2542 | return false; |
| 2543 | } |
| 2544 | if (!AllowPoison) |
| 2545 | return false; |
| 2546 | if (*LHS == nullptr && *RHS != nullptr) |
| 2547 | *LHS = *RHS; |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2548 | } |
| 2549 | |
| 2550 | Vec.resize(HalfVecSize); |
Matt Devereau | 48df06f | 2023-01-16 14:21:18 +0000 | [diff] [blame] | 2551 | SimplifyValuePattern(Vec, AllowPoison); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2552 | return true; |
| 2553 | } |
| 2554 | |
| 2555 | // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) |
| 2556 | // to dupqlane(f64(C)) where C is A concatenated with B |
| 2557 | static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, |
| 2558 | IntrinsicInst &II) { |
| 2559 | Value *CurrentInsertElt = nullptr, *Default = nullptr; |
| 2560 | if (!match(II.getOperand(0), |
| 2561 | m_Intrinsic<Intrinsic::vector_insert>( |
| 2562 | m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || |
| 2563 | !isa<FixedVectorType>(CurrentInsertElt->getType())) |
| 2564 | return std::nullopt; |
| 2565 | auto IIScalableTy = cast<ScalableVectorType>(II.getType()); |
| 2566 | |
| 2567 | // Insert the scalars into a container ordered by InsertElement index |
| 2568 | SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); |
| 2569 | while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) { |
| 2570 | auto Idx = cast<ConstantInt>(InsertElt->getOperand(2)); |
| 2571 | Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1); |
| 2572 | CurrentInsertElt = InsertElt->getOperand(0); |
| 2573 | } |
| 2574 | |
Matt Devereau | 48df06f | 2023-01-16 14:21:18 +0000 | [diff] [blame] | 2575 | bool AllowPoison = |
| 2576 | isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default); |
| 2577 | if (!SimplifyValuePattern(Elts, AllowPoison)) |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2578 | return std::nullopt; |
| 2579 | |
| 2580 | // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2581 | Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType()); |
| 2582 | for (size_t I = 0; I < Elts.size(); I++) { |
Matt Devereau | 48df06f | 2023-01-16 14:21:18 +0000 | [diff] [blame] | 2583 | if (Elts[I] == nullptr) |
| 2584 | continue; |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2585 | InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I], |
| 2586 | IC.Builder.getInt64(I)); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2587 | } |
Matt Devereau | 48df06f | 2023-01-16 14:21:18 +0000 | [diff] [blame] | 2588 | if (InsertEltChain == nullptr) |
| 2589 | return std::nullopt; |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2590 | |
| 2591 | // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
| 2592 | // value or (f16 a, f16 b) as one i32 value. This requires the inserted
| 2593 | // subvector to be bitcast to a type wide enough to hold the sequence,
| 2594 | // splatted, and then narrowed back to the original type.
| 2595 | unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); |
| 2596 | unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * |
| 2597 | IIScalableTy->getMinNumElements() / |
| 2598 | PatternWidth; |
| 2599 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2600 | IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2601 | auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount); |
| 2602 | auto *WideShuffleMaskTy = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2603 | ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2604 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2605 | auto InsertSubvector = IC.Builder.CreateInsertVector( |
Craig Topper | 123758b | 2025-05-02 16:10:18 -0700 | [diff] [blame] | 2606 | II.getType(), PoisonValue::get(II.getType()), InsertEltChain, |
| 2607 | uint64_t(0)); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2608 | auto WideBitcast = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2609 | IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2610 | auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy); |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2611 | auto WideShuffle = IC.Builder.CreateShuffleVector( |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2612 | WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask); |
| 2613 | auto NarrowBitcast = |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2614 | IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType()); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2615 | |
| 2616 | return IC.replaceInstUsesWith(II, NarrowBitcast); |
| 2617 | } |
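// Worked example of the simplification above (a sketch; exact types and
// mangled intrinsic names elided): dupq.lane of <f32 a, f32 b, f32 a, f32 b>
// is rebuilt as the two-element sequence <a, b>, inserted into a scalable
// vector, bitcast to <vscale x 2 x i64>, splatted with a zero shuffle mask,
// and bitcast back to <vscale x 4 x float>.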
| 2618 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2619 | static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, |
| 2620 | IntrinsicInst &II) { |
Florian Hahn | 17a7399 | 2022-05-10 19:57:43 +0100 | [diff] [blame] | 2621 | Value *A = II.getArgOperand(0); |
| 2622 | Value *B = II.getArgOperand(1); |
| 2623 | if (A == B) |
| 2624 | return IC.replaceInstUsesWith(II, A); |
| 2625 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2626 | return std::nullopt; |
Florian Hahn | 17a7399 | 2022-05-10 19:57:43 +0100 | [diff] [blame] | 2627 | } |
| 2628 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2629 | static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, |
| 2630 | IntrinsicInst &II) { |
Bradley Smith | 5f4541f | 2022-05-06 14:45:56 +0000 | [diff] [blame] | 2631 | Value *Pred = II.getOperand(0); |
| 2632 | Value *Vec = II.getOperand(1); |
| 2633 | Value *Shift = II.getOperand(2); |
| 2634 | |
| 2635 | // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. |
| 2636 | Value *AbsPred, *MergedValue; |
| 2637 | if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( |
| 2638 | m_Value(MergedValue), m_Value(AbsPred), m_Value())) && |
| 2639 | !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( |
| 2640 | m_Value(MergedValue), m_Value(AbsPred), m_Value()))) |
| 2641 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2642 | return std::nullopt; |
Bradley Smith | 5f4541f | 2022-05-06 14:45:56 +0000 | [diff] [blame] | 2643 | |
| 2644 | // Transform is valid if any of the following are true: |
| 2645 | // * The ABS merge value is an undef or non-negative |
| 2646 | // * The ABS predicate is all active |
| 2647 | // * The ABS predicate and the SRSHL predicates are the same |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2648 | if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) && |
Bradley Smith | 5f4541f | 2022-05-06 14:45:56 +0000 | [diff] [blame] | 2649 | AbsPred != Pred && !isAllActivePredicate(AbsPred)) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2650 | return std::nullopt; |
Bradley Smith | 5f4541f | 2022-05-06 14:45:56 +0000 | [diff] [blame] | 2651 | |
| 2652 | // Only valid when the shift amount is non-negative, otherwise the rounding |
| 2653 | // behaviour of SRSHL cannot be ignored. |
| 2654 | if (!match(Shift, m_NonNegative())) |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2655 | return std::nullopt; |
Bradley Smith | 5f4541f | 2022-05-06 14:45:56 +0000 | [diff] [blame] | 2656 | |
Nikita Popov | 724f4a5 | 2023-05-16 18:11:17 +0200 | [diff] [blame] | 2657 | auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, |
| 2658 | {II.getType()}, {Pred, Vec, Shift}); |
Bradley Smith | 5f4541f | 2022-05-06 14:45:56 +0000 | [diff] [blame] | 2659 | |
| 2660 | return IC.replaceInstUsesWith(II, LSL); |
| 2661 | } |
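// Sketch of the SRSHL --> LSL rewrite above:
//   %a = abs(undef, %pg, %x)
//   srshl(%pg, %a, splat(2)) --> lsl(%pg, %a, splat(2))
// The rounding behaviour only differs for negative shift amounts, which the
// guards above rule out.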
| 2662 | |
Paul Walker | 622ae7f | 2024-09-24 15:11:36 +0100 | [diff] [blame] | 2663 | static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC, |
| 2664 | IntrinsicInst &II) { |
| 2665 | Value *Vec = II.getOperand(0); |
| 2666 | |
| 2667 | if (getSplatValue(Vec) == II.getOperand(1)) |
| 2668 | return IC.replaceInstUsesWith(II, Vec); |
| 2669 | |
| 2670 | return std::nullopt; |
| 2671 | } |
| 2672 | |
Danila Malyutin | 1a60905 | 2024-10-17 21:04:04 +0400 | [diff] [blame] | 2673 | static std::optional<Instruction *> instCombineDMB(InstCombiner &IC, |
| 2674 | IntrinsicInst &II) { |
| 2675 | // If this barrier is post-dominated by an identical one, we can remove it.
| 2676 | auto *NI = II.getNextNonDebugInstruction(); |
| 2677 | unsigned LookaheadThreshold = DMBLookaheadThreshold; |
| 2678 | auto CanSkipOver = [](Instruction *I) { |
| 2679 | return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); |
| 2680 | }; |
| 2681 | while (LookaheadThreshold-- && CanSkipOver(NI)) { |
| 2682 | auto *NIBB = NI->getParent(); |
| 2683 | NI = NI->getNextNonDebugInstruction(); |
| 2684 | if (!NI) { |
| 2685 | if (auto *SuccBB = NIBB->getUniqueSuccessor()) |
Jeremy Morse | 81d18ad8 | 2025-01-27 16:27:54 +0000 | [diff] [blame] | 2686 | NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime(); |
Danila Malyutin | 1a60905 | 2024-10-17 21:04:04 +0400 | [diff] [blame] | 2687 | else |
| 2688 | break; |
| 2689 | } |
| 2690 | } |
| 2691 | auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI); |
| 2692 | if (NextII && II.isIdenticalTo(NextII)) |
| 2693 | return IC.eraseInstFromFunction(II); |
| 2694 | |
| 2695 | return std::nullopt; |
| 2696 | } |
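// Example: two back-to-back barriers with no intervening memory access or
// other side effects, such as
//   call void @llvm.aarch64.dmb(i32 11)   ; removable
//   call void @llvm.aarch64.dmb(i32 11)
// collapse to a single barrier; the search ahead is bounded by
// DMBLookaheadThreshold instructions.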
| 2697 | |
Matthew Devereau | 91a2056 | 2025-04-13 20:40:51 +0100 | [diff] [blame] | 2698 | static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC, |
| 2699 | IntrinsicInst &II) { |
| 2700 | if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>())) |
| 2701 | return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType())); |
| 2702 | return std::nullopt; |
| 2703 | } |
| 2704 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2705 | std::optional<Instruction *> |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 2706 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, |
| 2707 | IntrinsicInst &II) const { |
Paul Walker | c192737 | 2025-04-01 13:27:46 +0100 | [diff] [blame] | 2708 | const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II); |
| 2709 | if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo)) |
| 2710 | return I; |
| 2711 | |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 2712 | Intrinsic::ID IID = II.getIntrinsicID(); |
| 2713 | switch (IID) { |
| 2714 | default: |
| 2715 | break; |
Danila Malyutin | 1a60905 | 2024-10-17 21:04:04 +0400 | [diff] [blame] | 2716 | case Intrinsic::aarch64_dmb: |
| 2717 | return instCombineDMB(IC, II); |
Florian Hahn | 17a7399 | 2022-05-10 19:57:43 +0100 | [diff] [blame] | 2718 | case Intrinsic::aarch64_neon_fmaxnm: |
| 2719 | case Intrinsic::aarch64_neon_fminnm: |
| 2720 | return instCombineMaxMinNM(IC, II); |
Bradley Smith | c8f20ed | 2021-04-26 16:19:25 +0100 | [diff] [blame] | 2721 | case Intrinsic::aarch64_sve_convert_from_svbool: |
| 2722 | return instCombineConvertFromSVBool(IC, II); |
Bradley Smith | 89085bc | 2021-04-23 13:55:42 +0100 | [diff] [blame] | 2723 | case Intrinsic::aarch64_sve_dup: |
| 2724 | return instCombineSVEDup(IC, II); |
Usman Nadeem | ab111e9 | 2021-09-10 17:57:29 -0700 | [diff] [blame] | 2725 | case Intrinsic::aarch64_sve_dup_x: |
| 2726 | return instCombineSVEDupX(IC, II); |
Bradley Smith | 60c9b5f | 2021-05-20 11:13:34 +0100 | [diff] [blame] | 2727 | case Intrinsic::aarch64_sve_cmpne: |
| 2728 | case Intrinsic::aarch64_sve_cmpne_wide: |
| 2729 | return instCombineSVECmpNE(IC, II); |
Peter Waller | 2d574a1 | 2021-05-12 14:47:22 +0000 | [diff] [blame] | 2730 | case Intrinsic::aarch64_sve_rdffr: |
| 2731 | return instCombineRDFFR(IC, II); |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 2732 | case Intrinsic::aarch64_sve_lasta: |
| 2733 | case Intrinsic::aarch64_sve_lastb: |
| 2734 | return instCombineSVELast(IC, II); |
Cullen Rhodes | 7c3cda5 | 2022-07-08 15:18:27 +0000 | [diff] [blame] | 2735 | case Intrinsic::aarch64_sve_clasta_n: |
| 2736 | case Intrinsic::aarch64_sve_clastb_n: |
| 2737 | return instCombineSVECondLast(IC, II); |
Jun Ma | ae543394 | 2021-06-18 11:55:01 +0800 | [diff] [blame] | 2738 | case Intrinsic::aarch64_sve_cntd: |
| 2739 | return instCombineSVECntElts(IC, II, 2); |
| 2740 | case Intrinsic::aarch64_sve_cntw: |
| 2741 | return instCombineSVECntElts(IC, II, 4); |
| 2742 | case Intrinsic::aarch64_sve_cnth: |
| 2743 | return instCombineSVECntElts(IC, II, 8); |
| 2744 | case Intrinsic::aarch64_sve_cntb: |
| 2745 | return instCombineSVECntElts(IC, II, 16); |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2746 | case Intrinsic::aarch64_sve_ptest_any: |
| 2747 | case Intrinsic::aarch64_sve_ptest_first: |
| 2748 | case Intrinsic::aarch64_sve_ptest_last: |
| 2749 | return instCombineSVEPTest(IC, II); |
Matthew Devereau | f085a9d | 2021-09-01 16:41:42 +0100 | [diff] [blame] | 2750 | case Intrinsic::aarch64_sve_fadd: |
Paul Walker | c7c71aa | 2023-06-17 16:48:09 +0100 | [diff] [blame] | 2751 | return instCombineSVEVectorFAdd(IC, II); |
Jolanta Jensen | dc63b35 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2752 | case Intrinsic::aarch64_sve_fadd_u: |
Paul Walker | c7c71aa | 2023-06-17 16:48:09 +0100 | [diff] [blame] | 2753 | return instCombineSVEVectorFAddU(IC, II); |
Jolanta Jensen | ecb07f4 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2754 | case Intrinsic::aarch64_sve_fmul_u: |
Paul Walker | a7999f3 | 2025-04-17 15:58:39 +0100 | [diff] [blame] | 2755 | return instCombineSVEVectorBinOp(IC, II); |
Jolanta Jensen | ecb07f4 | 2023-05-17 09:21:40 +0000 | [diff] [blame] | 2756 | case Intrinsic::aarch64_sve_fsub: |
| 2757 | return instCombineSVEVectorFSub(IC, II); |
| 2758 | case Intrinsic::aarch64_sve_fsub_u: |
| 2759 | return instCombineSVEVectorFSubU(IC, II); |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2760 | case Intrinsic::aarch64_sve_add: |
| 2761 | return instCombineSVEVectorAdd(IC, II); |
Jolanta Jensen | 105d63a | 2023-05-12 13:00:55 +0000 | [diff] [blame] | 2762 | case Intrinsic::aarch64_sve_add_u: |
| 2763 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
| 2764 | Intrinsic::aarch64_sve_mla_u>( |
| 2765 | IC, II, true); |
Matt Devereau | a107cf0 | 2022-12-15 16:09:13 +0000 | [diff] [blame] | 2766 | case Intrinsic::aarch64_sve_sub: |
| 2767 | return instCombineSVEVectorSub(IC, II); |
Jolanta Jensen | 105d63a | 2023-05-12 13:00:55 +0000 | [diff] [blame] | 2768 | case Intrinsic::aarch64_sve_sub_u: |
| 2769 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
| 2770 | Intrinsic::aarch64_sve_mls_u>( |
| 2771 | IC, II, true); |
Bradley Smith | 191f9fa | 2021-07-13 14:42:36 +0000 | [diff] [blame] | 2772 | case Intrinsic::aarch64_sve_tbl: |
| 2773 | return instCombineSVETBL(IC, II); |
Usman Nadeem | 5420fc4 | 2021-08-05 17:23:01 -0700 | [diff] [blame] | 2774 | case Intrinsic::aarch64_sve_uunpkhi: |
| 2775 | case Intrinsic::aarch64_sve_uunpklo: |
| 2776 | case Intrinsic::aarch64_sve_sunpkhi: |
| 2777 | case Intrinsic::aarch64_sve_sunpklo: |
| 2778 | return instCombineSVEUnpack(IC, II); |
Usman Nadeem | 267d6b5 | 2024-02-15 10:40:09 -0800 | [diff] [blame] | 2779 | case Intrinsic::aarch64_sve_uzp1: |
| 2780 | return instCombineSVEUzp1(IC, II); |
Usman Nadeem | 757384a | 2021-09-12 15:53:26 -0700 | [diff] [blame] | 2781 | case Intrinsic::aarch64_sve_zip1: |
| 2782 | case Intrinsic::aarch64_sve_zip2: |
| 2783 | return instCombineSVEZip(IC, II); |
Peter Waller | 7a34145 | 2021-11-03 13:40:22 +0000 | [diff] [blame] | 2784 | case Intrinsic::aarch64_sve_ld1_gather_index: |
| 2785 | return instCombineLD1GatherIndex(IC, II); |
| 2786 | case Intrinsic::aarch64_sve_st1_scatter_index: |
| 2787 | return instCombineST1ScatterIndex(IC, II); |
Matt Devereau | f526c60 | 2021-11-04 16:10:55 +0000 | [diff] [blame] | 2788 | case Intrinsic::aarch64_sve_ld1: |
| 2789 | return instCombineSVELD1(IC, II, DL); |
| 2790 | case Intrinsic::aarch64_sve_st1: |
| 2791 | return instCombineSVEST1(IC, II, DL); |
Matt Devereau | fb47725 | 2021-12-09 15:32:35 +0000 | [diff] [blame] | 2792 | case Intrinsic::aarch64_sve_sdiv: |
| 2793 | return instCombineSVESDIV(IC, II); |
Matt Devereau | a9e08bc | 2022-03-16 11:41:14 +0000 | [diff] [blame] | 2794 | case Intrinsic::aarch64_sve_sel: |
| 2795 | return instCombineSVESel(IC, II); |
Bradley Smith | 5f4541f | 2022-05-06 14:45:56 +0000 | [diff] [blame] | 2796 | case Intrinsic::aarch64_sve_srshl: |
| 2797 | return instCombineSVESrshl(IC, II); |
Matt Devereau | e18b971 | 2022-12-16 11:19:28 +0000 | [diff] [blame] | 2798 | case Intrinsic::aarch64_sve_dupq_lane: |
| 2799 | return instCombineSVEDupqLane(IC, II); |
Paul Walker | 622ae7f | 2024-09-24 15:11:36 +0100 | [diff] [blame] | 2800 | case Intrinsic::aarch64_sve_insr: |
| 2801 | return instCombineSVEInsr(IC, II); |
Matthew Devereau | 91a2056 | 2025-04-13 20:40:51 +0100 | [diff] [blame] | 2802 | case Intrinsic::aarch64_sve_ptrue: |
| 2803 | return instCombinePTrue(IC, II); |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 2804 | } |
| 2805 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2806 | return std::nullopt; |
Joe Ellis | c91cd4f | 2021-04-16 10:05:05 +0000 | [diff] [blame] | 2807 | } |
| 2808 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2809 | std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( |
David Green | 61888d9 | 2022-01-13 11:53:12 +0000 | [diff] [blame] | 2810 | InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, |
| 2811 | APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, |
| 2812 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
| 2813 | SimplifyAndSetOp) const { |
| 2814 | switch (II.getIntrinsicID()) { |
| 2815 | default: |
| 2816 | break; |
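| | // These NEON narrowing intrinsics operate lane-wise on their first operand, |
| | // so the demanded elements of the result map directly onto the demanded |
| | // elements of operand 0. |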
| 2817 | case Intrinsic::aarch64_neon_fcvtxn: |
| 2818 | case Intrinsic::aarch64_neon_rshrn: |
| 2819 | case Intrinsic::aarch64_neon_sqrshrn: |
| 2820 | case Intrinsic::aarch64_neon_sqrshrun: |
| 2821 | case Intrinsic::aarch64_neon_sqshrn: |
| 2822 | case Intrinsic::aarch64_neon_sqshrun: |
| 2823 | case Intrinsic::aarch64_neon_sqxtn: |
| 2824 | case Intrinsic::aarch64_neon_sqxtun: |
| 2825 | case Intrinsic::aarch64_neon_uqrshrn: |
| 2826 | case Intrinsic::aarch64_neon_uqshrn: |
| 2827 | case Intrinsic::aarch64_neon_uqxtn: |
| 2828 | SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); |
| 2829 | break; |
| 2830 | } |
| 2831 | |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 2832 | return std::nullopt; |
David Green | 61888d9 | 2022-01-13 11:53:12 +0000 | [diff] [blame] | 2833 | } |
| 2834 | |
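| | // Scalable vectorization is possible when full SVE is available, or in |
| | // streaming mode when EnableScalableAutovecInStreamingMode is set. |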
Paul Walker | 7775a48 | 2024-08-05 11:25:44 +0100 | [diff] [blame] | 2835 | bool AArch64TTIImpl::enableScalableVectorization() const { |
| 2836 | return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
| 2837 | EnableScalableAutovecInStreamingMode); |
| 2838 | } |
| 2839 | |
Sander de Smalen | 137459a | 2022-10-19 14:14:00 +0000 | [diff] [blame] | 2840 | TypeSize |
| 2841 | AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
| 2842 | switch (K) { |
| 2843 | case TargetTransformInfo::RGK_Scalar: |
Sander de Smalen | 81b7f11 | 2023-11-22 08:52:53 +0000 | [diff] [blame] | 2844 | return TypeSize::getFixed(64); |
Sander de Smalen | 137459a | 2022-10-19 14:14:00 +0000 | [diff] [blame] | 2845 | case TargetTransformInfo::RGK_FixedWidthVector: |
Sander de Smalen | 738533c | 2024-06-24 11:06:16 +0100 | [diff] [blame] | 2846 | if (ST->useSVEForFixedLengthVectors() && |
| 2847 | (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) |
Sander de Smalen | 81b7f11 | 2023-11-22 08:52:53 +0000 | [diff] [blame] | 2848 | return TypeSize::getFixed( |
| 2849 | std::max(ST->getMinSVEVectorSizeInBits(), 128u)); |
Sander de Smalen | 738533c | 2024-06-24 11:06:16 +0100 | [diff] [blame] | 2850 | else if (ST->isNeonAvailable()) |
| 2851 | return TypeSize::getFixed(128); |
| 2852 | else |
| 2853 | return TypeSize::getFixed(0); |
Sander de Smalen | 137459a | 2022-10-19 14:14:00 +0000 | [diff] [blame] | 2854 | case TargetTransformInfo::RGK_ScalableVector: |
Sander de Smalen | 738533c | 2024-06-24 11:06:16 +0100 | [diff] [blame] | 2855 | if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
| 2856 | EnableScalableAutovecInStreamingMode)) |
| 2857 | return TypeSize::getScalable(128); |
| 2858 | else |
Sander de Smalen | 81b7f11 | 2023-11-22 08:52:53 +0000 | [diff] [blame] | 2859 | return TypeSize::getScalable(0); |
Sander de Smalen | 137459a | 2022-10-19 14:14:00 +0000 | [diff] [blame] | 2860 | } |
| 2861 | llvm_unreachable("Unsupported register kind"); |
| 2862 | } |
| 2863 | |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2864 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2865 | ArrayRef<const Value *> Args, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 2866 | Type *SrcOverrideTy) const { |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2867 | // A helper that returns a vector type from the given type. Only the scalar |
David Kreitzer | 6918a15 | 2022-04-29 12:26:13 -0700 | [diff] [blame] | 2868 | // type of ArgTy is used; the element count is taken from DstTy. |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2869 | auto toVectorTy = [&](Type *ArgTy) { |
Caroline Concatto | 6c4d8f4 | 2020-11-11 14:41:01 +0000 | [diff] [blame] | 2870 | return VectorType::get(ArgTy->getScalarType(), |
| 2871 | cast<VectorType>(DstTy)->getElementCount()); |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2872 | }; |
| 2873 | |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2874 | // Exit early if DstTy is not a vector type whose elements are one of [i16, |
| 2875 | // i32, i64]. SVE doesn't generally have the same set of instructions to |
David Green | f2a92db | 2022-11-30 13:09:48 +0000 | [diff] [blame] | 2876 | // perform an extend with the add/sub/mul. There are SMULLB style |
| 2877 | // instructions, but they operate on top/bottom, requiring some sort of lane |
| 2878 | // interleaving to be used with zext/sext. |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2879 | unsigned DstEltSize = DstTy->getScalarSizeInBits(); |
| 2880 | if (!useNeonVector(DstTy) || Args.size() != 2 || |
| 2881 | (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2882 | return false; |
| 2883 | |
| 2884 | // Determine if the operation has a widening variant. We consider both the |
| 2885 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the |
| 2886 | // instructions. |
| 2887 | // |
David Green | 2abaa02 | 2022-04-04 12:45:04 +0100 | [diff] [blame] | 2888 | // TODO: Add additional widening operations (e.g., shl, etc.) once we |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2889 | // verify that their extending operands are eliminated during code |
| 2890 | // generation. |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2891 | Type *SrcTy = SrcOverrideTy; |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2892 | switch (Opcode) { |
| 2893 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). |
| 2894 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2895 | // The second operand needs to be an extend. |
| 2896 | if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { |
| 2897 | if (!SrcTy) |
| 2898 | SrcTy = |
| 2899 | toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); |
| 2900 | } else |
| 2901 | return false; |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2902 | break; |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2903 | case Instruction::Mul: { // SMULL(2), UMULL(2) |
| 2904 | // Both operands need to be extends of the same type. |
| 2905 | if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || |
| 2906 | (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { |
| 2907 | if (!SrcTy) |
| 2908 | SrcTy = |
| 2909 | toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); |
| 2910 | } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { |
| 2911 | // If one of the operands is a Zext and the other has enough zero bits to |
| 2912 | // be treated as unsigned, we can still generate a umull, meaning the zext |
| 2913 | // is free. |
| 2914 | KnownBits Known = |
| 2915 | computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); |
| 2916 | if (Args[0]->getType()->getScalarSizeInBits() - |
| 2917 | Known.Zero.countLeadingOnes() > |
| 2918 | DstTy->getScalarSizeInBits() / 2) |
| 2919 | return false; |
| 2920 | if (!SrcTy) |
| 2921 | SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), |
| 2922 | DstTy->getScalarSizeInBits() / 2)); |
| 2923 | } else |
| 2924 | return false; |
| 2925 | break; |
| 2926 | } |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2927 | default: |
| 2928 | return false; |
| 2929 | } |
| 2930 | |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2931 | // Legalize the destination type and ensure it can be used in a widening |
| 2932 | // operation. |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 2933 | auto DstTyL = getTypeLegalizationCost(DstTy); |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2934 | if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits()) |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2935 | return false; |
| 2936 | |
| 2937 | // Legalize the source type and ensure it can be used in a widening |
| 2938 | // operation. |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2939 | assert(SrcTy && "Expected some SrcTy"); |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 2940 | auto SrcTyL = getTypeLegalizationCost(SrcTy); |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2941 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); |
| 2942 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) |
| 2943 | return false; |
| 2944 | |
| 2945 | // Get the total number of vector elements in the legalized types. |
Daniil Fukalov | 3489c2d | 2021-04-29 16:02:51 +0300 | [diff] [blame] | 2946 | InstructionCost NumDstEls = |
| 2947 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); |
| 2948 | InstructionCost NumSrcEls = |
| 2949 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2950 | |
| 2951 | // Return true if the legalized types have the same number of vector elements |
| 2952 | // and the destination element type size is twice that of the source type. |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 2953 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 2954 | } |
| 2955 | |
Kerry McLaughlin | 9a98ab5 | 2023-08-29 08:15:29 +0000 | [diff] [blame] | 2956 | // s/urhadd instructions implement the following pattern, making the |
| 2957 | // extends free: |
| 2958 | // %x = add ((zext i8 -> i16), 1) |
| 2959 | // %y = (zext i8 -> i16) |
| 2960 | // trunc i16 (lshr (add %x, %y), 1) -> i8 |
| 2961 | // |
zhongyunde | f41223ee | 2023-09-01 23:40:21 +0800 | [diff] [blame] | 2962 | bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 2963 | Type *Src) const { |
Kerry McLaughlin | 9a98ab5 | 2023-08-29 08:15:29 +0000 | [diff] [blame] | 2964 | // The source should be a legal vector type. |
| 2965 | if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) || |
| 2966 | (Src->isScalableTy() && !ST->hasSVE2())) |
| 2967 | return false; |
| 2968 | |
| 2969 | if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse()) |
| 2970 | return false; |
| 2971 | |
| 2972 | // Look for trunc/lshr/add before trying to match the pattern. |
| 2973 | const Instruction *Add = ExtUser; |
| 2974 | auto *AddUser = |
| 2975 | dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); |
| 2976 | if (AddUser && AddUser->getOpcode() == Instruction::Add) |
| 2977 | Add = AddUser; |
| 2978 | |
| 2979 | auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser()); |
| 2980 | if (!Shr || Shr->getOpcode() != Instruction::LShr) |
| 2981 | return false; |
| 2982 | |
| 2983 | auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser()); |
| 2984 | if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || |
| 2985 | Src->getScalarSizeInBits() != |
| 2986 | cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits()) |
| 2987 | return false; |
| 2988 | |
| 2989 | // Try to match the whole pattern. Ext could be either the first or second |
| 2990 | // m_ZExtOrSExt matched. |
| 2991 | Instruction *Ex1, *Ex2; |
| 2992 | if (!(match(Add, m_c_Add(m_Instruction(Ex1), |
| 2993 | m_c_Add(m_Instruction(Ex2), m_SpecificInt(1)))))) |
| 2994 | return false; |
| 2995 | |
| 2996 | // Ensure both extends are of the same type. |
| 2997 | if (match(Ex1, m_ZExtOrSExt(m_Value())) && |
| 2998 | Ex1->getOpcode() == Ex2->getOpcode()) |
| 2999 | return true; |
| 3000 | |
| 3001 | return false; |
| 3002 | } |
| 3003 | |
Sander de Smalen | 92d8421 | 2021-01-21 13:40:22 +0000 | [diff] [blame] | 3004 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
| 3005 | Type *Src, |
| 3006 | TTI::CastContextHint CCH, |
| 3007 | TTI::TargetCostKind CostKind, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 3008 | const Instruction *I) const { |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3009 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 3010 | assert(ISD && "Invalid opcode"); |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 3011 | // If the cast is observable, and it is used by a widening instruction (e.g., |
| 3012 | // uaddl, saddw, etc.), it may be free. |
David Green | 2abaa02 | 2022-04-04 12:45:04 +0100 | [diff] [blame] | 3013 | if (I && I->hasOneUser()) { |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 3014 | auto *SingleUser = cast<Instruction>(*I->user_begin()); |
| 3015 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 3016 | if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { |
| 3017 | // For adds, only count the second operand as free if both operands are |
| 3018 | // extends but not the same operation (i.e. both operands are not free in |
| 3019 | // add(sext, zext)). |
| 3020 | if (SingleUser->getOpcode() == Instruction::Add) { |
| 3021 | if (I == SingleUser->getOperand(1) || |
| 3022 | (isa<CastInst>(SingleUser->getOperand(1)) && |
| 3023 | cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 3024 | return 0; |
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 3025 | } else // Others are free so long as isWideningInstruction returned true. |
| 3026 | return 0; |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 3027 | } |
Kerry McLaughlin | 9a98ab5 | 2023-08-29 08:15:29 +0000 | [diff] [blame] | 3028 | |
| 3029 | // The cast will be free for the s/urhadd instructions |
| 3030 | if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && |
zhongyunde | f41223ee | 2023-09-01 23:40:21 +0800 | [diff] [blame] | 3031 | isExtPartOfAvgExpr(SingleUser, Dst, Src)) |
Kerry McLaughlin | 9a98ab5 | 2023-08-29 08:15:29 +0000 | [diff] [blame] | 3032 | return 0; |
Matthew Simpson | 78fd46b | 2017-05-09 20:18:12 +0000 | [diff] [blame] | 3033 | } |
| 3034 | |
Sam Parker | 8aaabad | 2020-05-26 11:27:57 +0100 | [diff] [blame] | 3035 | // TODO: Allow non-throughput costs that aren't binary. |
Sander de Smalen | 92d8421 | 2021-01-21 13:40:22 +0000 | [diff] [blame] | 3036 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { |
Sam Parker | 8aaabad | 2020-05-26 11:27:57 +0100 | [diff] [blame] | 3037 | if (CostKind != TTI::TCK_RecipThroughput) |
| 3038 | return Cost == 0 ? 0 : 1; |
| 3039 | return Cost; |
| 3040 | }; |
| 3041 | |
Mehdi Amini | 44ede33 | 2015-07-09 02:09:04 +0000 | [diff] [blame] | 3042 | EVT SrcTy = TLI->getValueType(DL, Src); |
| 3043 | EVT DstTy = TLI->getValueType(DL, Dst); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3044 | |
| 3045 | if (!SrcTy.isSimple() || !DstTy.isSimple()) |
David Green | 60280e9 | 2020-07-29 13:32:53 +0100 | [diff] [blame] | 3046 | return AdjustCost( |
| 3047 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3048 | |
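| | // With +bf16, rounds to bf16 map directly onto bfcvt/bfcvtn, so the cheaper |
| | // costs below apply in that case. |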
David Green | 2db7b31 | 2025-01-07 09:39:08 +0000 | [diff] [blame] | 3049 | static const TypeConversionCostTblEntry BF16Tbl[] = { |
| 3050 | {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt |
| 3051 | {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt |
| 3052 | {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn |
| 3053 | {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2 |
| 3054 | {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn |
| 3055 | {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn |
| 3056 | {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn |
| 3057 | }; |
| 3058 | |
| 3059 | if (ST->hasBF16()) |
| 3060 | if (const auto *Entry = ConvertCostTableLookup( |
| 3061 | BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) |
| 3062 | return AdjustCost(Entry->Cost); |
| 3063 | |
Graham Hunter | f737df7 | 2025-03-25 10:43:44 +0000 | [diff] [blame] | 3064 | // Symbolic constants for the SVE sitofp/uitofp entries in the table below. |
| 3065 | // The cost of unpacking twice is artificially increased for now in order |
| 3066 | // to avoid regressions against NEON, which will use tbl instructions directly |
| 3067 | // instead of multiple layers of [s|u]unpk[lo|hi]. |
| 3068 | // We use the unpacks in cases where the destination type is illegal and |
| 3069 | // requires splitting of the input, even if the input type itself is legal. |
| 3070 | const unsigned int SVE_EXT_COST = 1; |
| 3071 | const unsigned int SVE_FCVT_COST = 1; |
| 3072 | const unsigned int SVE_UNPACK_ONCE = 4; |
| 3073 | const unsigned int SVE_UNPACK_TWICE = 16; |
| 3074 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3075 | static const TypeConversionCostTblEntry ConversionTbl[] = { |
| 3076 | {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn |
| 3077 | {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn |
| 3078 | {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn |
| 3079 | {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn |
| 3080 | {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 |
| 3081 | {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn |
| 3082 | {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn |
| 3083 | {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 |
| 3084 | {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn |
| 3085 | {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn |
| 3086 | {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn |
| 3087 | {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 |
| 3088 | {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 |
| 3089 | {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 |
| 3090 | {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 |
| 3091 | {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 |
| 3092 | {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 |
| 3093 | {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 |
| 3094 | {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 |
| 3095 | {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 |
Silviu Baranga | b322aa6 | 2015-08-17 16:05:09 +0000 | [diff] [blame] | 3096 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3097 | // Truncations on nxvmiN |
David Sherwood | eaf482f | 2024-12-19 10:07:41 +0000 | [diff] [blame] | 3098 | {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2}, |
| 3099 | {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2}, |
| 3100 | {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2}, |
| 3101 | {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2}, |
| 3102 | {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2}, |
| 3103 | {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2}, |
| 3104 | {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2}, |
| 3105 | {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5}, |
| 3106 | {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2}, |
| 3107 | {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2}, |
| 3108 | {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5}, |
| 3109 | {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11}, |
| 3110 | {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2}, |
| 3111 | {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0}, |
| 3112 | {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0}, |
| 3113 | {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0}, |
| 3114 | {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0}, |
| 3115 | {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0}, |
| 3116 | {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0}, |
| 3117 | {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0}, |
| 3118 | {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0}, |
| 3119 | {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1}, |
| 3120 | {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0}, |
| 3121 | {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1}, |
| 3122 | {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1}, |
| 3123 | {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0}, |
| 3124 | {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1}, |
| 3125 | {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3}, |
| 3126 | {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1}, |
| 3127 | {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3}, |
| 3128 | {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1}, |
| 3129 | {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3}, |
| 3130 | {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3131 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3132 | // The number of shll instructions for the extension. |
| 3133 | {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3}, |
| 3134 | {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3}, |
| 3135 | {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2}, |
| 3136 | {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2}, |
| 3137 | {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3}, |
| 3138 | {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3}, |
| 3139 | {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2}, |
| 3140 | {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2}, |
| 3141 | {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7}, |
| 3142 | {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7}, |
| 3143 | {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6}, |
| 3144 | {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6}, |
| 3145 | {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2}, |
| 3146 | {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2}, |
| 3147 | {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6}, |
| 3148 | {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6}, |
Silviu Baranga | b322aa6 | 2015-08-17 16:05:09 +0000 | [diff] [blame] | 3149 | |
David Green | 2f18b5e | 2024-12-11 06:26:41 +0000 | [diff] [blame] | 3150 | // FP ext |
| 3151 | {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt |
| 3152 | {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl |
| 3153 | {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2 |
| 3154 | // FP16 |
| 3155 | {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt |
| 3156 | {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt |
| 3157 | {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl |
| 3158 | {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2 |
| 3159 | {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl |
| 3160 | {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl |
| 3161 | {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl |
David Green | 2db7b31 | 2025-01-07 09:39:08 +0000 | [diff] [blame] | 3162 | // BF16 (uses shift) |
| 3163 | {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl |
| 3164 | {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt |
| 3165 | {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll |
| 3166 | {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2 |
| 3167 | {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl |
| 3168 | {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2 |
| 3169 | {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2 |
David Green | 2f18b5e | 2024-12-11 06:26:41 +0000 | [diff] [blame] | 3170 | // FP Ext and trunc |
| 3171 | {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt |
| 3172 | {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn |
| 3173 | {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2 |
| 3174 | // FP16 |
| 3175 | {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt |
| 3176 | {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt |
| 3177 | {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn |
| 3178 | {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2 |
| 3179 | {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn |
| 3180 | {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn |
| 3181 | {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn |
David Green | 2db7b31 | 2025-01-07 09:39:08 +0000 | [diff] [blame] | 3182 | // BF16 (more complex, with +bf16 is handled above) |
| 3183 | {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns |
| 3184 | {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above |
| 3185 | {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8}, |
| 3186 | {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8}, |
| 3187 | {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15}, |
| 3188 | {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9}, |
| 3189 | {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10}, |
| 3190 | {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19}, |
David Green | 2f18b5e | 2024-12-11 06:26:41 +0000 | [diff] [blame] | 3191 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3192 | // LowerVectorINT_TO_FP: |
| 3193 | {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, |
| 3194 | {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, |
| 3195 | {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1}, |
| 3196 | {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, |
| 3197 | {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, |
| 3198 | {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1}, |
Tim Northover | ef0d760 | 2014-06-15 09:27:06 +0000 | [diff] [blame] | 3199 | |
Graham Hunter | f737df7 | 2025-03-25 10:43:44 +0000 | [diff] [blame] | 3200 | // SVE: to nxv2f16 |
| 3201 | {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8, |
| 3202 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3203 | {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST}, |
| 3204 | {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST}, |
| 3205 | {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST}, |
| 3206 | {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8, |
| 3207 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3208 | {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST}, |
| 3209 | {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST}, |
| 3210 | {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST}, |
| 3211 | |
| 3212 | // SVE: to nxv4f16 |
| 3213 | {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8, |
| 3214 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3215 | {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST}, |
| 3216 | {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST}, |
| 3217 | {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8, |
| 3218 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3219 | {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST}, |
| 3220 | {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST}, |
| 3221 | |
| 3222 | // SVE: to nxv8f16 |
| 3223 | {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8, |
| 3224 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3225 | {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST}, |
| 3226 | {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8, |
| 3227 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3228 | {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST}, |
| 3229 | |
| 3230 | // SVE: to nxv16f16 |
| 3231 | {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8, |
| 3232 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3233 | {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8, |
| 3234 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3235 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3236 | // Complex: to v2f32 |
| 3237 | {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, |
| 3238 | {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3}, |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3239 | {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, |
| 3240 | {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3}, |
Tim Northover | ef0d760 | 2014-06-15 09:27:06 +0000 | [diff] [blame] | 3241 | |
Graham Hunter | f737df7 | 2025-03-25 10:43:44 +0000 | [diff] [blame] | 3242 | // SVE: to nxv2f32 |
| 3243 | {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8, |
| 3244 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3245 | {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST}, |
| 3246 | {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST}, |
| 3247 | {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST}, |
| 3248 | {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8, |
| 3249 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3250 | {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST}, |
| 3251 | {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST}, |
| 3252 | {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST}, |
| 3253 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3254 | // Complex: to v4f32 |
| 3255 | {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4}, |
| 3256 | {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, |
| 3257 | {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, |
| 3258 | {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, |
Tim Northover | ef0d760 | 2014-06-15 09:27:06 +0000 | [diff] [blame] | 3259 | |
Graham Hunter | f737df7 | 2025-03-25 10:43:44 +0000 | [diff] [blame] | 3260 | // SVE: to nxv4f32 |
| 3261 | {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8, |
| 3262 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3263 | {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST}, |
| 3264 | {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST}, |
| 3265 | {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8, |
| 3266 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3267 | {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST}, |
| 3268 | {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST}, |
| 3269 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3270 | // Complex: to v8f32 |
| 3271 | {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, |
| 3272 | {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4}, |
| 3273 | {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, |
| 3274 | {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4}, |
Silviu Baranga | b322aa6 | 2015-08-17 16:05:09 +0000 | [diff] [blame] | 3275 | |
Graham Hunter | f737df7 | 2025-03-25 10:43:44 +0000 | [diff] [blame] | 3276 | // SVE: to nxv8f32 |
| 3277 | {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8, |
| 3278 | SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3279 | {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16, |
| 3280 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3281 | {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8, |
| 3282 | SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3283 | {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16, |
| 3284 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3285 | |
| 3286 | // SVE: to nxv16f32 |
| 3287 | {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8, |
| 3288 | SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3289 | {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8, |
| 3290 | SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3291 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3292 | // Complex: to v16f32 |
| 3293 | {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21}, |
| 3294 | {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21}, |
Silviu Baranga | b322aa6 | 2015-08-17 16:05:09 +0000 | [diff] [blame] | 3295 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3296 | // Complex: to v2f64 |
| 3297 | {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4}, |
| 3298 | {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4}, |
| 3299 | {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2}, |
| 3300 | {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4}, |
| 3301 | {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4}, |
| 3302 | {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2}, |
Tim Northover | ef0d760 | 2014-06-15 09:27:06 +0000 | [diff] [blame] | 3303 | |
Graham Hunter | f737df7 | 2025-03-25 10:43:44 +0000 | [diff] [blame] | 3304 | // SVE: to nxv2f64 |
| 3305 | {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8, |
| 3306 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3307 | {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST}, |
| 3308 | {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST}, |
| 3309 | {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST}, |
| 3310 | {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8, |
| 3311 | SVE_EXT_COST + SVE_FCVT_COST}, |
| 3312 | {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST}, |
| 3313 | {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST}, |
| 3314 | {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST}, |
| 3315 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3316 | // Complex: to v4f64 |
| 3317 | {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4}, |
| 3318 | {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4}, |
Tim Northover | ef0d760 | 2014-06-15 09:27:06 +0000 | [diff] [blame] | 3319 | |
Graham Hunter | f737df7 | 2025-03-25 10:43:44 +0000 | [diff] [blame] | 3320 | // SVE: to nxv4f64 |
| 3321 | {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8, |
| 3322 | SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3323 | {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16, |
| 3324 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3325 | {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32, |
| 3326 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3327 | {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8, |
| 3328 | SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3329 | {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16, |
| 3330 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3331 | {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32, |
| 3332 | SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3333 | |
| 3334 | // SVE: to nxv8f64 |
| 3335 | {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8, |
| 3336 | SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3337 | {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16, |
| 3338 | SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3339 | {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8, |
| 3340 | SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3341 | {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16, |
| 3342 | SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3343 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3344 | // LowerVectorFP_TO_INT |
| 3345 | {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1}, |
| 3346 | {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1}, |
| 3347 | {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1}, |
| 3348 | {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1}, |
| 3349 | {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1}, |
| 3350 | {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1}, |
Tim Northover | ef0d760 | 2014-06-15 09:27:06 +0000 | [diff] [blame] | 3351 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3352 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). |
| 3353 | {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2}, |
| 3354 | {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1}, |
| 3355 | {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1}, |
| 3356 | {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2}, |
| 3357 | {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1}, |
| 3358 | {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1}, |
Tim Northover | dbecc3b | 2014-06-15 09:27:15 +0000 | [diff] [blame] | 3359 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3360 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 |
| 3361 | {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2}, |
| 3362 | {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2}, |
| 3363 | {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2}, |
| 3364 | {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2}, |
Tim Northover | dbecc3b | 2014-06-15 09:27:15 +0000 | [diff] [blame] | 3365 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3366 | // Complex, from nxv2f32. |
| 3367 | {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1}, |
| 3368 | {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1}, |
| 3369 | {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1}, |
| 3370 | {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1}, |
| 3371 | {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1}, |
| 3372 | {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1}, |
| 3373 | {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1}, |
| 3374 | {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3375 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3376 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. |
| 3377 | {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2}, |
| 3378 | {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2}, |
| 3379 | {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2}, |
| 3380 | {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2}, |
| 3381 | {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2}, |
| 3382 | {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3383 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3384 | // Complex, from nxv2f64. |
| 3385 | {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1}, |
| 3386 | {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1}, |
| 3387 | {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1}, |
| 3388 | {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1}, |
Paul Walker | da4cbec | 2025-03-04 11:34:44 +0000 | [diff] [blame] | 3389 | {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1}, |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3390 | {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1}, |
| 3391 | {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1}, |
| 3392 | {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1}, |
| 3393 | {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1}, |
Paul Walker | da4cbec | 2025-03-04 11:34:44 +0000 | [diff] [blame] | 3394 | {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3395 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3396 | // Complex, from nxv4f32. |
| 3397 | {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4}, |
| 3398 | {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1}, |
| 3399 | {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1}, |
| 3400 | {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1}, |
Paul Walker | da4cbec | 2025-03-04 11:34:44 +0000 | [diff] [blame] | 3401 | {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1}, |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3402 | {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4}, |
| 3403 | {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1}, |
| 3404 | {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1}, |
| 3405 | {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1}, |
Paul Walker | da4cbec | 2025-03-04 11:34:44 +0000 | [diff] [blame] | 3406 | {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3407 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3408 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. |
| 3409 | {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7}, |
| 3410 | {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7}, |
| 3411 | {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7}, |
| 3412 | {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3413 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3414 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. |
| 3415 | {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3}, |
| 3416 | {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3}, |
| 3417 | {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3}, |
| 3418 | {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3}, |
| 3419 | {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3}, |
| 3420 | {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3421 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3422 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. |
| 3423 | {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3}, |
| 3424 | {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3}, |
| 3425 | {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3}, |
| 3426 | {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3}, |
David Sherwood | 57ca65e | 2021-04-06 11:06:58 +0100 | [diff] [blame] | 3427 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3428 | // Complex, from nxv8f16. |
| 3429 | {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10}, |
| 3430 | {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4}, |
| 3431 | {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1}, |
| 3432 | {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1}, |
Paul Walker | da4cbec | 2025-03-04 11:34:44 +0000 | [diff] [blame] | 3433 | {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1}, |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3434 | {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10}, |
| 3435 | {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4}, |
| 3436 | {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1}, |
| 3437 | {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1}, |
Paul Walker | da4cbec | 2025-03-04 11:34:44 +0000 | [diff] [blame] | 3438 | {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1}, |
David Sherwood | 57ca65e | 2021-04-06 11:06:58 +0100 | [diff] [blame] | 3439 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3440 | // Complex, from nxv4f16. |
| 3441 | {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4}, |
| 3442 | {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1}, |
| 3443 | {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1}, |
| 3444 | {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1}, |
| 3445 | {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4}, |
| 3446 | {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1}, |
| 3447 | {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1}, |
| 3448 | {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1}, |
David Sherwood | 57ca65e | 2021-04-06 11:06:58 +0100 | [diff] [blame] | 3449 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3450 | // Complex, from nxv2f16. |
| 3451 | {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1}, |
| 3452 | {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1}, |
| 3453 | {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1}, |
| 3454 | {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1}, |
| 3455 | {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1}, |
| 3456 | {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1}, |
| 3457 | {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1}, |
| 3458 | {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3459 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3460 | // Truncate from nxvmf32 to nxvmf16. |
| 3461 | {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1}, |
| 3462 | {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1}, |
| 3463 | {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3464 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3465 | // Truncate from nxvmf64 to nxvmf16. |
| 3466 | {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1}, |
| 3467 | {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3}, |
| 3468 | {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3469 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3470 | // Truncate from nxvmf64 to nxvmf32. |
| 3471 | {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1}, |
| 3472 | {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3}, |
| 3473 | {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3474 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3475 | // Extend from nxvmf16 to nxvmf32. |
| 3476 | {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, |
| 3477 | {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, |
| 3478 | {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3479 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3480 | // Extend from nxvmf16 to nxvmf64. |
| 3481 | {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, |
| 3482 | {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, |
| 3483 | {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3484 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3485 | // Extend from nxvmf32 to nxvmf64. |
| 3486 | {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, |
| 3487 | {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, |
| 3488 | {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, |
Nashe Mncube | 19601a4c | 2021-03-17 12:00:31 +0000 | [diff] [blame] | 3489 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3490 | // Bitcasts from integer to float |
| 3491 | {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0}, |
| 3492 | {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0}, |
| 3493 | {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0}, |
Alban Bridonneau | 2feddb3 | 2022-01-26 13:33:38 +0000 | [diff] [blame] | 3494 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3495 | // Bitcasts from float to integer |
| 3496 | {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0}, |
| 3497 | {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0}, |
| 3498 | {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0}, |
Hassnaa Hamdi | 045eec6 | 2023-04-19 09:23:13 +0000 | [diff] [blame] | 3499 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3500 | // Add cost for extending to illegal (too wide) scalable vectors. |
| 3501 | // zero/sign extend are implemented by multiple unpack operations, |
| 3502 | // where each operation has a cost of 1. |
| 3503 | {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, |
| 3504 | {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, |
| 3505 | {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, |
| 3506 | {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, |
| 3507 | {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, |
| 3508 | {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, |
Hassnaa Hamdi | 045eec6 | 2023-04-19 09:23:13 +0000 | [diff] [blame] | 3509 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3510 | {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, |
| 3511 | {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, |
| 3512 | {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, |
| 3513 | {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, |
| 3514 | {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, |
| 3515 | {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3516 | }; |
| 3517 | |
Dinar Temirbulatov | 73668cc | 2023-05-15 16:18:45 +0000 | [diff] [blame] | 3518 | // We have to estimate the cost of a fixed-length operation performed on |
| 3519 | // SVE registers by scaling it with the number of SVE registers required |
| 3520 | // to represent the fixed-length type. |
| 3521 | EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy; |
| 3522 | if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() && |
| 3523 | SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() && |
| 3524 | ST->useSVEForFixedLengthVectors(WiderTy)) { |
| 3525 | std::pair<InstructionCost, MVT> LT = |
| 3526 | getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext())); |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3527 | unsigned NumElements = |
| 3528 | AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits(); |
Dinar Temirbulatov | 73668cc | 2023-05-15 16:18:45 +0000 | [diff] [blame] | 3529 | return AdjustCost( |
| 3530 | LT.first * |
| 3531 | getCastInstrCost( |
| 3532 | Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements), |
| 3533 | ScalableVectorType::get(Src->getScalarType(), NumElements), CCH, |
| 3534 | CostKind, I)); |
| 3535 | } |
| 3536 | |
David Green | ca88400 | 2024-12-09 23:41:18 +0000 | [diff] [blame] | 3537 | if (const auto *Entry = ConvertCostTableLookup( |
| 3538 | ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) |
Sam Parker | 8aaabad | 2020-05-26 11:27:57 +0100 | [diff] [blame] | 3539 | return AdjustCost(Entry->Cost); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3540 | |
David Green | 47f4cd9 | 2022-03-03 11:17:24 +0000 | [diff] [blame] | 3541 | static const TypeConversionCostTblEntry FP16Tbl[] = { |
| 3542 | {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs |
| 3543 | {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, |
| 3544 | {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs |
| 3545 | {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, |
| 3546 | {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs |
| 3547 | {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, |
| 3548 | {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn |
| 3549 | {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, |
| 3550 | {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs |
| 3551 | {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, |
| 3552 | {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs |
| 3553 | {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, |
| 3554 | {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn |
| 3555 | {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, |
| 3556 | {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs |
| 3557 | {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, |
| 3558 | {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs |
| 3559 | {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, |
Florian Hahn | aa590e5 | 2022-03-11 10:27:17 +0000 | [diff] [blame] | 3560 | {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf |
| 3561 | {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf |
| 3562 | {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf |
| 3563 | {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf |
David Green | 47f4cd9 | 2022-03-03 11:17:24 +0000 | [diff] [blame] | 3564 | }; |
| 3565 | |
| 3566 | if (ST->hasFullFP16()) |
| 3567 | if (const auto *Entry = ConvertCostTableLookup( |
| 3568 | FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) |
| 3569 | return AdjustCost(Entry->Cost); |
| 3570 | |
David Green | e2202b9 | 2025-03-26 07:26:17 +0000 | [diff] [blame] | 3571 | // INT_TO_FP of i64->f32 will scalarize, which is required to avoid |
| 3572 | // double-rounding issues. |
| 3573 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && |
| 3574 | DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 && |
| 3575 | isa<FixedVectorType>(Dst) && isa<FixedVectorType>(Src)) |
| 3576 | return AdjustCost( |
| 3577 | cast<FixedVectorType>(Dst)->getNumElements() * |
| 3578 | getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(), |
| 3579 | CCH, CostKind) + |
| 3580 | BaseT::getScalarizationOverhead(cast<FixedVectorType>(Src), false, true, |
| 3581 | CostKind) + |
| 3582 | BaseT::getScalarizationOverhead(cast<FixedVectorType>(Dst), true, false, |
| 3583 | CostKind)); |
| 3584 | |
David Sherwood | fad69a5 | 2023-10-02 10:50:56 +0100 | [diff] [blame] | 3585 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
Sander de Smalen | c436649 | 2024-06-25 13:27:06 +0100 | [diff] [blame] | 3586 | CCH == TTI::CastContextHint::Masked && |
| 3587 | ST->isSVEorStreamingSVEAvailable() && |
David Sherwood | fad69a5 | 2023-10-02 10:50:56 +0100 | [diff] [blame] | 3588 | TLI->getTypeAction(Src->getContext(), SrcTy) == |
| 3589 | TargetLowering::TypePromoteInteger && |
| 3590 | TLI->getTypeAction(Dst->getContext(), DstTy) == |
| 3591 | TargetLowering::TypeSplitVector) { |
| 3592 | // The standard behaviour in the backend for these cases is to split the |
| 3593 | // extend up into two parts: |
| 3594 | // 1. Perform an extending load or masked load up to the legal type. |
| 3595 | // 2. Extend the loaded data to the final type. |
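| | // For example (illustrative): a masked zero-extend from <vscale x 8 x i8>
| | // to <vscale x 8 x i64> is costed as an extending masked load up to the
| | // legal type <vscale x 8 x i16> (Part1) plus a zext from <vscale x 8 x i16>
| | // to <vscale x 8 x i64> (Part2).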
| 3596 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); |
| 3597 | Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext()); |
| 3598 | InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( |
| 3599 | Opcode, LegalTy, Src, CCH, CostKind, I); |
| 3600 | InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( |
| 3601 | Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I); |
| 3602 | return Part1 + Part2; |
| 3603 | } |
| 3604 | |
David Sherwood | afc2b7d | 2023-04-05 12:58:03 +0000 | [diff] [blame] | 3605 | // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, |
| 3606 | // but we also want to handle the TTI::CastContextHint::Masked case.
| 3607 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
Sander de Smalen | c436649 | 2024-06-25 13:27:06 +0100 | [diff] [blame] | 3608 | CCH == TTI::CastContextHint::Masked && |
| 3609 | ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy)) |
David Sherwood | afc2b7d | 2023-04-05 12:58:03 +0000 | [diff] [blame] | 3610 | CCH = TTI::CastContextHint::Normal; |
| 3611 | |
David Green | 60280e9 | 2020-07-29 13:32:53 +0100 | [diff] [blame] | 3612 | return AdjustCost( |
| 3613 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3614 | } |
| 3615 | |
David Green | d20604e | 2025-04-22 15:09:43 +0100 | [diff] [blame] | 3616 | InstructionCost |
| 3617 | AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, |
| 3618 | VectorType *VecTy, unsigned Index, |
| 3619 | TTI::TargetCostKind CostKind) const { |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3620 | |
| 3621 | // Make sure we were given a valid extend opcode. |
Matthew Simpson | 47bd399 | 2016-04-27 16:25:04 +0000 | [diff] [blame] | 3622 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && |
| 3623 | "Invalid opcode"); |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3624 | |
| 3625 | // We are extending an element we extract from a vector, so the source type |
| 3626 | // of the extend is the element type of the vector. |
| 3627 | auto *Src = VecTy->getElementType(); |
| 3628 | |
| 3629 | // Sign- and zero-extends are for integer types only. |
| 3630 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); |
| 3631 | |
| 3632 | // Get the cost for the extract. We compute the cost (if any) for the extend |
| 3633 | // below. |
Alexey Bataev | 9b5f626 | 2022-12-21 13:38:38 -0800 | [diff] [blame] | 3634 | InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, |
ShihPo Hung | 5fb3a57 | 2023-01-21 05:29:05 -0800 | [diff] [blame] | 3635 | CostKind, Index, nullptr, nullptr); |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3636 | |
| 3637 | // Legalize the types. |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 3638 | auto VecLT = getTypeLegalizationCost(VecTy); |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3639 | auto DstVT = TLI->getValueType(DL, Dst); |
| 3640 | auto SrcVT = TLI->getValueType(DL, Src); |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3641 | |
| 3642 | // If the resulting type is still a vector and the destination type is legal, |
| 3643 | // we may get the extension for free. If not, get the default cost for the |
| 3644 | // extend. |
| 3645 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) |
David Green | 60280e9 | 2020-07-29 13:32:53 +0100 | [diff] [blame] | 3646 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, |
| 3647 | CostKind); |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3648 | |
| 3649 | // The destination type should be larger than the element type. If not, get |
| 3650 | // the default cost for the extend. |
David Sherwood | d67d8f8 | 2020-10-09 12:03:20 +0100 | [diff] [blame] | 3651 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) |
David Green | 60280e9 | 2020-07-29 13:32:53 +0100 | [diff] [blame] | 3652 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, |
| 3653 | CostKind); |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3654 | |
| 3655 | switch (Opcode) { |
| 3656 | default: |
| 3657 | llvm_unreachable("Opcode should be either SExt or ZExt"); |
| 3658 | |
| 3659 | // For sign-extends, we only need a smov, which performs the extension |
| 3660 | // automatically. |
| 3661 | case Instruction::SExt: |
| 3662 | return Cost; |
| 3663 | |
| 3664 | // For zero-extends, the extend is performed automatically by a umov unless |
| 3665 | // the destination type is i64 and the element type is i8 or i16. |
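| | // For example (illustrative): zero-extending an extracted i8 element to
| | // i32 is free (umov already zero-fills the 32-bit register), whereas a
| | // zero-extend to i64 from an i8 or i16 element is not modelled as free and
| | // falls through to the default cost below.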
| 3666 | case Instruction::ZExt: |
| 3667 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) |
| 3668 | return Cost; |
| 3669 | } |
| 3670 | |
| 3671 | // If we are unable to perform the extend for free, get the default cost. |
David Green | 60280e9 | 2020-07-29 13:32:53 +0100 | [diff] [blame] | 3672 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, |
| 3673 | CostKind); |
Matthew Simpson | e5dfb08 | 2016-04-27 15:20:21 +0000 | [diff] [blame] | 3674 | } |
| 3675 | |
Sander de Smalen | 14b934f | 2021-01-26 16:32:30 +0000 | [diff] [blame] | 3676 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, |
| 3677 | TTI::TargetCostKind CostKind, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 3678 | const Instruction *I) const { |
Florian Hahn | 1ccc499 | 2020-06-30 10:39:23 +0100 | [diff] [blame] | 3679 | if (CostKind != TTI::TCK_RecipThroughput) |
| 3680 | return Opcode == Instruction::PHI ? 0 : 1; |
Florian Hahn | c30da98 | 2020-07-01 18:20:01 +0100 | [diff] [blame] | 3681 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); |
Florian Hahn | 1ccc499 | 2020-06-30 10:39:23 +0100 | [diff] [blame] | 3682 | // Branches are assumed to be predicted. |
Florian Hahn | c30da98 | 2020-07-01 18:20:01 +0100 | [diff] [blame] | 3683 | return 0; |
Florian Hahn | 1ccc499 | 2020-06-30 10:39:23 +0100 | [diff] [blame] | 3684 | } |
| 3685 | |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3686 | InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( |
David Green | c6406c8 | 2025-03-27 17:25:02 +0000 | [diff] [blame] | 3687 | unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
| 3688 | bool HasRealUse, const Instruction *I, Value *Scalar, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 3689 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3690 | assert(Val->isVectorTy() && "This must be a vector type"); |
| 3691 | |
| 3692 | if (Index != -1U) { |
| 3693 | // Legalize the type. |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 3694 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3695 | |
| 3696 | // This type is legalized to a scalar type. |
| 3697 | if (!LT.second.isVector()) |
| 3698 | return 0; |
| 3699 | |
David Sherwood | ef1ca4d | 2022-01-12 09:51:34 +0000 | [diff] [blame] | 3700 | // The type may be split. For fixed-width vectors we can normalize the |
| 3701 | // index to the new type. |
| 3702 | if (LT.second.isFixedLengthVector()) { |
| 3703 | unsigned Width = LT.second.getVectorNumElements(); |
| 3704 | Index = Index % Width; |
| 3705 | } |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3706 | |
| 3707 | // The element at index zero is already inside the vector. |
Mingming Liu | 8aa8006 | 2022-06-21 13:38:30 -0700 | [diff] [blame] | 3708 | // - For a physical (HasRealUse==true) insert-element or extract-element |
| 3709 | // instruction that extracts integers, an explicit FPR -> GPR move is |
| 3710 | // needed. So it has non-zero cost. |
| 3711 | // - For the rest of cases (virtual instruction or element type is float), |
| 3712 | // consider the instruction free. |
Sjoerd Meijer | 079c488 | 2023-02-09 16:07:17 +0000 | [diff] [blame] | 3713 | if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) |
| 3714 | return 0; |
| 3715 | |
| 3716 | // This recognizes an LD1 "single-element structure to one lane of one
| 3717 | // register" instruction. I.e., if this is an `insertelement` instruction
| 3718 | // and its second operand is a load, then we will generate an LD1, which
| 3719 | // is an expensive instruction.
| 3720 | if (I && isa<LoadInst>(I->getOperand(1)))
David Green | c6406c8 | 2025-03-27 17:25:02 +0000 | [diff] [blame] | 3721 | return CostKind == TTI::TCK_CodeSize |
| 3722 | ? 0 |
| 3723 | : ST->getVectorInsertExtractBaseCost() + 1; |
Sjoerd Meijer | 079c488 | 2023-02-09 16:07:17 +0000 | [diff] [blame] | 3724 | |
David Green | eb764a7 | 2023-06-01 10:54:53 +0100 | [diff] [blame] | 3725 | // i1 inserts and extracts will include an extra cset or cmp of the vector
| 3726 | // value. Increase the cost by 1 to account for this.
| 3727 | if (Val->getScalarSizeInBits() == 1) |
David Green | c6406c8 | 2025-03-27 17:25:02 +0000 | [diff] [blame] | 3728 | return CostKind == TTI::TCK_CodeSize |
| 3729 | ? 2 |
| 3730 | : ST->getVectorInsertExtractBaseCost() + 1; |
David Green | eb764a7 | 2023-06-01 10:54:53 +0100 | [diff] [blame] | 3731 | |
Mingming Liu | 8aa8006 | 2022-06-21 13:38:30 -0700 | [diff] [blame] | 3732 | // FIXME: |
| 3733 | // If the extract-element and insert-element instructions could be |
| 3734 | // simplified away (e.g., could be combined into users by looking at use-def |
| 3735 | // context), they have no cost. This is not done in the first place for |
| 3736 | // compile-time considerations. |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3737 | } |
| 3738 | |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3739 | // For Neon, if there exists an extractelement from lane != 0 such that
| 3740 | // 1. extractelement does not necessitate a move from vector_reg -> GPR. |
| 3741 | // 2. extractelement result feeds into fmul. |
| 3742 | // 3. Other operand of fmul is an extractelement from lane 0 or lane |
| 3743 | // equivalent to 0. |
| 3744 | // then the extractelement can be merged with fmul in the backend and it |
| 3745 | // incurs no cost. |
| 3746 | // e.g. |
| 3747 | // define double @foo(<2 x double> %a) { |
| 3748 | // %1 = extractelement <2 x double> %a, i32 0 |
| 3749 | // %2 = extractelement <2 x double> %a, i32 1 |
| 3750 | // %res = fmul double %1, %2 |
| 3751 | // ret double %res |
| 3752 | // } |
| 3753 | // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1] |
| 3754 | auto ExtractCanFuseWithFmul = [&]() { |
| 3755 | // We bail out if the extract is from lane 0. |
| 3756 | if (Index == 0) |
| 3757 | return false; |
| 3758 | |
| 3759 | // Check if the scalar element type of the vector operand of ExtractElement |
| 3760 | // instruction is one of the allowed types. |
| 3761 | auto IsAllowedScalarTy = [&](const Type *T) { |
| 3762 | return T->isFloatTy() || T->isDoubleTy() || |
| 3763 | (T->isHalfTy() && ST->hasFullFP16()); |
| 3764 | }; |
| 3765 | |
| 3766 | // Check if the extractelement user is scalar fmul. |
| 3767 | auto IsUserFMulScalarTy = [](const Value *EEUser) { |
| 3768 | // Check if the user is scalar fmul. |
David Green | d106a39 | 2024-11-29 01:11:39 +0000 | [diff] [blame] | 3769 | const auto *BO = dyn_cast<BinaryOperator>(EEUser); |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3770 | return BO && BO->getOpcode() == BinaryOperator::FMul && |
| 3771 | !BO->getType()->isVectorTy(); |
| 3772 | }; |
| 3773 | |
| 3774 | // Check if the extract index is from lane 0 or lane equivalent to 0 for a |
| 3775 | // certain scalar type and a certain vector register width. |
David Green | d106a39 | 2024-11-29 01:11:39 +0000 | [diff] [blame] | 3776 | auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) { |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3777 | auto RegWidth = |
| 3778 | getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) |
| 3779 | .getFixedValue(); |
David Green | d714b22 | 2024-11-29 04:01:03 +0000 | [diff] [blame] | 3780 | return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0); |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3781 | }; |
| 3782 | |
| 3783 | // Check if the type constraints on input vector type and result scalar type |
| 3784 | // of extractelement instruction are satisfied. |
| 3785 | if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType())) |
| 3786 | return false; |
| 3787 | |
| 3788 | if (Scalar) { |
| 3789 | DenseMap<User *, unsigned> UserToExtractIdx; |
| 3790 | for (auto *U : Scalar->users()) { |
| 3791 | if (!IsUserFMulScalarTy(U)) |
| 3792 | return false; |
| 3793 | // Recording entry for the user is important. Index value is not |
| 3794 | // important. |
| 3795 | UserToExtractIdx[U]; |
| 3796 | } |
David Green | d106a39 | 2024-11-29 01:11:39 +0000 | [diff] [blame] | 3797 | if (UserToExtractIdx.empty()) |
| 3798 | return false; |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3799 | for (auto &[S, U, L] : ScalarUserAndIdx) { |
| 3800 | for (auto *U : S->users()) { |
Kazu Hirata | 2d287f5 | 2025-05-03 21:55:36 -0700 | [diff] [blame^] | 3801 | if (UserToExtractIdx.contains(U)) { |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3802 | auto *FMul = cast<BinaryOperator>(U); |
| 3803 | auto *Op0 = FMul->getOperand(0); |
| 3804 | auto *Op1 = FMul->getOperand(1); |
David Green | d106a39 | 2024-11-29 01:11:39 +0000 | [diff] [blame] | 3805 | if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) { |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3806 | UserToExtractIdx[U] = L; |
| 3807 | break; |
| 3808 | } |
| 3809 | } |
| 3810 | } |
| 3811 | } |
| 3812 | for (auto &[U, L] : UserToExtractIdx) { |
| 3813 | if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) && |
| 3814 | !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits())) |
| 3815 | return false; |
| 3816 | } |
| 3817 | } else { |
| 3818 | const auto *EE = cast<ExtractElementInst>(I); |
| 3819 | |
| 3820 | const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand()); |
| 3821 | if (!IdxOp) |
| 3822 | return false; |
| 3823 | |
| 3824 | return !EE->users().empty() && all_of(EE->users(), [&](const User *U) { |
| 3825 | if (!IsUserFMulScalarTy(U)) |
| 3826 | return false; |
| 3827 | |
| 3828 | // Check if the other operand of extractelement is also extractelement |
| 3829 | // from lane equivalent to 0. |
| 3830 | const auto *BO = cast<BinaryOperator>(U); |
| 3831 | const auto *OtherEE = dyn_cast<ExtractElementInst>( |
| 3832 | BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0)); |
| 3833 | if (OtherEE) { |
| 3834 | const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand()); |
| 3835 | if (!IdxOp) |
| 3836 | return false; |
| 3837 | return IsExtractLaneEquivalentToZero(
| 3838 | IdxOp->getValue().getZExtValue(),
| 3839 | OtherEE->getType()->getScalarSizeInBits());
| 3842 | } |
| 3843 | return true; |
| 3844 | }); |
| 3845 | } |
| 3846 | return true; |
| 3847 | }; |
| 3848 | |
| 3849 | if (Opcode == Instruction::ExtractElement && (I || Scalar) && |
| 3850 | ExtractCanFuseWithFmul()) |
| 3851 | return 0; |
| 3852 | |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3853 | // All other insert/extracts cost this much. |
David Green | c6406c8 | 2025-03-27 17:25:02 +0000 | [diff] [blame] | 3854 | return CostKind == TTI::TCK_CodeSize ? 1 |
| 3855 | : ST->getVectorInsertExtractBaseCost(); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3856 | } |
| 3857 | |
Mingming Liu | 8aa8006 | 2022-06-21 13:38:30 -0700 | [diff] [blame] | 3858 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
ShihPo Hung | 5fb3a57 | 2023-01-21 05:29:05 -0800 | [diff] [blame] | 3859 | TTI::TargetCostKind CostKind, |
David Green | abd2c07 | 2025-05-01 15:55:08 +0100 | [diff] [blame] | 3860 | unsigned Index, |
| 3861 | const Value *Op0, |
| 3862 | const Value *Op1) const { |
Alexey Bataev | 8cf0290 | 2023-04-14 09:35:03 -0700 | [diff] [blame] | 3863 | bool HasRealUse = |
| 3864 | Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0); |
David Green | c6406c8 | 2025-03-27 17:25:02 +0000 | [diff] [blame] | 3865 | return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse); |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3866 | } |
| 3867 | |
| 3868 | InstructionCost AArch64TTIImpl::getVectorInstrCost( |
| 3869 | unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
| 3870 | Value *Scalar, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 3871 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { |
David Green | c6406c8 | 2025-03-27 17:25:02 +0000 | [diff] [blame] | 3872 | return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr, |
| 3873 | Scalar, ScalarUserAndIdx); |
Mingming Liu | 8aa8006 | 2022-06-21 13:38:30 -0700 | [diff] [blame] | 3874 | } |
| 3875 | |
| 3876 | InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, |
ShihPo Hung | 5fb3a57 | 2023-01-21 05:29:05 -0800 | [diff] [blame] | 3877 | Type *Val, |
| 3878 | TTI::TargetCostKind CostKind, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 3879 | unsigned Index) const { |
David Green | c6406c8 | 2025-03-27 17:25:02 +0000 | [diff] [blame] | 3880 | return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, |
Sushant Gokhale | 9991ea2 | 2024-11-13 11:10:49 +0530 | [diff] [blame] | 3881 | true /* HasRealUse */, &I); |
Mingming Liu | 8aa8006 | 2022-06-21 13:38:30 -0700 | [diff] [blame] | 3882 | } |
| 3883 | |
David Green | 2a859b2 | 2023-07-28 21:26:50 +0100 | [diff] [blame] | 3884 | InstructionCost AArch64TTIImpl::getScalarizationOverhead( |
| 3885 | VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, |
Jonas Paulsson | f5c8c1e | 2025-04-30 17:11:27 +0200 | [diff] [blame] | 3886 | TTI::TargetCostKind CostKind, bool ForPoisonSrc, |
| 3887 | ArrayRef<Value *> VL) const { |
David Green | 2a859b2 | 2023-07-28 21:26:50 +0100 | [diff] [blame] | 3888 | if (isa<ScalableVectorType>(Ty)) |
| 3889 | return InstructionCost::getInvalid(); |
| 3890 | if (Ty->getElementType()->isFloatingPointTy()) |
| 3891 | return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, |
| 3892 | CostKind); |
David Green | 052225d | 2025-04-11 20:18:26 +0100 | [diff] [blame] | 3893 | unsigned VecInstCost = |
| 3894 | CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost(); |
| 3895 | return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; |
David Green | 2a859b2 | 2023-07-28 21:26:50 +0100 | [diff] [blame] | 3896 | } |
| 3897 | |
Sander de Smalen | 4f42d87 | 2021-04-14 16:53:01 +0100 | [diff] [blame] | 3898 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( |
Sam Parker | 40574fe | 2020-04-28 14:11:27 +0100 | [diff] [blame] | 3899 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
Philip Reames | 104fa36 | 2022-08-20 08:07:28 -0700 | [diff] [blame] | 3900 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 3901 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
Philip Reames | 478cf94 | 2022-08-22 12:03:36 -0700 | [diff] [blame] | 3902 | |
David Green | 0b745a1 | 2024-08-09 14:25:07 +0100 | [diff] [blame] | 3903 | // The code-generator is currently not able to handle scalable vectors |
| 3904 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 3905 | // it. This change will be removed when code-generation for these types is |
| 3906 | // sufficiently reliable. |
| 3907 | if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) |
| 3908 | if (VTy->getElementCount() == ElementCount::getScalable(1)) |
| 3909 | return InstructionCost::getInvalid(); |
| 3910 | |
Sam Parker | fa8bff0 | 2020-06-05 08:42:03 +0100 | [diff] [blame] | 3911 | // TODO: Handle more cost kinds. |
| 3912 | if (CostKind != TTI::TCK_RecipThroughput) |
Philip Reames | 104fa36 | 2022-08-20 08:07:28 -0700 | [diff] [blame] | 3913 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, |
| 3914 | Op2Info, Args, CxtI); |
Sam Parker | fa8bff0 | 2020-06-05 08:42:03 +0100 | [diff] [blame] | 3915 | |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3916 | // Legalize the type. |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 3917 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 3918 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 3919 | |
| 3920 | switch (ISD) { |
| 3921 | default: |
Philip Reames | 104fa36 | 2022-08-20 08:07:28 -0700 | [diff] [blame] | 3922 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, |
| 3923 | Op2Info); |
Sushant Gokhale | c480874 | 2025-03-09 22:26:39 -0700 | [diff] [blame] | 3924 | case ISD::SREM: |
Evandro Menezes | f9bd871 | 2018-03-07 22:35:32 +0000 | [diff] [blame] | 3925 | case ISD::SDIV: |
Sushant Gokhale | c480874 | 2025-03-09 22:26:39 -0700 | [diff] [blame] | 3926 | /* |
| 3927 | Notes for sdiv/srem specific costs: |
| 3928 | 1. This only considers the cases where the divisor is constant, uniform and |
| 3929 | (pow-of-2/non-pow-of-2). Other cases are not important since they either |
| 3930 | result in some form of (ldr + adrp), corresponding to constant vectors, or |
| 3931 | scalarization of the division operation. |
| 3932 | 2. Constant divisors, whether wholly or partially negative, don't result in
| 3933 | significantly different codegen compared to positive constant divisors.
| 3934 | So, we don't consider negative divisors separately.
| 3935 | 3. If the codegen is significantly different with SVE, it has been indicated |
| 3936 | using comments at appropriate places. |
| 3937 | |
| 3938 | sdiv specific cases: |
| 3939 | ----------------------------------------------------------------------- |
| 3940 | codegen | pow-of-2 | Type |
| 3941 | ----------------------------------------------------------------------- |
| 3942 | add + cmp + csel + asr | Y | i64 |
| 3943 | add + cmp + csel + asr | Y | i32 |
| 3944 | ----------------------------------------------------------------------- |
| 3945 | |
| 3946 | srem specific cases: |
| 3947 | ----------------------------------------------------------------------- |
| 3948 | codegen | pow-of-2 | Type |
| 3949 | ----------------------------------------------------------------------- |
| 3950 | negs + and + and + csneg | Y | i64 |
| 3951 | negs + and + and + csneg | Y | i32 |
| 3952 | ----------------------------------------------------------------------- |
| 3953 | |
| 3954 | other sdiv/srem cases: |
| 3955 | ------------------------------------------------------------------------- |
| 3956 | common codegen | + srem | + sdiv | pow-of-2 | Type
| 3957 | ------------------------------------------------------------------------- |
| 3958 | smulh + asr + add + add | - | - | N | i64 |
| 3959 | smull + lsr + add + add | - | - | N | i32 |
| 3960 | usra | and + sub | sshr | Y | <2 x i64> |
| 3961 | 2 * (scalar code) | - | - | N | <2 x i64> |
| 3962 | usra | bic + sub | sshr + neg | Y | <4 x i32> |
| 3963 | smull2 + smull + uzp2 | mls | - | N | <4 x i32> |
| 3964 | + sshr + usra | | | | |
| 3965 | ------------------------------------------------------------------------- |
| 3966 | */ |
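| | // For instance (illustrative), a scalar 'sdiv i32 %a, 8' maps to the
| | // add+cmp+csel+asr row above and is costed below as 3 * AddCost + AsrCost,
| | // while 'srem i32 %a, 8' is costed as 3 * AsrCost + AddCost.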
| 3967 | if (Op2Info.isConstant() && Op2Info.isUniform()) { |
| 3968 | InstructionCost AddCost = |
| 3969 | getArithmeticInstrCost(Instruction::Add, Ty, CostKind, |
| 3970 | Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 3971 | InstructionCost AsrCost = |
| 3972 | getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, |
| 3973 | Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 3974 | InstructionCost MulCost = |
| 3975 | getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, |
| 3976 | Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 3977 | // add/cmp/csel/csneg should all have similar costs to one another, as
| 3978 | // should asr/negs/and.
| 3979 | auto VT = TLI->getValueType(DL, Ty); |
David Green | 9c6eca2 | 2025-03-29 19:25:17 +0000 | [diff] [blame] | 3980 | if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) { |
Sushant Gokhale | c480874 | 2025-03-09 22:26:39 -0700 | [diff] [blame] | 3981 | if (Op2Info.isPowerOf2()) { |
| 3982 | return ISD == ISD::SDIV ? (3 * AddCost + AsrCost) |
| 3983 | : (3 * AsrCost + AddCost); |
| 3984 | } else { |
| 3985 | return MulCost + AsrCost + 2 * AddCost; |
| 3986 | } |
| 3987 | } else if (VT.isVector()) { |
| 3988 | InstructionCost UsraCost = 2 * AsrCost; |
| 3989 | if (Op2Info.isPowerOf2()) { |
| 3990 | // Division with scalable types corresponds to native 'asrd' |
| 3991 | // instruction when SVE is available. |
| 3992 | // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8) |
| 3993 | if (Ty->isScalableTy() && ST->hasSVE()) |
| 3994 | return 2 * AsrCost; |
| 3995 | return UsraCost + |
| 3996 | (ISD == ISD::SDIV |
| 3997 | ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * |
| 3998 | AsrCost |
| 3999 | : 2 * AddCost); |
| 4000 | } else if (LT.second == MVT::v2i64) { |
| 4001 | return VT.getVectorNumElements() * |
| 4002 | getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, |
| 4003 | Op1Info.getNoProps(), |
| 4004 | Op2Info.getNoProps()); |
| 4005 | } else { |
| 4006 | // When SVE is available, we get: |
| 4007 | // smulh + lsr + add/sub + asr + add/sub. |
| 4008 | if (Ty->isScalableTy() && ST->hasSVE()) |
| 4009 | return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost; |
| 4010 | return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost; |
| 4011 | } |
| 4012 | } |
| 4013 | } |
| 4014 | if (Op2Info.isConstant() && !Op2Info.isUniform() && |
| 4015 | LT.second.isFixedLengthVector()) { |
| 4016 | // FIXME: When the constant vector is non-uniform, this may result in |
| 4017 | // loading the vector from a constant pool or, in some cases, may also result
| 4018 | // in scalarization. For now, we are approximating this with the |
| 4019 | // scalarization cost. |
| 4020 | auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, |
| 4021 | CostKind, -1, nullptr, nullptr); |
| 4022 | auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty, |
| 4023 | CostKind, -1, nullptr, nullptr); |
| 4024 | unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements(); |
| 4025 | return ExtractCost + InsertCost + |
| 4026 | NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(), |
| 4027 | CostKind, Op1Info.getNoProps(), |
| 4028 | Op2Info.getNoProps()); |
Evandro Menezes | f9bd871 | 2018-03-07 22:35:32 +0000 | [diff] [blame] | 4029 | } |
Fangrui Song | de9d80c | 2022-08-08 11:24:15 -0700 | [diff] [blame] | 4030 | [[fallthrough]]; |
David Green | a5d8b7a | 2025-02-26 13:49:48 +0000 | [diff] [blame] | 4031 | case ISD::UDIV: |
| 4032 | case ISD::UREM: { |
Jon Roelofs | bded3b3 | 2024-09-05 07:42:23 -0700 | [diff] [blame] | 4033 | auto VT = TLI->getValueType(DL, Ty); |
David Green | a5d8b7a | 2025-02-26 13:49:48 +0000 | [diff] [blame] | 4034 | if (Op2Info.isConstant()) { |
| 4035 | // If the operand is a power of 2 we can use the shift or and cost. |
| 4036 | if (ISD == ISD::UDIV && Op2Info.isPowerOf2()) |
| 4037 | return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, |
| 4038 | Op1Info.getNoProps(), |
| 4039 | Op2Info.getNoProps()); |
| 4040 | if (ISD == ISD::UREM && Op2Info.isPowerOf2()) |
| 4041 | return getArithmeticInstrCost(Instruction::And, Ty, CostKind, |
| 4042 | Op1Info.getNoProps(), |
| 4043 | Op2Info.getNoProps()); |
| 4044 | |
| 4045 | if (ISD == ISD::UDIV || ISD == ISD::UREM) { |
| 4046 | // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL. |
| 4047 | // The MULHU will be expanded to UMULL for the types not listed below, |
| 4048 | // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
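| | // For example (illustrative): 'udiv <4 x i32> %a, splat (i32 7)' has no
| | // MULHU for fixed-width v4i32 and is 128 bits wide, so the cost below is
| | // 2 * MulCost + 2 * AddCost + 2 * ShrCost; urem additionally pays
| | // MulCost + AddCost for the multiply and subtract back.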
| 4049 | bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 || |
| 4050 | LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 || |
| 4051 | LT.second == MVT::nxv16i8; |
| 4052 | bool Is128bit = LT.second.is128BitVector(); |
| 4053 | |
| 4054 | InstructionCost MulCost = |
| 4055 | getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, |
| 4056 | Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 4057 | InstructionCost AddCost = |
| 4058 | getArithmeticInstrCost(Instruction::Add, Ty, CostKind, |
| 4059 | Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 4060 | InstructionCost ShrCost = |
| 4061 | getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, |
| 4062 | Op1Info.getNoProps(), Op2Info.getNoProps()); |
| 4063 | InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH |
| 4064 | (HasMULH ? 0 : ShrCost) + // UMULL shift |
| 4065 | AddCost * 2 + ShrCost; |
| 4066 | return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0); |
| 4067 | } |
Adhemerval Zanella | f384bc7 | 2018-05-09 12:48:22 +0000 | [diff] [blame] | 4068 | } |
| 4069 | |
Jon Roelofs | bded3b3 | 2024-09-05 07:42:23 -0700 | [diff] [blame] | 4070 | // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are |
| 4071 | // emitted by the backend even when those functions are not declared in the |
| 4072 | // module. |
| 4073 | if (!VT.isVector() && VT.getSizeInBits() > 64) |
| 4074 | return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); |
| 4075 | |
David Green | 3c88ff4 | 2022-04-03 22:16:39 +0100 | [diff] [blame] | 4076 | InstructionCost Cost = BaseT::getArithmeticInstrCost( |
Philip Reames | 104fa36 | 2022-08-20 08:07:28 -0700 | [diff] [blame] | 4077 | Opcode, Ty, CostKind, Op1Info, Op2Info); |
David Green | a5d8b7a | 2025-02-26 13:49:48 +0000 | [diff] [blame] | 4078 | if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) { |
Hassnaa Hamdi | f2072e0 | 2022-08-23 15:22:52 +0000 | [diff] [blame] | 4079 | if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { |
Hassnaa Hamdi | 181f200 | 2022-09-23 11:51:19 +0000 | [diff] [blame] | 4080 | // SDIV/UDIV operations are lowered using SVE, so we can have lower
| 4081 | // costs.
Guillaume Chatelet | 8fd5558 | 2023-01-11 16:48:35 +0000 | [diff] [blame] | 4082 | if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty) |
| 4083 | ->getPrimitiveSizeInBits() |
| 4084 | .getFixedValue() < 128) { |
Hassnaa Hamdi | f2072e0 | 2022-08-23 15:22:52 +0000 | [diff] [blame] | 4085 | EVT VT = TLI->getValueType(DL, Ty); |
| 4086 | static const CostTblEntry DivTbl[]{ |
| 4087 | {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, |
| 4088 | {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, |
| 4089 | {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1}, |
| 4090 | {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8}, |
| 4091 | {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5}, |
| 4092 | {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}}; |
| 4093 | |
| 4094 | const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT()); |
| 4095 | if (nullptr != Entry) |
| 4096 | return Entry->Cost; |
| 4097 | } |
| 4098 | // For 8/16-bit elements, the cost is higher because the type |
| 4099 | // requires promotion and possibly splitting: |
| 4100 | if (LT.second.getScalarType() == MVT::i8) |
| 4101 | Cost *= 8; |
| 4102 | else if (LT.second.getScalarType() == MVT::i16) |
| 4103 | Cost *= 4; |
| 4104 | return Cost; |
| 4105 | } else { |
Zain Jaffal | 6e4cea5 | 2022-11-28 10:37:31 +0200 | [diff] [blame] | 4106 | // If one of the operands is a uniform constant then the cost for each
| 4107 | // element is the cost of insertion, extraction and the division itself:
| 4108 | // insertion cost = 2, extraction cost = 2, and division = the cost of the
| 4109 | // operation on the scalar type.
| 4110 | if ((Op1Info.isConstant() && Op1Info.isUniform()) || |
| 4111 | (Op2Info.isConstant() && Op2Info.isUniform())) { |
| 4112 | if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { |
| 4113 | InstructionCost DivCost = BaseT::getArithmeticInstrCost( |
| 4114 | Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info); |
| 4115 | return (4 + DivCost) * VTy->getNumElements(); |
| 4116 | } |
| 4117 | } |
Hassnaa Hamdi | f2072e0 | 2022-08-23 15:22:52 +0000 | [diff] [blame] | 4118 | // On AArch64, without SVE, vector divisions are expanded |
| 4119 | // into scalar divisions of each pair of elements. |
David Green | c51b24c | 2025-04-02 14:51:22 +0100 | [diff] [blame] | 4120 | Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, |
| 4121 | -1, nullptr, nullptr); |
| 4122 | Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1, |
| 4123 | nullptr, nullptr); |
Hassnaa Hamdi | f2072e0 | 2022-08-23 15:22:52 +0000 | [diff] [blame] | 4124 | } |
| 4125 | |
Evandro Menezes | f9bd871 | 2018-03-07 22:35:32 +0000 | [diff] [blame] | 4126 | // TODO: if one of the arguments is scalar, then it's not necessary to |
| 4127 | // double the cost of handling the vector elements. |
| 4128 | Cost += Cost; |
| 4129 | } |
| 4130 | return Cost; |
David Green | 3c88ff4 | 2022-04-03 22:16:39 +0100 | [diff] [blame] | 4131 | } |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4132 | case ISD::MUL: |
Hassnaa Hamdi | f2072e0 | 2022-08-23 15:22:52 +0000 | [diff] [blame] | 4133 | // When SVE is available, we can lower the v2i64 operation using
| 4134 | // the SVE mul instruction, which has a lower cost. |
| 4135 | if (LT.second == MVT::v2i64 && ST->hasSVE()) |
| 4136 | return LT.first; |
| 4137 | |
| 4138 | // When SVE is not available, there is no MUL.2d instruction, |
| 4139 | // which means mul <2 x i64> is expensive as elements are extracted |
| 4140 | // from the vectors and the muls scalarized. |
| 4141 | // As getScalarizationOverhead is a bit too pessimistic, we |
| 4142 | // estimate the cost for a i64 vector directly here, which is: |
David Green | 750bf35 | 2022-04-04 17:42:20 +0100 | [diff] [blame] | 4143 | // - four 2-cost i64 extracts, |
| 4144 | // - two 2-cost i64 inserts, and |
| 4145 | // - two 1-cost muls. |
| 4146 | // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with |
| 4147 | // LT.first = 2 the cost is 28. If both operands are extensions it will not |
David Green | 2abaa02 | 2022-04-04 12:45:04 +0100 | [diff] [blame] | 4148 | // need to scalarize so the cost can be cheaper (smull or umull).
David Green | 1712ae6 | 2023-07-12 13:13:06 +0100 | [diff] [blame] | 4150 | if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) |
David Green | 2abaa02 | 2022-04-04 12:45:04 +0100 | [diff] [blame] | 4151 | return LT.first; |
David Green | 27a2d3d | 2025-01-20 11:43:57 +0000 | [diff] [blame] | 4152 | return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * |
| 4153 | (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + |
| 4154 | getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1, |
| 4155 | nullptr, nullptr) * |
| 4156 | 2 + |
| 4157 | getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1, |
| 4158 | nullptr, nullptr)); |
Sjoerd Meijer | 5110ff0 | 2020-11-30 11:16:10 +0000 | [diff] [blame] | 4159 | case ISD::ADD: |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4160 | case ISD::XOR: |
| 4161 | case ISD::OR: |
| 4162 | case ISD::AND: |
David Green | 65c0e45 | 2022-03-03 10:42:57 +0000 | [diff] [blame] | 4163 | case ISD::SRL: |
| 4164 | case ISD::SRA: |
| 4165 | case ISD::SHL: |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4166 | // These nodes are marked as 'custom' for combining purposes only. |
| 4167 | // We know that they are legal. See LowerAdd in ISelLowering. |
David Green | 3c88ff4 | 2022-04-03 22:16:39 +0100 | [diff] [blame] | 4168 | return LT.first; |
Paul Walker | 3a98d5d | 2020-06-20 20:23:31 +0100 | [diff] [blame] | 4169 | |
Sjoerd Meijer | d827865 | 2023-04-11 12:40:14 +0100 | [diff] [blame] | 4170 | case ISD::FNEG: |
David Green | c61d565 | 2024-08-21 18:10:16 +0100 | [diff] [blame] | 4171 | // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul |
| 4172 | if ((Ty->isFloatTy() || Ty->isDoubleTy() || |
| 4173 | (Ty->isHalfTy() && ST->hasFullFP16())) && |
| 4174 | CxtI && |
| 4175 | ((CxtI->hasOneUse() && |
| 4176 | match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) || |
| 4177 | match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value())))) |
| 4178 | return 0; |
| 4179 | [[fallthrough]]; |
Paul Walker | 3a98d5d | 2020-06-20 20:23:31 +0100 | [diff] [blame] | 4180 | case ISD::FADD: |
David Sherwood | d581d94 | 2021-08-31 14:07:50 +0100 | [diff] [blame] | 4181 | case ISD::FSUB: |
Sjoerd Meijer | d827865 | 2023-04-11 12:40:14 +0100 | [diff] [blame] | 4182 | // Increase the cost for half and bfloat types if not architecturally |
| 4183 | // supported. |
| 4184 | if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || |
| 4185 | (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) |
| 4186 | return 2 * LT.first; |
| 4187 | if (!Ty->getScalarType()->isFP128Ty()) |
| 4188 | return LT.first; |
Craig Topper | 6006d43 | 2023-05-24 12:15:23 -0700 | [diff] [blame] | 4189 | [[fallthrough]]; |
David Sherwood | d581d94 | 2021-08-31 14:07:50 +0100 | [diff] [blame] | 4190 | case ISD::FMUL: |
| 4191 | case ISD::FDIV: |
Paul Walker | 3a98d5d | 2020-06-20 20:23:31 +0100 | [diff] [blame] | 4192 | // These nodes are marked as 'custom' just to lower them to SVE. |
| 4193 | // We know said lowering will incur no additional cost. |
David Sherwood | d581d94 | 2021-08-31 14:07:50 +0100 | [diff] [blame] | 4194 | if (!Ty->getScalarType()->isFP128Ty()) |
David Green | 3c88ff4 | 2022-04-03 22:16:39 +0100 | [diff] [blame] | 4195 | return 2 * LT.first; |
Paul Walker | 3a98d5d | 2020-06-20 20:23:31 +0100 | [diff] [blame] | 4196 | |
Philip Reames | 104fa36 | 2022-08-20 08:07:28 -0700 | [diff] [blame] | 4197 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, |
| 4198 | Op2Info); |
Paschalis Mpeis | bbdc62e | 2024-02-23 09:29:45 +0000 | [diff] [blame] | 4199 | case ISD::FREM: |
| 4200 | // Pass nullptr as fmod/fmodf calls are emitted by the backend even when |
| 4201 | // those functions are not declared in the module. |
| 4202 | if (!Ty->isVectorTy()) |
| 4203 | return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); |
| 4204 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, |
| 4205 | Op2Info); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4206 | } |
| 4207 | } |
| 4208 | |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4209 | InstructionCost |
| 4210 | AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, |
| 4211 | const SCEV *Ptr) const { |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4212 | // Address computations in vectorized code with non-consecutive addresses will |
| 4213 | // likely result in more instructions compared to scalar code where the |
| 4214 | // computation can more often be merged into the index mode. The resulting |
| 4215 | // extra micro-ops can significantly decrease throughput. |
zhongyunde | df19d87 | 2023-06-07 21:50:54 +0800 | [diff] [blame] | 4216 | unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; |
Mohammed Agabaria | 23599ba | 2017-01-05 14:03:41 +0000 | [diff] [blame] | 4217 | int MaxMergeDistance = 64; |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4218 | |
Fangrui Song | f78650a | 2018-07-30 19:41:25 +0000 | [diff] [blame] | 4219 | if (Ty->isVectorTy() && SE && |
Mohammed Agabaria | 23599ba | 2017-01-05 14:03:41 +0000 | [diff] [blame] | 4220 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4221 | return NumVectorInstToHideOverhead; |
| 4222 | |
| 4223 | // In many cases the address computation is not merged into the instruction |
| 4224 | // addressing mode. |
| 4225 | return 1; |
| 4226 | } |
| 4227 | |
Philip Reames | d288574 | 2024-09-25 07:25:57 -0700 | [diff] [blame] | 4228 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost( |
| 4229 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
| 4230 | TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 4231 | TTI::OperandValueInfo Op2Info, const Instruction *I) const { |
Sam Parker | 3728961 | 2020-05-26 14:28:34 +0100 | [diff] [blame] | 4232 | // TODO: Handle other cost kinds. |
| 4233 | if (CostKind != TTI::TCK_RecipThroughput) |
Florian Hahn | b3b993a | 2020-11-02 12:40:34 +0000 | [diff] [blame] | 4234 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
Philip Reames | d288574 | 2024-09-25 07:25:57 -0700 | [diff] [blame] | 4235 | Op1Info, Op2Info, I); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4236 | |
| 4237 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
Silviu Baranga | a3e27ed | 2015-09-09 15:35:02 +0000 | [diff] [blame] | 4238 | // We don't lower some vector selects well that are wider than the register |
| 4239 | // width. |
David Sherwood | 2e080eb | 2021-01-19 15:38:03 +0000 | [diff] [blame] | 4240 | if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4241 | // We would need this many instructions to hide the scalarization happening. |
Chandler Carruth | 93205eb | 2015-08-05 18:08:10 +0000 | [diff] [blame] | 4242 | const int AmortizationCost = 20; |
Florian Hahn | b3b993a | 2020-11-02 12:40:34 +0000 | [diff] [blame] | 4243 | |
| 4244 | // If VecPred is not set, check if we can get a predicate from the context |
| 4245 | // instruction, if its type matches the requested ValTy. |
| 4246 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { |
Ramkumar Ramachandra | 4a0d53a | 2024-12-13 14:18:33 +0000 | [diff] [blame] | 4247 | CmpPredicate CurrentPred; |
Florian Hahn | b3b993a | 2020-11-02 12:40:34 +0000 | [diff] [blame] | 4248 | if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), |
| 4249 | m_Value()))) |
| 4250 | VecPred = CurrentPred; |
| 4251 | } |
Florian Hahn | 17ebd68 | 2022-01-31 10:18:28 +0000 | [diff] [blame] | 4252 | // Check if we have a compare/select chain that can be lowered using |
| 4253 | // a (F)CMxx & BFI pair. |
| 4254 | if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE || |
| 4255 | VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || |
| 4256 | VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || |
| 4257 | VecPred == CmpInst::FCMP_UNE) { |
| 4258 | static const auto ValidMinMaxTys = { |
| 4259 | MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
| 4260 | MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; |
| 4261 | static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; |
| 4262 | |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 4263 | auto LT = getTypeLegalizationCost(ValTy); |
Florian Hahn | 17ebd68 | 2022-01-31 10:18:28 +0000 | [diff] [blame] | 4264 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
| 4265 | (ST->hasFullFP16() && |
| 4266 | any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
Florian Hahn | b3b993a | 2020-11-02 12:40:34 +0000 | [diff] [blame] | 4267 | return LT.first; |
| 4268 | } |
| 4269 | |
Craig Topper | 4b27576 | 2015-10-28 04:02:12 +0000 | [diff] [blame] | 4270 | static const TypeConversionCostTblEntry |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4271 | VectorSelectTbl[] = { |
Zhongyunde | cb353dc | 2023-06-20 13:12:02 +0800 | [diff] [blame] | 4272 | { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 }, |
| 4273 | { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 }, |
| 4274 | { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 }, |
| 4275 | { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 }, |
| 4276 | { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 }, |
Silviu Baranga | a3e27ed | 2015-09-09 15:35:02 +0000 | [diff] [blame] | 4277 | { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, |
| 4278 | { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, |
| 4279 | { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4280 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, |
| 4281 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, |
| 4282 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } |
| 4283 | }; |
| 4284 | |
Mehdi Amini | 44ede33 | 2015-07-09 02:09:04 +0000 | [diff] [blame] | 4285 | EVT SelCondTy = TLI->getValueType(DL, CondTy); |
| 4286 | EVT SelValTy = TLI->getValueType(DL, ValTy); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4287 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { |
Craig Topper | ee0c859 | 2015-10-27 04:14:24 +0000 | [diff] [blame] | 4288 | if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, |
| 4289 | SelCondTy.getSimpleVT(), |
| 4290 | SelValTy.getSimpleVT())) |
| 4291 | return Entry->Cost; |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4292 | } |
| 4293 | } |
Craig Topper | 9ad9380 | 2023-05-13 23:33:00 -0700 | [diff] [blame] | 4294 | |
David Green | 1ba9ec0 | 2023-05-14 23:28:11 +0100 | [diff] [blame] | 4295 | if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) { |
Paul Walker | a095ebc | 2025-04-22 11:20:17 +0100 | [diff] [blame] | 4296 | Type *ValScalarTy = ValTy->getScalarType(); |
| 4297 | if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) || |
| 4298 | ValScalarTy->isBFloatTy()) { |
| 4299 | auto *ValVTy = cast<FixedVectorType>(ValTy); |
| 4300 | |
Paul Walker | a095ebc | 2025-04-22 11:20:17 +0100 | [diff] [blame] | 4301 | // Without dedicated instructions we promote [b]f16 compares to f32. |
| 4302 | auto *PromotedTy = |
| 4303 | VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy); |
| 4304 | |
| 4305 | InstructionCost Cost = 0; |
| 4306 | // Promote operands to float vectors. |
| 4307 | Cost += 2 * getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy, |
| 4308 | TTI::CastContextHint::None, CostKind); |
| 4309 | // Compare float vectors. |
| 4310 | Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind, |
| 4311 | Op1Info, Op2Info); |
| 4312 | // During codegen we'll truncate the vector result from i32 to i16. |
| 4313 | Cost += |
| 4314 | getCastInstrCost(Instruction::Trunc, VectorType::getInteger(ValVTy), |
| 4315 | VectorType::getInteger(PromotedTy), |
| 4316 | TTI::CastContextHint::None, CostKind); |
| 4317 | return Cost; |
| 4318 | } |
Craig Topper | 9ad9380 | 2023-05-13 23:33:00 -0700 | [diff] [blame] | 4319 | } |
| 4320 | |
David Green | 5106b22 | 2023-07-01 21:59:54 +0100 | [diff] [blame] | 4321 | // Treat the icmp in icmp(and, 0) as free, as we can make use of ands. |
| 4322 | // FIXME: This can apply to more conditions and add/sub if it can be shown to |
| 4323 | // be profitable. |
| 4324 | if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && |
| 4325 | ICmpInst::isEquality(VecPred) && |
| 4326 | TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) && |
| 4327 | match(I->getOperand(1), m_Zero()) && |
| 4328 | match(I->getOperand(0), m_And(m_Value(), m_Value()))) |
| 4329 | return 0; |
| 4330 | |
David Sherwood | 2e080eb | 2021-01-19 15:38:03 +0000 | [diff] [blame] | 4331 | // The base case handles scalable vectors fine for now, since it treats the |
| 4332 | // cost as 1 * legalization cost. |
Philip Reames | d288574 | 2024-09-25 07:25:57 -0700 | [diff] [blame] | 4333 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
| 4334 | Op1Info, Op2Info, I); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4335 | } |
| 4336 | |
Evandro Menezes | a005c1a | 2019-08-05 18:09:14 +0000 | [diff] [blame] | 4337 | AArch64TTIImpl::TTI::MemCmpExpansionOptions |
| 4338 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
| 4339 | TTI::MemCmpExpansionOptions Options; |
Eli Friedman | e9ac757 | 2020-04-06 15:17:02 -0700 | [diff] [blame] | 4340 | if (ST->requiresStrictAlign()) { |
| 4341 | // TODO: Add cost modeling for strict align. Misaligned loads expand to |
| 4342 | // a bunch of instructions when strict align is enabled. |
| 4343 | return Options; |
| 4344 | } |
| 4345 | Options.AllowOverlappingLoads = true; |
Evandro Menezes | a005c1a | 2019-08-05 18:09:14 +0000 | [diff] [blame] | 4346 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
| 4347 | Options.NumLoadsPerBlock = Options.MaxNumLoads; |
| 4348 | // TODO: Though vector loads usually perform well on AArch64, in some targets |
| 4349 | // they may wake up the FP unit, which raises the power consumption. Perhaps |
| 4350 | // they could be used with no holds barred (-O3). |
| 4351 | Options.LoadSizes = {8, 4, 2, 1}; |
Igor Kirillov | 849f963 | 2023-10-30 18:40:48 +0000 | [diff] [blame] | 4352 | Options.AllowedTailExpansions = {3, 5, 6}; |
Evandro Menezes | a005c1a | 2019-08-05 18:09:14 +0000 | [diff] [blame] | 4353 | return Options; |
| 4354 | } |
| 4355 | |
Tiehu Zhang | b329156 | 2022-06-17 18:24:23 +0800 | [diff] [blame] | 4356 | bool AArch64TTIImpl::prefersVectorizedAddressing() const { |
| 4357 | return ST->hasSVE(); |
| 4358 | } |
| 4359 | |
David Sherwood | a458b78 | 2021-04-16 16:08:38 +0100 | [diff] [blame] | 4360 | InstructionCost |
| 4361 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
| 4362 | Align Alignment, unsigned AddressSpace, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 4363 | TTI::TargetCostKind CostKind) const { |
Matthew Devereau | e00f22c | 2021-08-19 11:42:20 +0100 | [diff] [blame] | 4364 | if (useNeonVector(Src)) |
David Sherwood | a458b78 | 2021-04-16 16:08:38 +0100 | [diff] [blame] | 4365 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
| 4366 | CostKind); |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 4367 | auto LT = getTypeLegalizationCost(Src); |
Kerry McLaughlin | 5db5275 | 2021-06-08 10:49:22 +0100 | [diff] [blame] | 4368 | if (!LT.first.isValid()) |
| 4369 | return InstructionCost::getInvalid(); |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4370 | |
David Sherwood | 2dd4167 | 2024-06-25 15:04:24 +0100 | [diff] [blame] | 4371 | // Return an invalid cost for element types that we are unable to lower. |
| 4372 | auto *VT = cast<VectorType>(Src); |
| 4373 | if (VT->getElementType()->isIntegerTy(1)) |
| 4374 | return InstructionCost::getInvalid(); |
| 4375 | |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4376 | // The code-generator is currently not able to handle scalable vectors |
| 4377 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 4378 | // it. This change will be removed when code-generation for these types is |
| 4379 | // sufficiently reliable. |
David Sherwood | 2dd4167 | 2024-06-25 15:04:24 +0100 | [diff] [blame] | 4380 | if (VT->getElementCount() == ElementCount::getScalable(1)) |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4381 | return InstructionCost::getInvalid(); |
| 4382 | |
liqinweng | 6efb45f | 2022-12-09 12:45:42 +0800 | [diff] [blame] | 4383 | return LT.first; |
David Sherwood | a458b78 | 2021-04-16 16:08:38 +0100 | [diff] [blame] | 4384 | } |
| 4385 | |
Madhur Amilkanthwar | b73771c | 2024-08-14 10:12:40 +0530 | [diff] [blame] | 4386 | // This function returns the gather/scatter overhead either from a
| 4387 | // user-provided value or from the per-target values in \p ST.
| 4388 | static unsigned getSVEGatherScatterOverhead(unsigned Opcode, |
| 4389 | const AArch64Subtarget *ST) { |
| 4390 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && |
| 4391 | "Should be called on only load or stores."); |
| 4392 | switch (Opcode) { |
| 4393 | case Instruction::Load: |
| 4394 | if (SVEGatherOverhead.getNumOccurrences() > 0) |
| 4395 | return SVEGatherOverhead; |
| 4396 | return ST->getGatherOverhead(); |
| 4398 | case Instruction::Store: |
| 4399 | if (SVEScatterOverhead.getNumOccurrences() > 0) |
| 4400 | return SVEScatterOverhead; |
| 4401 | return ST->getScatterOverhead(); |
| 4403 | default: |
| 4404 | llvm_unreachable("Shouldn't have reached here"); |
| 4405 | } |
David Sherwood | 8b0448c | 2021-12-06 11:02:29 +0000 | [diff] [blame] | 4406 | } |
| 4407 | |
Sander de Smalen | fd1f8a5 | 2021-01-22 21:25:50 +0000 | [diff] [blame] | 4408 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( |
Caroline Concatto | 060cfd9 | 2020-12-17 16:15:28 +0000 | [diff] [blame] | 4409 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 4410 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { |
Antonio Frighetto | 138e6c1 | 2023-10-27 17:30:31 +0200 | [diff] [blame] | 4411 | if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy)) |
Caroline Concatto | 01c190e | 2021-01-07 09:07:06 +0000 | [diff] [blame] | 4412 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
| 4413 | Alignment, CostKind, I); |
Caroline Concatto | 060cfd9 | 2020-12-17 16:15:28 +0000 | [diff] [blame] | 4414 | auto *VT = cast<VectorType>(DataTy); |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 4415 | auto LT = getTypeLegalizationCost(DataTy); |
Kerry McLaughlin | 5db5275 | 2021-06-08 10:49:22 +0100 | [diff] [blame] | 4416 | if (!LT.first.isValid()) |
| 4417 | return InstructionCost::getInvalid(); |
| 4418 | |
David Sherwood | 2dd4167 | 2024-06-25 15:04:24 +0100 | [diff] [blame] | 4419 | // Return an invalid cost for element types that we are unable to lower. |
Antonio Frighetto | 138e6c1 | 2023-10-27 17:30:31 +0200 | [diff] [blame] | 4420 | if (!LT.second.isVector() || |
David Sherwood | 2dd4167 | 2024-06-25 15:04:24 +0100 | [diff] [blame] | 4421 | !isElementTypeLegalForScalableVector(VT->getElementType()) || |
| 4422 | VT->getElementType()->isIntegerTy(1)) |
Antonio Frighetto | 138e6c1 | 2023-10-27 17:30:31 +0200 | [diff] [blame] | 4423 | return InstructionCost::getInvalid(); |
| 4424 | |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4425 | // The code-generator is currently not able to handle scalable vectors |
| 4426 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 4427 | // it. This change will be removed when code-generation for these types is |
| 4428 | // sufficiently reliable. |
David Sherwood | 2dd4167 | 2024-06-25 15:04:24 +0100 | [diff] [blame] | 4429 | if (VT->getElementCount() == ElementCount::getScalable(1)) |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4430 | return InstructionCost::getInvalid(); |
| 4431 | |
Caroline Concatto | 060cfd9 | 2020-12-17 16:15:28 +0000 | [diff] [blame] | 4432 | ElementCount LegalVF = LT.second.getVectorElementCount(); |
Sander de Smalen | 03f47bd | 2021-01-23 12:14:21 +0000 | [diff] [blame] | 4433 | InstructionCost MemOpCost = |
Alexey Bataev | d53e245 | 2022-08-19 05:13:25 -0700 | [diff] [blame] | 4434 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, |
Philip Reames | c9608d5 | 2022-08-22 15:26:36 -0700 | [diff] [blame] | 4435 | {TTI::OK_AnyValue, TTI::OP_None}, I); |
David Sherwood | 8b0448c | 2021-12-06 11:02:29 +0000 | [diff] [blame] | 4436 | // Add on an overhead cost for using gathers/scatters. |
Madhur Amilkanthwar | b73771c | 2024-08-14 10:12:40 +0530 | [diff] [blame] | 4437 | MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST); |
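// The final cost models the gather/scatter as one scalar memory access per
// element of the legalized type (using the maximum element count for
// scalable vectors), scaled by the overhead applied above and by any
// type-splitting factor (LT.first).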
David Sherwood | 9448cdc | 2021-09-22 10:54:05 +0100 | [diff] [blame] | 4438 | return LT.first * MemOpCost * getMaxNumElements(LegalVF); |
Caroline Concatto | 060cfd9 | 2020-12-17 16:15:28 +0000 | [diff] [blame] | 4439 | } |
| 4440 | |
Caroline Concatto | 37f4ccb | 2020-11-06 15:53:59 +0000 | [diff] [blame] | 4441 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { |
| 4442 | return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); |
| 4443 | } |
| 4444 | |
Sander de Smalen | 03f47bd | 2021-01-23 12:14:21 +0000 | [diff] [blame] | 4445 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, |
Sergei Barannikov | 3334c35 | 2025-04-22 11:40:12 +0300 | [diff] [blame] | 4446 | Align Alignment, |
Sander de Smalen | 03f47bd | 2021-01-23 12:14:21 +0000 | [diff] [blame] | 4447 | unsigned AddressSpace, |
| 4448 | TTI::TargetCostKind CostKind, |
Philip Reames | c9608d5 | 2022-08-22 15:26:36 -0700 | [diff] [blame] | 4449 | TTI::OperandValueInfo OpInfo, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 4450 | const Instruction *I) const { |
Sjoerd Meijer | ee75213 | 2021-07-01 14:45:54 +0100 | [diff] [blame] | 4451 | EVT VT = TLI->getValueType(DL, Ty, true); |
Sam Parker | 5b5e78ad | 2020-06-08 15:25:03 +0100 | [diff] [blame] | 4452 | // Type legalization can't handle structs |
Sjoerd Meijer | ee75213 | 2021-07-01 14:45:54 +0100 | [diff] [blame] | 4453 | if (VT == MVT::Other) |
Sam Parker | 5b5e78ad | 2020-06-08 15:25:03 +0100 | [diff] [blame] | 4454 | return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, |
| 4455 | CostKind); |
| 4456 | |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 4457 | auto LT = getTypeLegalizationCost(Ty); |
Kerry McLaughlin | 5db5275 | 2021-06-08 10:49:22 +0100 | [diff] [blame] | 4458 | if (!LT.first.isValid()) |
| 4459 | return InstructionCost::getInvalid(); |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4460 | |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4461 | // The code-generator is currently not able to handle scalable vectors |
| 4462 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 4463 | // it. This change will be removed when code-generation for these types is |
| 4464 | // sufficiently reliable. |
David Sherwood | 2dd4167 | 2024-06-25 15:04:24 +0100 | [diff] [blame] | 4465 | // We also only support full register predicate loads and stores. |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4466 | if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) |
David Sherwood | 2dd4167 | 2024-06-25 15:04:24 +0100 | [diff] [blame] | 4467 | if (VTy->getElementCount() == ElementCount::getScalable(1) || |
| 4468 | (VTy->getElementType()->isIntegerTy(1) && |
| 4469 | !VTy->getElementCount().isKnownMultipleOf( |
| 4470 | ElementCount::getScalable(16)))) |
Sander de Smalen | eac1670 | 2021-07-14 09:43:30 +0100 | [diff] [blame] | 4471 | return InstructionCost::getInvalid(); |
| 4472 | |
Florian Hahn | acd9cc7 | 2021-04-15 09:22:32 +0100 | [diff] [blame] | 4473 | // TODO: consider latency as well for TCK_SizeAndLatency. |
| 4474 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) |
| 4475 | return LT.first; |
| 4476 | |
| 4477 | if (CostKind != TTI::TCK_RecipThroughput) |
| 4478 | return 1; |
| 4479 | |
Matthew Simpson | 2c8de19 | 2016-12-15 18:36:59 +0000 | [diff] [blame] | 4480 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && |
Sergei Barannikov | 3334c35 | 2025-04-22 11:40:12 +0300 | [diff] [blame] | 4481 | LT.second.is128BitVector() && Alignment < Align(16)) { |
Evandro Menezes | 330e1b8 | 2017-01-10 23:42:21 +0000 | [diff] [blame] | 4482 | // Unaligned stores are extremely inefficient. We don't split all |
| 4483 | // unaligned 128-bit stores because of the negative impact that has been
| 4484 | // shown in practice on inlined block copy code.
| 4485 | // We make such stores expensive so that we will only vectorize if there |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4486 | // are 6 other instructions getting vectorized. |
Evandro Menezes | 330e1b8 | 2017-01-10 23:42:21 +0000 | [diff] [blame] | 4487 | const int AmortizationCost = 6; |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4488 | |
| 4489 | return LT.first * 2 * AmortizationCost; |
| 4490 | } |
| 4491 | |
Sjoerd Meijer | 5c94fab | 2022-12-16 09:20:37 +0000 | [diff] [blame] | 4492 | // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. |
| 4493 | if (Ty->isPtrOrPtrVectorTy()) |
| 4494 | return LT.first; |
| 4495 | |
Florian Hahn | e473daa | 2024-01-17 21:32:06 +0000 | [diff] [blame] | 4496 | if (useNeonVector(Ty)) { |
| 4497 | // Check truncating stores and extending loads. |
| 4498 | if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { |
| 4499 | // v4i8 types are lowered to a scalar load/store and sshll/xtn.
| 4500 | if (VT == MVT::v4i8) |
| 4501 | return 2; |
| 4502 | // Otherwise we need to scalarize. |
| 4503 | return cast<FixedVectorType>(Ty)->getNumElements() * 2; |
| 4504 | } |
| 4505 | EVT EltVT = VT.getVectorElementType(); |
| 4506 | unsigned EltSize = EltVT.getScalarSizeInBits(); |
| 4507 | if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 || |
Sergei Barannikov | 3334c35 | 2025-04-22 11:40:12 +0300 | [diff] [blame] | 4508 | VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1)) |
Florian Hahn | e473daa | 2024-01-17 21:32:06 +0000 | [diff] [blame] | 4509 | return LT.first; |
| 4510 | // FIXME: v3i8 lowering currently is very inefficient, due to automatic |
| 4511 | // widening to v4i8, which produces suboptimal results. |
| 4512 | if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8) |
| 4513 | return LT.first; |
| 4514 | |
| 4515 | // Check non-power-of-2 loads/stores for legal vector element types with |
| 4516 | // NEON. Non-power-of-2 memory ops will get broken down to a set of |
| 4517 | // operations on smaller power-of-2 ops, including ld1/st1. |
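// For example, a v7i8 access is split into v4i8 + v3i8, and the v3i8 part
// into v2i8 + v1i8, giving a total cost of 3 power-of-2 operations.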
| 4518 | LLVMContext &C = Ty->getContext(); |
| 4519 | InstructionCost Cost(0); |
| 4520 | SmallVector<EVT> TypeWorklist; |
| 4521 | TypeWorklist.push_back(VT); |
| 4522 | while (!TypeWorklist.empty()) { |
| 4523 | EVT CurrVT = TypeWorklist.pop_back_val(); |
| 4524 | unsigned CurrNumElements = CurrVT.getVectorNumElements(); |
| 4525 | if (isPowerOf2_32(CurrNumElements)) { |
| 4526 | Cost += 1; |
| 4527 | continue; |
| 4528 | } |
| 4529 | |
| 4530 | unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2; |
| 4531 | TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2)); |
| 4532 | TypeWorklist.push_back( |
| 4533 | EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2)); |
| 4534 | } |
| 4535 | return Cost; |
Tim Northover | 3b0846e | 2014-05-24 12:50:23 +0000 | [diff] [blame] | 4536 | } |
| 4537 | |
| 4538 | return LT.first; |
| 4539 | } |
James Molloy | 2b8933c | 2014-08-05 12:30:34 +0000 | [diff] [blame] | 4540 | |
Sander de Smalen | 03f47bd | 2021-01-23 12:14:21 +0000 | [diff] [blame] | 4541 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( |
Guillaume Chatelet | fdc7c7f | 2020-06-26 11:00:53 +0000 | [diff] [blame] | 4542 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
| 4543 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4544 | bool UseMaskForCond, bool UseMaskForGaps) const { |
Hao Liu | 7ec8ee3 | 2015-06-26 02:32:07 +0000 | [diff] [blame] | 4545 | assert(Factor >= 2 && "Invalid interleave factor"); |
Graham Hunter | 95bfb19 | 2023-03-21 11:48:49 +0000 | [diff] [blame] | 4546 | auto *VecVTy = cast<VectorType>(VecTy); |
| 4547 | |
Philip Reames | b3c687b | 2024-10-15 07:37:46 -0700 | [diff] [blame] | 4548 | if (VecTy->isScalableTy() && !ST->hasSVE()) |
Graham Hunter | 95bfb19 | 2023-03-21 11:48:49 +0000 | [diff] [blame] | 4549 | return InstructionCost::getInvalid(); |
Hao Liu | 7ec8ee3 | 2015-06-26 02:32:07 +0000 | [diff] [blame] | 4550 | |
Igor Kirillov | 17bde32 | 2023-06-12 10:18:16 +0000 | [diff] [blame] | 4551 | // Vectorization for masked interleaved accesses is only enabled for scalable |
| 4552 | // VF. |
| 4553 | if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) |
| 4554 | return InstructionCost::getInvalid(); |
| 4555 | |
| 4556 | if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { |
Graham Hunter | 95bfb19 | 2023-03-21 11:48:49 +0000 | [diff] [blame] | 4557 | unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); |
Christopher Tetreault | 616d8d9 | 2020-06-16 13:42:58 -0700 | [diff] [blame] | 4558 | auto *SubVecTy = |
Graham Hunter | 95bfb19 | 2023-03-21 11:48:49 +0000 | [diff] [blame] | 4559 | VectorType::get(VecVTy->getElementType(), |
| 4560 | VecVTy->getElementCount().divideCoefficientBy(Factor)); |
Hao Liu | 7ec8ee3 | 2015-06-26 02:32:07 +0000 | [diff] [blame] | 4561 | |
| 4562 | // ldN/stN only support legal vector types of size 64 or 128 in bits. |
Matthew Simpson | aee9771 | 2017-03-02 15:15:35 +0000 | [diff] [blame] | 4563 | // Accesses having vector types that are a multiple of 128 bits can be |
| 4564 | // matched to more than one ldN/stN instruction. |
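// In that case the returned cost is Factor times the number of ldN/stN
// accesses required for the legal sub-vector type.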
Bradley Smith | 13faa5f | 2021-10-18 12:29:26 +0000 | [diff] [blame] | 4565 | bool UseScalable; |
Graham Hunter | 95bfb19 | 2023-03-21 11:48:49 +0000 | [diff] [blame] | 4566 | if (MinElts % Factor == 0 && |
Bradley Smith | 13faa5f | 2021-10-18 12:29:26 +0000 | [diff] [blame] | 4567 | TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) |
| 4568 | return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); |
Hao Liu | 7ec8ee3 | 2015-06-26 02:32:07 +0000 | [diff] [blame] | 4569 | } |
| 4570 | |
| 4571 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
Sam Parker | 40574fe | 2020-04-28 14:11:27 +0100 | [diff] [blame] | 4572 | Alignment, AddressSpace, CostKind, |
Dorit Nuzman | 34da6dd | 2018-10-31 09:57:56 +0000 | [diff] [blame] | 4573 | UseMaskForCond, UseMaskForGaps); |
Hao Liu | 7ec8ee3 | 2015-06-26 02:32:07 +0000 | [diff] [blame] | 4574 | } |
| 4575 | |
Daniil Fukalov | e1cb98b | 2021-05-20 12:09:16 +0300 | [diff] [blame] | 4576 | InstructionCost |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4577 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const { |
Sander de Smalen | 03f47bd | 2021-01-23 12:14:21 +0000 | [diff] [blame] | 4578 | InstructionCost Cost = 0; |
Sam Parker | 40574fe | 2020-04-28 14:11:27 +0100 | [diff] [blame] | 4579 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
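// 128-bit vector values that are live across a call are assumed to be
// spilled and refilled around it, so charge one 128-bit store plus one
// 128-bit load for each such type.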
James Molloy | 2b8933c | 2014-08-05 12:30:34 +0000 | [diff] [blame] | 4580 | for (auto *I : Tys) { |
| 4581 | if (!I->isVectorTy()) |
| 4582 | continue; |
Christopher Tetreault | ab35ba5 | 2020-06-30 11:07:24 -0700 | [diff] [blame] | 4583 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == |
| 4584 | 128) |
Sam Parker | 40574fe | 2020-04-28 14:11:27 +0100 | [diff] [blame] | 4585 | Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + |
| 4586 | getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); |
James Molloy | 2b8933c | 2014-08-05 12:30:34 +0000 | [diff] [blame] | 4587 | } |
Daniil Fukalov | e1cb98b | 2021-05-20 12:09:16 +0300 | [diff] [blame] | 4588 | return Cost; |
James Molloy | 2b8933c | 2014-08-05 12:30:34 +0000 | [diff] [blame] | 4589 | } |
James Molloy | a88896b | 2014-08-21 00:02:51 +0000 | [diff] [blame] | 4590 | |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4591 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const { |
Matthias Braun | 651cff4 | 2016-06-02 18:03:53 +0000 | [diff] [blame] | 4592 | return ST->getMaxInterleaveFactor(); |
James Molloy | a88896b | 2014-08-21 00:02:51 +0000 | [diff] [blame] | 4593 | } |
Kevin Qin | 72a799a | 2014-10-09 10:13:27 +0000 | [diff] [blame] | 4594 | |
Geoff Berry | 378374d | 2017-06-28 18:53:09 +0000 | [diff] [blame] | 4595 | // For Falkor, we want to avoid having too many strided loads in a loop since |
| 4596 | // that can exhaust the HW prefetcher resources. We adjust the unroller |
| 4597 | // MaxCount preference below to attempt to ensure unrolling doesn't create too |
| 4598 | // many strided loads. |
| 4599 | static void |
| 4600 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
| 4601 | TargetTransformInfo::UnrollingPreferences &UP) { |
Geoff Berry | 0abd980 | 2017-06-28 19:36:10 +0000 | [diff] [blame] | 4602 | enum { MaxStridedLoads = 7 }; |
Geoff Berry | 378374d | 2017-06-28 18:53:09 +0000 | [diff] [blame] | 4603 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { |
| 4604 | int StridedLoads = 0; |
| 4605 | // FIXME? We could make this more precise by looking at the CFG and |
| 4606 | // e.g. not counting loads in each side of an if-then-else diamond. |
| 4607 | for (const auto BB : L->blocks()) { |
| 4608 | for (auto &I : *BB) { |
| 4609 | LoadInst *LMemI = dyn_cast<LoadInst>(&I); |
| 4610 | if (!LMemI) |
| 4611 | continue; |
| 4612 | |
| 4613 | Value *PtrValue = LMemI->getPointerOperand(); |
| 4614 | if (L->isLoopInvariant(PtrValue)) |
| 4615 | continue; |
| 4616 | |
| 4617 | const SCEV *LSCEV = SE.getSCEV(PtrValue); |
| 4618 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); |
| 4619 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) |
| 4620 | continue; |
| 4621 | |
| 4622 | // FIXME? We could take pairing of unrolled load copies into account |
| 4623 | // by looking at the AddRec, but we would probably have to limit this |
| 4624 | // to loops with no stores or other memory optimization barriers. |
| 4625 | ++StridedLoads; |
| 4626 | // We've seen enough strided loads that seeing more won't make a |
| 4627 | // difference. |
| 4628 | if (StridedLoads > MaxStridedLoads / 2) |
| 4629 | return StridedLoads; |
| 4630 | } |
| 4631 | } |
| 4632 | return StridedLoads; |
| 4633 | }; |
| 4634 | |
| 4635 | int StridedLoads = countStridedLoads(L, SE); |
Nicola Zaghen | d34e60c | 2018-05-14 12:53:11 +0000 | [diff] [blame] | 4636 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads |
| 4637 | << " strided loads\n"); |
Geoff Berry | 378374d | 2017-06-28 18:53:09 +0000 | [diff] [blame] | 4638 | // Pick the largest power of 2 unroll count that won't result in too many |
| 4639 | // strided loads. |
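// For example, with 2 strided loads detected the unroll count is capped at
// 1 << Log2_32(7 / 2) == 2.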
| 4640 | if (StridedLoads) { |
| 4641 | UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); |
Nicola Zaghen | d34e60c | 2018-05-14 12:53:11 +0000 | [diff] [blame] | 4642 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " |
| 4643 | << UP.MaxCount << '\n'); |
Geoff Berry | 378374d | 2017-06-28 18:53:09 +0000 | [diff] [blame] | 4644 | } |
| 4645 | } |
| 4646 | |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4647 | // This function returns true if the loop: |
| 4648 | // 1. Has a valid cost, and |
| 4649 | // 2. Has a cost within the supplied budget. |
| 4650 | // Otherwise it returns false. |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4651 | static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4652 | InstructionCost Budget, |
| 4653 | unsigned *FinalSize) { |
| 4654 | // Estimate the size of the loop. |
| 4655 | InstructionCost LoopCost = 0; |
| 4656 | |
| 4657 | for (auto *BB : L->getBlocks()) { |
| 4658 | for (auto &I : *BB) { |
| 4659 | SmallVector<const Value *, 4> Operands(I.operand_values()); |
| 4660 | InstructionCost Cost = |
| 4661 | TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize); |
| 4662 | // This can happen with intrinsics that don't currently have a cost model |
| 4663 | // or for some operations that require SVE. |
| 4664 | if (!Cost.isValid()) |
| 4665 | return false; |
| 4666 | |
| 4667 | LoopCost += Cost; |
| 4668 | if (LoopCost > Budget) |
| 4669 | return false; |
| 4670 | } |
| 4671 | } |
| 4672 | |
| 4673 | if (FinalSize) |
David Green | 98b6f8d | 2025-04-23 07:46:27 +0100 | [diff] [blame] | 4674 | *FinalSize = LoopCost.getValue(); |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4675 | return true; |
| 4676 | } |
| 4677 | |
| 4678 | static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4679 | const AArch64TTIImpl &TTI) { |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4680 | // Only consider loops with unknown trip counts for which we can determine |
| 4681 | // a symbolic expression. Multi-exit loops with small known trip counts will |
| 4682 | // likely be unrolled anyway. |
| 4683 | const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); |
| 4684 | if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC)) |
| 4685 | return false; |
| 4686 | |
| 4687 | // It might not be worth unrolling loops with low max trip counts. Restrict |
| 4688 | // this to max trip counts > 32 for now. |
| 4689 | unsigned MaxTC = SE.getSmallConstantMaxTripCount(L); |
| 4690 | if (MaxTC > 0 && MaxTC <= 32) |
| 4691 | return false; |
| 4692 | |
| 4693 | // Make sure the loop size is <= 5. |
| 4694 | if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr)) |
| 4695 | return false; |
| 4696 | |
| 4697 | // Small search loops with multiple exits can be highly beneficial to unroll. |
| 4698 | // We only care about loops with exactly two exiting blocks, although each |
| 4699 | // block could jump to the same exit block. |
| 4700 | ArrayRef<BasicBlock *> Blocks = L->getBlocks(); |
| 4701 | if (Blocks.size() != 2) |
| 4702 | return false; |
| 4703 | |
| 4704 | if (any_of(Blocks, [](BasicBlock *BB) { |
| 4705 | return !isa<BranchInst>(BB->getTerminator()); |
| 4706 | })) |
| 4707 | return false; |
| 4708 | |
| 4709 | return true; |
| 4710 | } |
| 4711 | |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4712 | /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
| 4713 | /// OOO engine's wide instruction window and various predictors. |
| 4714 | static void |
| 4715 | getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, |
| 4716 | TargetTransformInfo::UnrollingPreferences &UP, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4717 | const AArch64TTIImpl &TTI) { |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4718 | // Limit loops with structure that is highly likely to benefit from runtime |
Florian Hahn | 46a13a5 | 2025-02-27 14:42:45 +0000 | [diff] [blame] | 4719 | // unrolling; that is, we exclude outer loops and loops with many blocks (i.e.
| 4720 | // likely with complex control flow). Note that the heuristics here may be |
| 4721 | // overly conservative and we err on the side of avoiding runtime unrolling |
| 4722 | // rather than unrolling excessively. They are all subject to further refinement.
| 4723 | if (!L->isInnermost() || L->getNumBlocks() > 8) |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4724 | return; |
| 4725 | |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4726 | // Loops with multiple exits are handled by common code. |
| 4727 | if (!L->getExitBlock()) |
| 4728 | return; |
| 4729 | |
Florian Hahn | 46a13a5 | 2025-02-27 14:42:45 +0000 | [diff] [blame] | 4730 | const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4731 | if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) || |
| 4732 | (SE.getSmallConstantMaxTripCount(L) > 0 && |
| 4733 | SE.getSmallConstantMaxTripCount(L) <= 32)) |
| 4734 | return; |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4735 | |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4736 | if (findStringMetadataForLoop(L, "llvm.loop.isvectorized")) |
| 4737 | return; |
| 4738 | |
Florian Hahn | 46a13a5 | 2025-02-27 14:42:45 +0000 | [diff] [blame] | 4739 | if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L)) |
| 4740 | return; |
| 4741 | |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4742 | // Limit to loops with trip counts that are cheap to expand. |
| 4743 | UP.SCEVExpansionBudget = 1; |
| 4744 | |
| 4745 | // Try to unroll small, single block loops, if they have load/store |
| 4746 | // dependencies, to expose more parallel memory access streams. |
Florian Hahn | d486b76 | 2024-12-22 13:10:54 +0000 | [diff] [blame] | 4747 | BasicBlock *Header = L->getHeader(); |
| 4748 | if (Header == L->getLoopLatch()) { |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4749 | // Estimate the size of the loop. |
| 4750 | unsigned Size; |
| 4751 | if (!isLoopSizeWithinBudget(L, TTI, 8, &Size)) |
Florian Hahn | d486b76 | 2024-12-22 13:10:54 +0000 | [diff] [blame] | 4752 | return; |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4753 | |
Florian Hahn | d486b76 | 2024-12-22 13:10:54 +0000 | [diff] [blame] | 4754 | SmallPtrSet<Value *, 8> LoadedValues; |
| 4755 | SmallVector<StoreInst *> Stores; |
| 4756 | for (auto *BB : L->blocks()) { |
| 4757 | for (auto &I : *BB) { |
| 4758 | Value *Ptr = getLoadStorePointerOperand(&I); |
| 4759 | if (!Ptr) |
| 4760 | continue; |
| 4761 | const SCEV *PtrSCEV = SE.getSCEV(Ptr); |
| 4762 | if (SE.isLoopInvariant(PtrSCEV, L)) |
| 4763 | continue; |
| 4764 | if (isa<LoadInst>(&I)) |
| 4765 | LoadedValues.insert(&I); |
| 4766 | else |
| 4767 | Stores.push_back(cast<StoreInst>(&I)); |
| 4768 | } |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4769 | } |
Florian Hahn | d486b76 | 2024-12-22 13:10:54 +0000 | [diff] [blame] | 4770 | |
| 4771 | // Try to find an unroll count that maximizes the use of the instruction |
| 4772 | // window, i.e. trying to fetch as many instructions per cycle as possible. |
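// For example, a 6-instruction loop body ends up with an unroll count of
// 8 (48 instructions, i.e. three full 16-instruction fetch lines).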
| 4773 | unsigned MaxInstsPerLine = 16; |
| 4774 | unsigned UC = 1; |
| 4775 | unsigned BestUC = 1; |
| 4776 | unsigned SizeWithBestUC = BestUC * Size; |
| 4777 | while (UC <= 8) { |
| 4778 | unsigned SizeWithUC = UC * Size; |
| 4779 | if (SizeWithUC > 48) |
| 4780 | break; |
| 4781 | if ((SizeWithUC % MaxInstsPerLine) == 0 || |
| 4782 | (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) { |
| 4783 | BestUC = UC; |
| 4784 | SizeWithBestUC = BestUC * Size; |
| 4785 | } |
| 4786 | UC++; |
| 4787 | } |
| 4788 | |
| 4789 | if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) { |
| 4790 | return LoadedValues.contains(SI->getOperand(0)); |
| 4791 | })) |
| 4792 | return; |
| 4793 | |
| 4794 | UP.Runtime = true; |
| 4795 | UP.DefaultUnrollRuntimeCount = BestUC; |
| 4796 | return; |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4797 | } |
| 4798 | |
Florian Hahn | d486b76 | 2024-12-22 13:10:54 +0000 | [diff] [blame] | 4799 | // Try to runtime-unroll loops with early-continues depending on loop-varying |
| 4800 | // loads; this helps with branch-prediction for the early-continues. |
| 4801 | auto *Term = dyn_cast<BranchInst>(Header->getTerminator()); |
| 4802 | auto *Latch = L->getLoopLatch(); |
| 4803 | SmallVector<BasicBlock *> Preds(predecessors(Latch)); |
| 4804 | if (!Term || !Term->isConditional() || Preds.size() == 1 || |
| 4805 | none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) || |
| 4806 | none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); })) |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4807 | return; |
| 4808 | |
Florian Hahn | d486b76 | 2024-12-22 13:10:54 +0000 | [diff] [blame] | 4809 | std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad = |
| 4810 | [&](Instruction *I, unsigned Depth) -> bool { |
| 4811 | if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8) |
| 4812 | return false; |
| 4813 | |
| 4814 | if (isa<LoadInst>(I)) |
| 4815 | return true; |
| 4816 | |
| 4817 | return any_of(I->operands(), [&](Value *V) { |
| 4818 | auto *I = dyn_cast<Instruction>(V); |
| 4819 | return I && DependsOnLoopLoad(I, Depth + 1); |
| 4820 | }); |
| 4821 | }; |
| 4822 | CmpPredicate Pred; |
| 4823 | Instruction *I; |
| 4824 | if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(), |
| 4825 | m_Value())) && |
| 4826 | DependsOnLoopLoad(I, 0)) { |
| 4827 | UP.Runtime = true; |
| 4828 | } |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4829 | } |
| 4830 | |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4831 | void AArch64TTIImpl::getUnrollingPreferences( |
| 4832 | Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, |
| 4833 | OptimizationRemarkEmitter *ORE) const { |
Kevin Qin | aef6841 | 2015-03-09 06:14:28 +0000 | [diff] [blame] | 4834 | // Enable partial unrolling and runtime unrolling. |
Roman Lebedev | 6f6e9a8 | 2021-08-03 00:57:26 +0300 | [diff] [blame] | 4835 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); |
Kevin Qin | aef6841 | 2015-03-09 06:14:28 +0000 | [diff] [blame] | 4836 | |
Jingu Kang | 94c49529 | 2021-07-14 11:43:29 +0100 | [diff] [blame] | 4837 | UP.UpperBound = true; |
| 4838 | |
Kevin Qin | aef6841 | 2015-03-09 06:14:28 +0000 | [diff] [blame] | 4839 | // An inner loop is more likely to be a hot one, and the runtime check
| 4840 | // can be promoted out by the LICM pass, so the overhead is less; let's try
| 4841 | // a larger threshold to unroll more loops.
| 4842 | if (L->getLoopDepth() > 1) |
| 4843 | UP.PartialThreshold *= 2; |
| 4844 | |
Kevin Qin | 72a799a | 2014-10-09 10:13:27 +0000 | [diff] [blame] | 4845 | // Disable partial & runtime unrolling on -Os. |
| 4846 | UP.PartialOptSizeThreshold = 0; |
Geoff Berry | 378374d | 2017-06-28 18:53:09 +0000 | [diff] [blame] | 4847 | |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4848 | // Scan the loop: don't unroll loops with calls as this could prevent |
| 4849 | // inlining. Don't unroll vector loops either, as they don't benefit much from |
| 4850 | // unrolling. |
| 4851 | for (auto *BB : L->getBlocks()) { |
| 4852 | for (auto &I : *BB) { |
| 4853 | // Don't unroll vectorised loops.
| 4854 | if (I.getType()->isVectorTy()) |
| 4855 | return; |
| 4856 | |
| 4857 | if (isa<CallBase>(I)) { |
| 4858 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) |
| 4859 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) |
| 4860 | if (!isLoweredToCall(F)) |
| 4861 | continue; |
| 4862 | return; |
| 4863 | } |
| 4864 | } |
| 4865 | } |
| 4866 | |
Florian Hahn | 0bb7bd4 | 2024-12-09 14:28:31 +0000 | [diff] [blame] | 4867 | // Apply subtarget-specific unrolling preferences. |
| 4868 | switch (ST->getProcFamily()) { |
| 4869 | case AArch64Subtarget::AppleA14: |
| 4870 | case AArch64Subtarget::AppleA15: |
| 4871 | case AArch64Subtarget::AppleA16: |
| 4872 | case AArch64Subtarget::AppleM4: |
| 4873 | getAppleRuntimeUnrollPreferences(L, SE, UP, *this); |
| 4874 | break; |
| 4875 | case AArch64Subtarget::Falkor: |
| 4876 | if (EnableFalkorHWPFUnrollFix) |
| 4877 | getFalkorUnrollingPreferences(L, SE, UP); |
| 4878 | break; |
| 4879 | default: |
| 4880 | break; |
| 4881 | } |
Nicholas Guy | 2b6e0c9 | 2021-03-04 14:36:13 +0000 | [diff] [blame] | 4882 | |
David Sherwood | 712c213 | 2025-04-09 10:34:27 +0100 | [diff] [blame] | 4883 | // If this is a small, multi-exit loop similar to something like std::find, |
| 4884 | // then there is typically a performance improvement achieved by unrolling. |
| 4885 | if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) { |
| 4886 | UP.RuntimeUnrollMultiExit = true; |
| 4887 | UP.Runtime = true; |
| 4888 | // Limit unroll count. |
| 4889 | UP.DefaultUnrollRuntimeCount = 4; |
| 4890 | // Allow slightly more costly trip-count expansion to catch search loops |
| 4891 | // with pointer inductions. |
| 4892 | UP.SCEVExpansionBudget = 5; |
| 4893 | return; |
Nicholas Guy | 2b6e0c9 | 2021-03-04 14:36:13 +0000 | [diff] [blame] | 4894 | } |
| 4895 | |
| 4896 | // Enable runtime unrolling for in-order models.
| 4897 | // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
| 4898 | // by checking for that case, we can ensure that the default behaviour is
| 4899 | // unchanged.
David Green | 6424abc | 2025-02-07 10:16:57 +0000 | [diff] [blame] | 4900 | if (ST->getProcFamily() != AArch64Subtarget::Generic && |
Nicholas Guy | 2b6e0c9 | 2021-03-04 14:36:13 +0000 | [diff] [blame] | 4901 | !ST->getSchedModel().isOutOfOrder()) { |
| 4902 | UP.Runtime = true; |
| 4903 | UP.Partial = true; |
Nicholas Guy | 2b6e0c9 | 2021-03-04 14:36:13 +0000 | [diff] [blame] | 4904 | UP.UnrollRemainder = true; |
| 4905 | UP.DefaultUnrollRuntimeCount = 4; |
Nicholas Guy | 3043cbc | 2021-05-26 14:49:58 +0100 | [diff] [blame] | 4906 | |
| 4907 | UP.UnrollAndJam = true; |
| 4908 | UP.UnrollAndJamInnerLoopThreshold = 60; |
Nicholas Guy | 2b6e0c9 | 2021-03-04 14:36:13 +0000 | [diff] [blame] | 4909 | } |
Kevin Qin | 72a799a | 2014-10-09 10:13:27 +0000 | [diff] [blame] | 4910 | } |
Chad Rosier | f9327d6f | 2015-01-26 22:51:15 +0000 | [diff] [blame] | 4911 | |
Sidharth Baveja | e541e1b | 2020-07-10 18:38:08 +0000 | [diff] [blame] | 4912 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4913 | TTI::PeelingPreferences &PP) const { |
Sidharth Baveja | e541e1b | 2020-07-10 18:38:08 +0000 | [diff] [blame] | 4914 | BaseT::getPeelingPreferences(L, SE, PP); |
| 4915 | } |
| 4916 | |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4917 | Value * |
| 4918 | AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
| 4919 | Type *ExpectedType) const { |
Chad Rosier | f9327d6f | 2015-01-26 22:51:15 +0000 | [diff] [blame] | 4920 | switch (Inst->getIntrinsicID()) { |
| 4921 | default: |
| 4922 | return nullptr; |
| 4923 | case Intrinsic::aarch64_neon_st2: |
| 4924 | case Intrinsic::aarch64_neon_st3: |
| 4925 | case Intrinsic::aarch64_neon_st4: { |
| 4926 | // Create a struct type |
| 4927 | StructType *ST = dyn_cast<StructType>(ExpectedType); |
| 4928 | if (!ST) |
| 4929 | return nullptr; |
Kazu Hirata | c1e32b3 | 2021-10-02 12:06:29 -0700 | [diff] [blame] | 4930 | unsigned NumElts = Inst->arg_size() - 1; |
Chad Rosier | f9327d6f | 2015-01-26 22:51:15 +0000 | [diff] [blame] | 4931 | if (ST->getNumElements() != NumElts) |
| 4932 | return nullptr; |
| 4933 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
| 4934 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) |
| 4935 | return nullptr; |
| 4936 | } |
Manuel Brito | 1e55d5b | 2022-11-21 18:41:01 +0000 | [diff] [blame] | 4937 | Value *Res = PoisonValue::get(ExpectedType); |
Chad Rosier | f9327d6f | 2015-01-26 22:51:15 +0000 | [diff] [blame] | 4938 | IRBuilder<> Builder(Inst); |
| 4939 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
| 4940 | Value *L = Inst->getArgOperand(i); |
| 4941 | Res = Builder.CreateInsertValue(Res, L, i); |
| 4942 | } |
| 4943 | return Res; |
| 4944 | } |
| 4945 | case Intrinsic::aarch64_neon_ld2: |
| 4946 | case Intrinsic::aarch64_neon_ld3: |
| 4947 | case Intrinsic::aarch64_neon_ld4: |
| 4948 | if (Inst->getType() == ExpectedType) |
| 4949 | return Inst; |
| 4950 | return nullptr; |
| 4951 | } |
| 4952 | } |
| 4953 | |
Chandler Carruth | 705b185 | 2015-01-31 03:43:40 +0000 | [diff] [blame] | 4954 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 4955 | MemIntrinsicInfo &Info) const { |
Chad Rosier | f9327d6f | 2015-01-26 22:51:15 +0000 | [diff] [blame] | 4956 | switch (Inst->getIntrinsicID()) { |
| 4957 | default: |
| 4958 | break; |
| 4959 | case Intrinsic::aarch64_neon_ld2: |
| 4960 | case Intrinsic::aarch64_neon_ld3: |
| 4961 | case Intrinsic::aarch64_neon_ld4: |
| 4962 | Info.ReadMem = true; |
| 4963 | Info.WriteMem = false; |
Chad Rosier | f9327d6f | 2015-01-26 22:51:15 +0000 | [diff] [blame] | 4964 | Info.PtrVal = Inst->getArgOperand(0); |
| 4965 | break; |
| 4966 | case Intrinsic::aarch64_neon_st2: |
| 4967 | case Intrinsic::aarch64_neon_st3: |
| 4968 | case Intrinsic::aarch64_neon_st4: |
| 4969 | Info.ReadMem = false; |
| 4970 | Info.WriteMem = true; |
Kazu Hirata | c1e32b3 | 2021-10-02 12:06:29 -0700 | [diff] [blame] | 4971 | Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); |
Chad Rosier | f9327d6f | 2015-01-26 22:51:15 +0000 | [diff] [blame] | 4972 | break; |
| 4973 | } |
| 4974 | |
| 4975 | switch (Inst->getIntrinsicID()) { |
| 4976 | default: |
| 4977 | return false; |
| 4978 | case Intrinsic::aarch64_neon_ld2: |
| 4979 | case Intrinsic::aarch64_neon_st2: |
| 4980 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; |
| 4981 | break; |
| 4982 | case Intrinsic::aarch64_neon_ld3: |
| 4983 | case Intrinsic::aarch64_neon_st3: |
| 4984 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; |
| 4985 | break; |
| 4986 | case Intrinsic::aarch64_neon_ld4: |
| 4987 | case Intrinsic::aarch64_neon_st4: |
| 4988 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; |
| 4989 | break; |
| 4990 | } |
| 4991 | return true; |
| 4992 | } |
Adam Nemet | 53e758f | 2016-03-18 00:27:29 +0000 | [diff] [blame] | 4993 | |
Jun Bum Lim | dee5565 | 2017-04-03 19:20:07 +0000 | [diff] [blame] | 4994 | /// See if \p I should be considered for address type promotion. We check if \p |
| 4995 | /// I is a sext with the right type and used in memory accesses. If it is used
| 4996 | /// in a "complex" getelementptr, we allow it to be promoted without finding
| 4997 | /// other sext instructions that sign extended the same initial value. A
| 4998 | /// getelementptr is considered as "complex" if it has more than 2 operands.
| 4999 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 5000 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { |
Jun Bum Lim | dee5565 | 2017-04-03 19:20:07 +0000 | [diff] [blame] | 5001 | bool Considerable = false; |
| 5002 | AllowPromotionWithoutCommonHeader = false; |
| 5003 | if (!isa<SExtInst>(&I)) |
| 5004 | return false; |
| 5005 | Type *ConsideredSExtType = |
| 5006 | Type::getInt64Ty(I.getParent()->getParent()->getContext()); |
| 5007 | if (I.getType() != ConsideredSExtType) |
| 5008 | return false; |
| 5009 | // See if the sext is the one with the right type and used in at least one |
| 5010 | // GetElementPtrInst. |
| 5011 | for (const User *U : I.users()) { |
| 5012 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { |
| 5013 | Considerable = true; |
| 5014 | // A getelementptr is considered as "complex" if it has more than 2 |
| 5015 | // operands. We will promote a SExt used in such complex GEP as we |
| 5016 | // expect some computation to be merged if they are done on 64 bits. |
| 5017 | if (GEPInst->getNumOperands() > 2) { |
| 5018 | AllowPromotionWithoutCommonHeader = true; |
| 5019 | break; |
| 5020 | } |
| 5021 | } |
| 5022 | } |
| 5023 | return Considerable; |
| 5024 | } |
| 5025 | |
Simon Pilgrim | 5e6bfb6 | 2021-06-11 10:19:37 +0100 | [diff] [blame] | 5026 | bool AArch64TTIImpl::isLegalToVectorizeReduction( |
| 5027 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { |
Kerry McLaughlin | ba1e150 | 2021-02-16 10:43:42 +0000 | [diff] [blame] | 5028 | if (!VF.isScalable()) |
| 5029 | return true; |
| 5030 | |
| 5031 | Type *Ty = RdxDesc.getRecurrenceType(); |
Kerry McLaughlin | a751240 | 2021-07-06 10:49:43 +0100 | [diff] [blame] | 5032 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) |
Kerry McLaughlin | ba1e150 | 2021-02-16 10:43:42 +0000 | [diff] [blame] | 5033 | return false; |
| 5034 | |
| 5035 | switch (RdxDesc.getRecurrenceKind()) { |
| 5036 | case RecurKind::Add: |
| 5037 | case RecurKind::FAdd: |
| 5038 | case RecurKind::And: |
| 5039 | case RecurKind::Or: |
| 5040 | case RecurKind::Xor: |
| 5041 | case RecurKind::SMin: |
| 5042 | case RecurKind::SMax: |
| 5043 | case RecurKind::UMin: |
| 5044 | case RecurKind::UMax: |
| 5045 | case RecurKind::FMin: |
| 5046 | case RecurKind::FMax: |
Rosie Sumpter | c2441b6 | 2021-10-11 15:50:44 +0100 | [diff] [blame] | 5047 | case RecurKind::FMulAdd: |
Mel Chen | 425e9e8 | 2023-07-19 02:51:15 -0700 | [diff] [blame] | 5048 | case RecurKind::IAnyOf: |
| 5049 | case RecurKind::FAnyOf: |
Kerry McLaughlin | ba1e150 | 2021-02-16 10:43:42 +0000 | [diff] [blame] | 5050 | return true; |
| 5051 | default: |
| 5052 | return false; |
| 5053 | } |
| 5054 | } |
| 5055 | |
Sander de Smalen | 2285dfb | 2021-01-22 22:07:09 +0000 | [diff] [blame] | 5056 | InstructionCost |
David Green | 12025ce | 2023-07-04 15:02:30 +0100 | [diff] [blame] | 5057 | AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
| 5058 | FastMathFlags FMF, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 5059 | TTI::TargetCostKind CostKind) const { |
David Green | 0b745a1 | 2024-08-09 14:25:07 +0100 | [diff] [blame] | 5060 | // The code-generator is currently not able to handle scalable vectors |
| 5061 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 5062 | // it. This change will be removed when code-generation for these types is |
| 5063 | // sufficiently reliable. |
| 5064 | if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) |
| 5065 | if (VTy->getElementCount() == ElementCount::getScalable(1)) |
| 5066 | return InstructionCost::getInvalid(); |
| 5067 | |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5068 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
David Green | 649cf45 | 2021-08-05 23:23:24 +0100 | [diff] [blame] | 5069 | |
| 5070 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
David Green | 12025ce | 2023-07-04 15:02:30 +0100 | [diff] [blame] | 5071 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
David Green | 649cf45 | 2021-08-05 23:23:24 +0100 | [diff] [blame] | 5072 | |
Sander de Smalen | db134e2 | 2021-01-22 21:44:23 +0000 | [diff] [blame] | 5073 | InstructionCost LegalizationCost = 0; |
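// If the type is split during legalization, each extra part is combined
// with a vector min/max of the legal type before the final horizontal
// reduction (costed as 2 below).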
Caroline Concatto | 172f1f8 | 2020-12-21 15:04:29 +0000 | [diff] [blame] | 5074 | if (LT.first > 1) { |
| 5075 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); |
David Green | 12025ce | 2023-07-04 15:02:30 +0100 | [diff] [blame] | 5076 | IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF); |
David Green | 649cf45 | 2021-08-05 23:23:24 +0100 | [diff] [blame] | 5077 | LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); |
Caroline Concatto | 172f1f8 | 2020-12-21 15:04:29 +0000 | [diff] [blame] | 5078 | } |
| 5079 | |
| 5080 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; |
| 5081 | } |
| 5082 | |
Sander de Smalen | bd86824 | 2021-01-22 21:33:51 +0000 | [diff] [blame] | 5083 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 5084 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const { |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5085 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); |
Sander de Smalen | 4f42d87 | 2021-04-14 16:53:01 +0100 | [diff] [blame] | 5086 | InstructionCost LegalizationCost = 0; |
Caroline Concatto | 172f1f8 | 2020-12-21 15:04:29 +0000 | [diff] [blame] | 5087 | if (LT.first > 1) { |
| 5088 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); |
| 5089 | LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); |
| 5090 | LegalizationCost *= LT.first - 1; |
| 5091 | } |
| 5092 | |
| 5093 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 5094 | assert(ISD && "Invalid opcode"); |
| 5095 | // Add the final reduction cost for the legal horizontal reduction |
| 5096 | switch (ISD) { |
| 5097 | case ISD::ADD: |
| 5098 | case ISD::AND: |
| 5099 | case ISD::OR: |
| 5100 | case ISD::XOR: |
| 5101 | case ISD::FADD: |
| 5102 | return LegalizationCost + 2; |
| 5103 | default: |
Sander de Smalen | bd86824 | 2021-01-22 21:33:51 +0000 | [diff] [blame] | 5104 | return InstructionCost::getInvalid(); |
Caroline Concatto | 172f1f8 | 2020-12-21 15:04:29 +0000 | [diff] [blame] | 5105 | } |
| 5106 | } |
| 5107 | |
Sander de Smalen | bd86824 | 2021-01-22 21:33:51 +0000 | [diff] [blame] | 5108 | InstructionCost |
| 5109 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
Krzysztof Parzyszek | 86fe4df | 2022-12-02 09:35:05 -0800 | [diff] [blame] | 5110 | std::optional<FastMathFlags> FMF, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 5111 | TTI::TargetCostKind CostKind) const { |
David Green | 0b745a1 | 2024-08-09 14:25:07 +0100 | [diff] [blame] | 5112 | // The code-generator is currently not able to handle scalable vectors |
| 5113 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 5114 | // it. This change will be removed when code-generation for these types is |
| 5115 | // sufficiently reliable. |
| 5116 | if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy)) |
| 5117 | if (VTy->getElementCount() == ElementCount::getScalable(1)) |
| 5118 | return InstructionCost::getInvalid(); |
| 5119 | |
David Sherwood | 0aff179 | 2021-07-07 13:18:20 +0100 | [diff] [blame] | 5120 | if (TTI::requiresOrderedReduction(FMF)) { |
David Sherwood | 219d451 | 2021-08-18 09:40:21 +0100 | [diff] [blame] | 5121 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { |
| 5122 | InstructionCost BaseCost = |
| 5123 | BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); |
| 5124 | // Add on extra cost to reflect the extra overhead on some CPUs. We still |
| 5125 | // end up vectorizing for more computationally intensive loops. |
| 5126 | return BaseCost + FixedVTy->getNumElements(); |
| 5127 | } |
David Sherwood | 0aff179 | 2021-07-07 13:18:20 +0100 | [diff] [blame] | 5128 | |
| 5129 | if (Opcode != Instruction::FAdd) |
| 5130 | return InstructionCost::getInvalid(); |
| 5131 | |
| 5132 | auto *VTy = cast<ScalableVectorType>(ValTy); |
| 5133 | InstructionCost Cost = |
| 5134 | getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); |
| 5135 | Cost *= getMaxNumElements(VTy->getElementCount()); |
| 5136 | return Cost; |
| 5137 | } |
| 5138 | |
Caroline Concatto | 172f1f8 | 2020-12-21 15:04:29 +0000 | [diff] [blame] | 5139 | if (isa<ScalableVectorType>(ValTy)) |
David Green | 38c9a40 | 2021-07-09 11:51:16 +0100 | [diff] [blame] | 5140 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); |
Matthew Simpson | eacfefd | 2018-03-16 11:34:15 +0000 | [diff] [blame] | 5141 | |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5142 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); |
Matthew Simpson | eacfefd | 2018-03-16 11:34:15 +0000 | [diff] [blame] | 5143 | MVT MTy = LT.second; |
| 5144 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 5145 | assert(ISD && "Invalid opcode"); |
| 5146 | |
| 5147 | // Horizontal adds can use the 'addv' instruction. We model the cost of these |
David Green | c9cebda | 2021-07-22 18:19:54 +0100 | [diff] [blame] | 5148 | // instructions as twice a normal vector add, plus 1 for each legalization |
| 5149 | // step (LT.first). This is the only arithmetic vector reduction operation for |
| 5150 | // which we have an instruction. |
Rosie Sumpter | 0c4651f | 2021-06-15 10:29:27 +0100 | [diff] [blame] | 5151 | // OR, XOR and AND costs should match the codegen from: |
| 5152 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll |
| 5153 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll |
| 5154 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll |
Matthew Simpson | eacfefd | 2018-03-16 11:34:15 +0000 | [diff] [blame] | 5155 | static const CostTblEntry CostTblNoPairwise[]{ |
David Green | c9cebda | 2021-07-22 18:19:54 +0100 | [diff] [blame] | 5156 | {ISD::ADD, MVT::v8i8, 2}, |
| 5157 | {ISD::ADD, MVT::v16i8, 2}, |
| 5158 | {ISD::ADD, MVT::v4i16, 2}, |
| 5159 | {ISD::ADD, MVT::v8i16, 2}, |
| 5160 | {ISD::ADD, MVT::v4i32, 2}, |
Vasileios Porpodas | f669030 | 2022-07-28 17:01:15 -0700 | [diff] [blame] | 5161 | {ISD::ADD, MVT::v2i64, 2}, |
Rosie Sumpter | 0c4651f | 2021-06-15 10:29:27 +0100 | [diff] [blame] | 5162 | {ISD::OR, MVT::v8i8, 15}, |
| 5163 | {ISD::OR, MVT::v16i8, 17}, |
| 5164 | {ISD::OR, MVT::v4i16, 7}, |
| 5165 | {ISD::OR, MVT::v8i16, 9}, |
| 5166 | {ISD::OR, MVT::v2i32, 3}, |
| 5167 | {ISD::OR, MVT::v4i32, 5}, |
| 5168 | {ISD::OR, MVT::v2i64, 3}, |
| 5169 | {ISD::XOR, MVT::v8i8, 15}, |
| 5170 | {ISD::XOR, MVT::v16i8, 17}, |
| 5171 | {ISD::XOR, MVT::v4i16, 7}, |
| 5172 | {ISD::XOR, MVT::v8i16, 9}, |
| 5173 | {ISD::XOR, MVT::v2i32, 3}, |
| 5174 | {ISD::XOR, MVT::v4i32, 5}, |
| 5175 | {ISD::XOR, MVT::v2i64, 3}, |
| 5176 | {ISD::AND, MVT::v8i8, 15}, |
| 5177 | {ISD::AND, MVT::v16i8, 17}, |
| 5178 | {ISD::AND, MVT::v4i16, 7}, |
| 5179 | {ISD::AND, MVT::v8i16, 9}, |
| 5180 | {ISD::AND, MVT::v2i32, 3}, |
| 5181 | {ISD::AND, MVT::v4i32, 5}, |
| 5182 | {ISD::AND, MVT::v2i64, 3}, |
Matthew Simpson | eacfefd | 2018-03-16 11:34:15 +0000 | [diff] [blame] | 5183 | }; |
Rosie Sumpter | 0c4651f | 2021-06-15 10:29:27 +0100 | [diff] [blame] | 5184 | switch (ISD) { |
| 5185 | default: |
| 5186 | break; |
Sushant Gokhale | c5672e2 | 2024-09-24 14:35:01 +0530 | [diff] [blame] | 5187 | case ISD::FADD: |
| 5188 | if (Type *EltTy = ValTy->getScalarType(); |
| 5189 | // FIXME: For half types without fullfp16 support, this could extend and |
| 5190 | // use an fp32 faddp reduction, but current codegen unrolls.
| 5191 | MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() || |
| 5192 | (EltTy->isHalfTy() && ST->hasFullFP16()))) { |
| 5193 | const unsigned NElts = MTy.getVectorNumElements(); |
| 5194 | if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 && |
| 5195 | isPowerOf2_32(NElts)) |
| 5196 | // Reduction corresponding to series of fadd instructions is lowered to |
| 5197 | // series of faddp instructions. faddp has latency/throughput that |
| 5198 | // matches fadd instruction and hence, every faddp instruction can be |
| 5199 | // considered to have a relative cost = 1 with |
| 5200 | // CostKind = TCK_RecipThroughput. |
| 5201 | // An faddp will pairwise add vector elements, so the size of input |
| 5202 | // vector reduces by half every time, requiring |
| 5203 | // #(faddp instructions) = log2_32(NElts). |
| 5204 | return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts); |
| 5205 | } |
| 5206 | break; |
Rosie Sumpter | 0c4651f | 2021-06-15 10:29:27 +0100 | [diff] [blame] | 5207 | case ISD::ADD: |
| 5208 | if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) |
David Green | c9cebda | 2021-07-22 18:19:54 +0100 | [diff] [blame] | 5209 | return (LT.first - 1) + Entry->Cost; |
Rosie Sumpter | 0c4651f | 2021-06-15 10:29:27 +0100 | [diff] [blame] | 5210 | break; |
| 5211 | case ISD::XOR: |
| 5212 | case ISD::AND: |
| 5213 | case ISD::OR: |
| 5214 | const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); |
| 5215 | if (!Entry) |
| 5216 | break; |
| 5217 | auto *ValVTy = cast<FixedVectorType>(ValTy); |
David Green | e79fac2 | 2023-06-01 09:28:48 +0100 | [diff] [blame] | 5218 | if (MTy.getVectorNumElements() <= ValVTy->getNumElements() && |
Rosie Sumpter | 0c4651f | 2021-06-15 10:29:27 +0100 | [diff] [blame] | 5219 | isPowerOf2_32(ValVTy->getNumElements())) { |
| 5220 | InstructionCost ExtraCost = 0; |
| 5221 | if (LT.first != 1) { |
| 5222 | // Type needs to be split, so there is an extra cost of LT.first - 1 |
| 5223 | // arithmetic ops. |
| 5224 | auto *Ty = FixedVectorType::get(ValTy->getElementType(), |
| 5225 | MTy.getVectorNumElements()); |
| 5226 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
| 5227 | ExtraCost *= LT.first - 1; |
| 5228 | } |
David Green | e79fac2 | 2023-06-01 09:28:48 +0100 | [diff] [blame] | 5229 | // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov |
| 5230 | auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost; |
| 5231 | return Cost + ExtraCost; |
Rosie Sumpter | 0c4651f | 2021-06-15 10:29:27 +0100 | [diff] [blame] | 5232 | } |
| 5233 | break; |
| 5234 | } |
David Sherwood | 0aff179 | 2021-07-07 13:18:20 +0100 | [diff] [blame] | 5235 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); |
Matthew Simpson | eacfefd | 2018-03-16 11:34:15 +0000 | [diff] [blame] | 5236 | } |
Matthew Simpson | b4096eb | 2018-04-26 13:48:33 +0000 | [diff] [blame] | 5237 | |
David Green | e5f4019 | 2025-02-15 20:33:03 +0000 | [diff] [blame] | 5238 | InstructionCost AArch64TTIImpl::getExtendedReductionCost( |
| 5239 | unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 5240 | std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const { |
David Green | e5f4019 | 2025-02-15 20:33:03 +0000 | [diff] [blame] | 5241 | EVT VecVT = TLI->getValueType(DL, VecTy); |
| 5242 | EVT ResVT = TLI->getValueType(DL, ResTy); |
| 5243 | |
| 5244 | if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() && |
| 5245 | VecVT.getSizeInBits() >= 64) { |
| 5246 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy); |
| 5247 | |
| 5248 | // The legal cases are: |
| 5249 | // UADDLV 8/16/32->32 |
| 5250 | // UADDLP 32->64 |
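// The cost below charges 2 for the final uaddlv/uaddlp sequence plus 2
// for each additional legalization step.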
| 5251 | unsigned RevVTSize = ResVT.getSizeInBits(); |
| 5252 | if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) && |
| 5253 | RevVTSize <= 32) || |
| 5254 | ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) && |
| 5255 | RevVTSize <= 32) || |
| 5256 | ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) && |
| 5257 | RevVTSize <= 64)) |
| 5258 | return (LT.first - 1) * 2 + 2; |
| 5259 | } |
| 5260 | |
| 5261 | return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF, |
| 5262 | CostKind); |
| 5263 | } |
| 5264 | |
| 5265 | InstructionCost |
| 5266 | AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, |
| 5267 | VectorType *VecTy, |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 5268 | TTI::TargetCostKind CostKind) const { |
David Green | e5f4019 | 2025-02-15 20:33:03 +0000 | [diff] [blame] | 5269 | EVT VecVT = TLI->getValueType(DL, VecTy); |
| 5270 | EVT ResVT = TLI->getValueType(DL, ResTy); |
| 5271 | |
| 5272 | if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) { |
| 5273 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy); |
| 5274 | |
| 5275 | // The legal case with dotprod is
| 5276 | // UDOT 8->32
| 5277 | // which requires an additional uaddv to sum the i32 values.
| 5278 | if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) && |
| 5279 | ResVT == MVT::i32) |
| 5280 | return LT.first + 2; |
| 5281 | } |
| 5282 | |
| 5283 | return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind); |
| 5284 | } |
| 5285 | |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 5286 | InstructionCost |
| 5287 | AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index, |
| 5288 | TTI::TargetCostKind CostKind) const { |
Caroline Concatto | a2c5c56 | 2021-06-18 15:39:03 +0100 | [diff] [blame] | 5289 | static const CostTblEntry ShuffleTbl[] = { |
| 5290 | { TTI::SK_Splice, MVT::nxv16i8, 1 }, |
| 5291 | { TTI::SK_Splice, MVT::nxv8i16, 1 }, |
| 5292 | { TTI::SK_Splice, MVT::nxv4i32, 1 }, |
| 5293 | { TTI::SK_Splice, MVT::nxv2i64, 1 }, |
| 5294 | { TTI::SK_Splice, MVT::nxv2f16, 1 }, |
| 5295 | { TTI::SK_Splice, MVT::nxv4f16, 1 }, |
| 5296 | { TTI::SK_Splice, MVT::nxv8f16, 1 }, |
| 5297 | { TTI::SK_Splice, MVT::nxv2bf16, 1 }, |
| 5298 | { TTI::SK_Splice, MVT::nxv4bf16, 1 }, |
| 5299 | { TTI::SK_Splice, MVT::nxv8bf16, 1 }, |
| 5300 | { TTI::SK_Splice, MVT::nxv2f32, 1 }, |
| 5301 | { TTI::SK_Splice, MVT::nxv4f32, 1 }, |
| 5302 | { TTI::SK_Splice, MVT::nxv2f64, 1 }, |
| 5303 | }; |
| 5304 | |
Paul Walker | 3bb2287 | 2022-08-26 14:32:46 +0100 | [diff] [blame] | 5305 | // The code-generator is currently not able to handle scalable vectors |
| 5306 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 5307 | // it. This change will be removed when code-generation for these types is |
| 5308 | // sufficiently reliable. |
| 5309 | if (Tp->getElementCount() == ElementCount::getScalable(1)) |
| 5310 | return InstructionCost::getInvalid(); |
| 5311 | |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5312 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); |
Caroline Concatto | a2c5c56 | 2021-06-18 15:39:03 +0100 | [diff] [blame] | 5313 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); |
Caroline Concatto | a2c5c56 | 2021-06-18 15:39:03 +0100 | [diff] [blame] | 5314 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 |
| 5315 | ? TLI->getPromotedVTForPredicate(EVT(LT.second)) |
| 5316 | : LT.second; |
| 5317 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); |
| 5318 | InstructionCost LegalizationCost = 0; |
| 5319 | if (Index < 0) { |
| 5320 | LegalizationCost = |
| 5321 | getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, |
| 5322 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
| 5323 | getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, |
| 5324 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
| 5325 | } |
| 5326 | |
| 5327 | // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp
| 5328 | // Cost performed on a promoted type. |
| 5329 | if (LT.second.getScalarType() == MVT::i1) { |
| 5330 | LegalizationCost += |
| 5331 | getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, |
| 5332 | TTI::CastContextHint::None, CostKind) + |
| 5333 | getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, |
| 5334 | TTI::CastContextHint::None, CostKind); |
| 5335 | } |
| 5336 | const auto *Entry = |
| 5337 | CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); |
| 5338 | assert(Entry && "Illegal Type for Splice"); |
| 5339 | LegalizationCost += Entry->Cost; |
| 5340 | return LegalizationCost * LT.first; |
| 5341 | } |
| 5342 | |
David Sherwood | a733c1f | 2025-01-20 14:07:03 +0000 | [diff] [blame] | 5343 | InstructionCost AArch64TTIImpl::getPartialReductionCost( |
| 5344 | unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, |
| 5345 | ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, |
| 5346 | TTI::PartialReductionExtendKind OpBExtend, |
| 5347 | std::optional<unsigned> BinOp) const { |
| 5348 | InstructionCost Invalid = InstructionCost::getInvalid(); |
| 5349 | InstructionCost Cost(TTI::TCC_Basic); |
| 5350 | |
Nicholas Guy | 9c89faa6 | 2025-02-13 10:35:45 +0000 | [diff] [blame] | 5351 | // Sub opcodes currently only occur in chained cases. |
| 5352 | // Independent partial reduction subtractions are still costed as an add.
| 5353 | if (Opcode != Instruction::Add && Opcode != Instruction::Sub) |
David Sherwood | a733c1f | 2025-01-20 14:07:03 +0000 | [diff] [blame] | 5354 | return Invalid; |
| 5355 | |
| 5356 | if (InputTypeA != InputTypeB) |
| 5357 | return Invalid; |
| 5358 | |
| 5359 | EVT InputEVT = EVT::getEVT(InputTypeA); |
| 5360 | EVT AccumEVT = EVT::getEVT(AccumType); |
| 5361 | |
Sam Tebbs | c7995a6 | 2025-02-05 13:34:43 +0000 | [diff] [blame] | 5362 | unsigned VFMinValue = VF.getKnownMinValue(); |
| 5363 | |
| 5364 | if (VF.isScalable()) { |
| 5365 | if (!ST->isSVEorStreamingSVEAvailable()) |
| 5366 | return Invalid; |
| 5367 | |
| 5368 | // Don't accept a partial reduction if the scaled accumulator is vscale x 1, |
| 5369 | // since we can't lower that type. |
| 5370 | unsigned Scale = |
| 5371 | AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits(); |
| 5372 | if (VFMinValue == Scale) |
| 5373 | return Invalid; |
| 5374 | } |
David Sherwood | efc7234 | 2025-02-11 15:10:39 +0000 | [diff] [blame] | 5375 | if (VF.isFixed() && |
| 5376 | (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64)) |
David Sherwood | a733c1f | 2025-01-20 14:07:03 +0000 | [diff] [blame] | 5377 | return Invalid; |
| 5378 | |
| 5379 | if (InputEVT == MVT::i8) { |
Sam Tebbs | c7995a6 | 2025-02-05 13:34:43 +0000 | [diff] [blame] | 5380 | switch (VFMinValue) { |
David Sherwood | a733c1f | 2025-01-20 14:07:03 +0000 | [diff] [blame] | 5381 | default: |
| 5382 | return Invalid; |
| 5383 | case 8: |
| 5384 | if (AccumEVT == MVT::i32) |
| 5385 | Cost *= 2; |
| 5386 | else if (AccumEVT != MVT::i64) |
| 5387 | return Invalid; |
| 5388 | break; |
| 5389 | case 16: |
| 5390 | if (AccumEVT == MVT::i64) |
| 5391 | Cost *= 2; |
| 5392 | else if (AccumEVT != MVT::i32) |
| 5393 | return Invalid; |
| 5394 | break; |
| 5395 | } |
| 5396 | } else if (InputEVT == MVT::i16) { |
| 5397 | // FIXME: Allow i32 accumulator but increase cost, as we would extend |
| 5398 | // it to i64. |
Sam Tebbs | c7995a6 | 2025-02-05 13:34:43 +0000 | [diff] [blame] | 5399 | if (VFMinValue != 8 || AccumEVT != MVT::i64) |
David Sherwood | a733c1f | 2025-01-20 14:07:03 +0000 | [diff] [blame] | 5400 | return Invalid; |
| 5401 | } else |
| 5402 | return Invalid; |
| 5403 | |
Sam Tebbs | 2876dbc | 2025-05-01 16:06:37 +0100 | [diff] [blame] | 5404 | // AArch64 supports lowering mixed (one signed, one unsigned) fixed-width |
| 5405 | // extensions to a usdot, but only if the i8mm feature is available. |
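| | // For example (illustrative): partial.reduce.add(acc, mul(zext(a), sext(b))) |
| | // can lower to USDOT with +i8mm; without it the mixed-extend form is rejected. |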
David Sherwood | a733c1f | 2025-01-20 14:07:03 +0000 | [diff] [blame] | 5406 | if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None || |
Sam Tebbs | 2876dbc | 2025-05-01 16:06:37 +0100 | [diff] [blame] | 5407 | (OpAExtend != OpBExtend && !ST->hasMatMulInt8())) |
David Sherwood | a733c1f | 2025-01-20 14:07:03 +0000 | [diff] [blame] | 5408 | return Invalid; |
| 5409 | |
| 5410 | if (!BinOp || *BinOp != Instruction::Mul) |
| 5411 | return Invalid; |
| 5412 | |
| 5413 | return Cost; |
| 5414 | } |
| 5415 | |
David Green | 4ac2721 | 2024-04-09 16:36:08 +0100 | [diff] [blame] | 5416 | InstructionCost AArch64TTIImpl::getShuffleCost( |
| 5417 | TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, |
| 5418 | TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, |
Sergei Barannikov | e0c1e23 | 2025-04-21 21:42:40 +0300 | [diff] [blame] | 5419 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5420 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); |
David Green | 4ac2721 | 2024-04-09 16:36:08 +0100 | [diff] [blame] | 5421 | |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5422 | // If we have a Mask, and the LT is being legalized somehow, split the Mask |
| 5423 | // into smaller vectors and sum the cost of each shuffle. |
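| | // For example (illustrative): a shuffle of <8 x i32> legalizes to two v4i32 |
| | // halves on NEON, so its 8-element mask is costed as two 4-element sub-shuffles. |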
David Green | 46cef9a | 2022-04-27 15:36:15 +0100 | [diff] [blame] | 5424 | if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5425 | Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && |
Alexey Bataev | 263a00f | 2023-10-02 06:44:01 -0700 | [diff] [blame] | 5426 | Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { |
David Green | 4ac2721 | 2024-04-09 16:36:08 +0100 | [diff] [blame] | 5427 | |
David Green | 18bb175 | 2024-04-21 13:53:22 +0100 | [diff] [blame] | 5428 | // Check for LD3/LD4 instructions, which are represented in llvm IR as |
| 5429 | // deinterleaving-shuffle(load). The shuffle cost could potentially be free, |
| 5430 | // but we model it with a cost of LT.first so that LD3/LD4 have a higher |
| 5431 | // cost than just the load. |
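| | // For example (illustrative), the first stream of a factor-4 deinterleave of |
| | // a wide load: |
| | //   %wide = load <32 x i32>, ptr %p |
| | //   %v0 = shufflevector <32 x i32> %wide, <32 x i32> poison, |
| | //     <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> |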
| 5432 | if (Args.size() >= 1 && isa<LoadInst>(Args[0]) && |
| 5433 | (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) || |
| 5434 | ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))) |
| 5435 | return std::max<InstructionCost>(1, LT.first / 4); |
| 5436 | |
David Green | 4ac2721 | 2024-04-09 16:36:08 +0100 | [diff] [blame] | 5437 | // Check for ST3/ST4 instructions, which are represented in llvm IR as |
| 5438 | // store(interleaving-shuffle). The shuffle cost could potentially be free, |
David Green | 18bb175 | 2024-04-21 13:53:22 +0100 | [diff] [blame] | 5439 | // but we model it with a cost of LT.first so that ST3/ST4 have a higher |
David Green | 4ac2721 | 2024-04-09 16:36:08 +0100 | [diff] [blame] | 5440 | // cost than just the store. |
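| | // An illustrative factor-3 interleave feeding a single ST3, where the two |
| | // shuffle operands hold the concatenated sources: |
| | //   shufflevector <8 x i32> %ab, <8 x i32> %cx, |
| | //     <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> |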
| 5441 | if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) && |
| 5442 | (ShuffleVectorInst::isInterleaveMask( |
| 5443 | Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) || |
| 5444 | ShuffleVectorInst::isInterleaveMask( |
| 5445 | Mask, 3, Tp->getElementCount().getKnownMinValue() * 2))) |
| 5446 | return LT.first; |
| 5447 | |
Alexey Bataev | 263a00f | 2023-10-02 06:44:01 -0700 | [diff] [blame] | 5448 | unsigned TpNumElts = Mask.size(); |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5449 | unsigned LTNumElts = LT.second.getVectorNumElements(); |
| 5450 | unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; |
| 5451 | VectorType *NTp = |
| 5452 | VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); |
| 5453 | InstructionCost Cost; |
| 5454 | for (unsigned N = 0; N < NumVecs; N++) { |
| 5455 | SmallVector<int> NMask; |
| 5456 | // Split the existing mask into chunks of size LTNumElts. Track the source |
| 5457 | // sub-vectors to ensure the result has at most 2 inputs. |
| 5458 | unsigned Source1, Source2; |
| 5459 | unsigned NumSources = 0; |
| 5460 | for (unsigned E = 0; E < LTNumElts; E++) { |
| 5461 | int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] |
ManuelJBrito | d22edb9 | 2023-04-27 16:22:57 +0100 | [diff] [blame] | 5462 | : PoisonMaskElem; |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5463 | if (MaskElt < 0) { |
ManuelJBrito | d22edb9 | 2023-04-27 16:22:57 +0100 | [diff] [blame] | 5464 | NMask.push_back(PoisonMaskElem); |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5465 | continue; |
| 5466 | } |
| 5467 | |
| 5468 | // Calculate which source from the input this comes from and whether it |
| 5469 | // is new to us. |
| 5470 | unsigned Source = MaskElt / LTNumElts; |
| 5471 | if (NumSources == 0) { |
| 5472 | Source1 = Source; |
| 5473 | NumSources = 1; |
| 5474 | } else if (NumSources == 1 && Source != Source1) { |
| 5475 | Source2 = Source; |
| 5476 | NumSources = 2; |
| 5477 | } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { |
| 5478 | NumSources++; |
| 5479 | } |
| 5480 | |
| 5481 | // Add to the new mask. For the NumSources>2 case these are not correct, |
| 5482 | // but are only used for the modular lane number. |
| 5483 | if (Source == Source1) |
| 5484 | NMask.push_back(MaskElt % LTNumElts); |
| 5485 | else if (Source == Source2) |
| 5486 | NMask.push_back(MaskElt % LTNumElts + LTNumElts); |
| 5487 | else |
| 5488 | NMask.push_back(MaskElt % LTNumElts); |
| 5489 | } |
| 5490 | // If the sub-mask has at most 2 input sub-vectors then re-cost it using |
David Spickett | d1f3a92 | 2024-07-29 11:24:39 +0000 | [diff] [blame] | 5491 | // getShuffleCost. If not, cost it using the worst case: the number of |
| 5492 | // elements moved into a new vector. |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5493 | if (NumSources <= 2) |
| 5494 | Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc |
| 5495 | : TTI::SK_PermuteTwoSrc, |
David Green | 4ac2721 | 2024-04-09 16:36:08 +0100 | [diff] [blame] | 5496 | NTp, NMask, CostKind, 0, nullptr, Args, CxtI); |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5497 | else |
| 5498 | Cost += LTNumElts; |
| 5499 | } |
| 5500 | return Cost; |
| 5501 | } |
| 5502 | |
Alexey Bataev | 9a20757 | 2023-08-08 09:57:50 -0700 | [diff] [blame] | 5503 | Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); |
Alexey Bataev | 7bc079c | 2024-02-12 07:09:49 -0500 | [diff] [blame] | 5504 | bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; |
David Green | c05fc9b | 2025-01-09 12:10:43 +0000 | [diff] [blame] | 5505 | // A subvector extract can be implemented with an ext (or trivial extract, if |
David Green | a8dab1a | 2025-01-08 08:13:07 +0000 | [diff] [blame] | 5506 | // from lane 0). This currently only handles low or high extracts to prevent |
| 5507 | // SLP vectorizer regressions. |
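| | // For example (illustrative): extracting the high <2 x i32> half of a legal |
| | // <4 x i32> costs a single ext/mov, while extracting from lane 0 is free. |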
| 5508 | if (IsExtractSubvector && LT.second.isFixedLengthVector()) { |
| 5509 | if (LT.second.is128BitVector() && |
| 5510 | cast<FixedVectorType>(SubTp)->getNumElements() == |
| 5511 | LT.second.getVectorNumElements() / 2) { |
| 5512 | if (Index == 0) |
| 5513 | return 0; |
David Green | 32bc029 | 2025-01-08 08:59:15 +0000 | [diff] [blame] | 5514 | if (Index == (int)LT.second.getVectorNumElements() / 2) |
David Green | a8dab1a | 2025-01-08 08:13:07 +0000 | [diff] [blame] | 5515 | return 1; |
| 5516 | } |
Alexey Bataev | 7bc079c | 2024-02-12 07:09:49 -0500 | [diff] [blame] | 5517 | Kind = TTI::SK_PermuteSingleSrc; |
David Green | a8dab1a | 2025-01-08 08:13:07 +0000 | [diff] [blame] | 5518 | } |
David Green | d632705 | 2022-04-27 12:09:01 +0100 | [diff] [blame] | 5519 | |
Sjoerd Meijer | 775451b | 2023-03-13 13:05:34 +0000 | [diff] [blame] | 5520 | // Check for broadcast loads, which are supported by the LD1R instruction. |
| 5521 | // In terms of code-size, the shuffle vector is free when a load + dup get |
| 5522 | // folded into a LD1R. That's what we check and return here. For performance |
| 5523 | // and reciprocal throughput, a LD1R is not completely free. In this case, we |
| 5524 | // return the cost for the broadcast below (i.e. 1 for most/all types), so |
| 5525 | // that we model the load + dup sequence slightly higher because LD1R is a |
| 5526 | // high latency instruction. |
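| | // For example (illustrative): a load of an i32 splatted to <4 x i32> can fold |
| | // into "ld1r { v0.4s }, [x0]", which is free for code size. |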
| 5527 | if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { |
David Green | d632705 | 2022-04-27 12:09:01 +0100 | [diff] [blame] | 5528 | bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); |
| 5529 | if (IsLoad && LT.second.isVector() && |
| 5530 | isLegalBroadcastLoad(Tp->getElementType(), |
David Green | 8e2a0e6 | 2022-04-27 13:51:50 +0100 | [diff] [blame] | 5531 | LT.second.getVectorElementCount())) |
Sjoerd Meijer | 775451b | 2023-03-13 13:05:34 +0000 | [diff] [blame] | 5532 | return 0; |
David Green | d632705 | 2022-04-27 12:09:01 +0100 | [diff] [blame] | 5533 | } |
| 5534 | |
| 5535 | // If we have 4 elements for the shuffle and a Mask, get the cost straight |
| 5536 | // from the perfect shuffle tables. |
| 5537 | if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && |
| 5538 | (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && |
| 5539 | all_of(Mask, [](int E) { return E < 8; })) |
| 5540 | return getPerfectShuffleCost(Mask); |
| 5541 | |
David Green | f0e79d9 | 2024-04-09 17:16:14 +0100 | [diff] [blame] | 5542 | // Check for identity masks, which we can treat as free. |
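| | // For example (illustrative): <0, 1, 2, 3> or <0, undef, 2, 3> on a legal |
| | // <4 x i32> needs no instructions. |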
| 5543 | if (!Mask.empty() && LT.second.isFixedLengthVector() && |
| 5544 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
| 5545 | all_of(enumerate(Mask), [](const auto &M) { |
| 5546 | return M.value() < 0 || M.value() == (int)M.index(); |
| 5547 | })) |
| 5548 | return 0; |
| 5549 | |
David Green | a536743 | 2024-04-11 08:45:28 +0100 | [diff] [blame] | 5550 | // Check for other shuffles that are not one of the SK_ kinds but for which we |
| 5551 | // have native instructions, for example ZIP and UZP. |
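| | // For example (illustrative) on v4i32: <0, 4, 1, 5> is a zip1, <0, 2, 4, 6> is |
| | // a uzp1, and <1, 0, 3, 2> is a rev64, each a single instruction. |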
| 5552 | unsigned Unused; |
| 5553 | if (LT.second.isFixedLengthVector() && |
| 5554 | LT.second.getVectorNumElements() == Mask.size() && |
| 5555 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
David Green | 363ec6f | 2024-05-06 18:37:04 +0100 | [diff] [blame] | 5556 | (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) || |
| 5557 | isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) || |
David Green | 48397fe | 2025-02-25 10:32:45 +0000 | [diff] [blame] | 5558 | isREVMask(Mask, LT.second.getScalarSizeInBits(), |
| 5559 | LT.second.getVectorNumElements(), 16) || |
| 5560 | isREVMask(Mask, LT.second.getScalarSizeInBits(), |
| 5561 | LT.second.getVectorNumElements(), 32) || |
| 5562 | isREVMask(Mask, LT.second.getScalarSizeInBits(), |
| 5563 | LT.second.getVectorNumElements(), 64) || |
David Green | 6c2cc82 | 2024-04-14 12:09:14 +0100 | [diff] [blame] | 5564 | // Check for non-zero lane splats |
| 5565 | all_of(drop_begin(Mask), |
| 5566 | [&Mask](int M) { return M < 0 || M == Mask[0]; }))) |
David Green | a536743 | 2024-04-11 08:45:28 +0100 | [diff] [blame] | 5567 | return 1; |
| 5568 | |
Simon Pilgrim | 071e822 | 2018-10-25 10:52:36 +0000 | [diff] [blame] | 5569 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || |
Caroline Concatto | b52e6c5 | 2021-01-27 15:59:27 +0000 | [diff] [blame] | 5570 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || |
David Green | 0cf9e47 | 2022-08-22 12:44:57 +0100 | [diff] [blame] | 5571 | Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { |
Simon Pilgrim | 9c8f937 | 2018-06-22 09:45:31 +0000 | [diff] [blame] | 5572 | static const CostTblEntry ShuffleTbl[] = { |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5573 | // Broadcast shuffle kinds can be performed with 'dup'. |
| 5574 | {TTI::SK_Broadcast, MVT::v8i8, 1}, |
| 5575 | {TTI::SK_Broadcast, MVT::v16i8, 1}, |
| 5576 | {TTI::SK_Broadcast, MVT::v4i16, 1}, |
| 5577 | {TTI::SK_Broadcast, MVT::v8i16, 1}, |
| 5578 | {TTI::SK_Broadcast, MVT::v2i32, 1}, |
| 5579 | {TTI::SK_Broadcast, MVT::v4i32, 1}, |
| 5580 | {TTI::SK_Broadcast, MVT::v2i64, 1}, |
David Green | 180865a | 2023-03-14 21:25:18 +0000 | [diff] [blame] | 5581 | {TTI::SK_Broadcast, MVT::v4f16, 1}, |
| 5582 | {TTI::SK_Broadcast, MVT::v8f16, 1}, |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5583 | {TTI::SK_Broadcast, MVT::v2f32, 1}, |
| 5584 | {TTI::SK_Broadcast, MVT::v4f32, 1}, |
| 5585 | {TTI::SK_Broadcast, MVT::v2f64, 1}, |
| 5586 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and |
| 5587 | // 'zip1/zip2' instructions. |
| 5588 | {TTI::SK_Transpose, MVT::v8i8, 1}, |
| 5589 | {TTI::SK_Transpose, MVT::v16i8, 1}, |
| 5590 | {TTI::SK_Transpose, MVT::v4i16, 1}, |
| 5591 | {TTI::SK_Transpose, MVT::v8i16, 1}, |
| 5592 | {TTI::SK_Transpose, MVT::v2i32, 1}, |
| 5593 | {TTI::SK_Transpose, MVT::v4i32, 1}, |
| 5594 | {TTI::SK_Transpose, MVT::v2i64, 1}, |
David Green | 180865a | 2023-03-14 21:25:18 +0000 | [diff] [blame] | 5595 | {TTI::SK_Transpose, MVT::v4f16, 1}, |
| 5596 | {TTI::SK_Transpose, MVT::v8f16, 1}, |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5597 | {TTI::SK_Transpose, MVT::v2f32, 1}, |
| 5598 | {TTI::SK_Transpose, MVT::v4f32, 1}, |
| 5599 | {TTI::SK_Transpose, MVT::v2f64, 1}, |
| 5600 | // Select shuffle kinds. |
| 5601 | // TODO: handle vXi8/vXi16. |
| 5602 | {TTI::SK_Select, MVT::v2i32, 1}, // mov. |
| 5603 | {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar). |
| 5604 | {TTI::SK_Select, MVT::v2i64, 1}, // mov. |
| 5605 | {TTI::SK_Select, MVT::v2f32, 1}, // mov. |
| 5606 | {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar). |
| 5607 | {TTI::SK_Select, MVT::v2f64, 1}, // mov. |
| 5608 | // PermuteSingleSrc shuffle kinds. |
| 5609 | {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov. |
| 5610 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case. |
| 5611 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov. |
| 5612 | {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov. |
| 5613 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case. |
| 5614 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov. |
| 5615 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case. |
| 5616 | {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case. |
David Green | 3875c38 | 2022-09-08 19:54:12 +0100 | [diff] [blame] | 5617 | {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same |
| 5618 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl |
| 5619 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5620 | {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl |
| 5621 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl |
| 5622 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl |
| 5623 | // Reverse can be lowered with `rev`. |
| 5624 | {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64 |
| 5625 | {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT |
| 5626 | {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT |
| 5627 | {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64 |
| 5628 | {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT |
| 5629 | {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT |
| 5630 | {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT |
David Green | 5c6453d | 2025-02-24 08:37:15 +0000 | [diff] [blame] | 5631 | {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5632 | {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT |
| 5633 | {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT |
| 5634 | {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64 |
David Green | 5c6453d | 2025-02-24 08:37:15 +0000 | [diff] [blame] | 5635 | {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64 |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5636 | {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64 |
| 5637 | {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64 |
| 5638 | // Splices can all be lowered as `ext`. |
| 5639 | {TTI::SK_Splice, MVT::v2i32, 1}, |
| 5640 | {TTI::SK_Splice, MVT::v4i32, 1}, |
| 5641 | {TTI::SK_Splice, MVT::v2i64, 1}, |
| 5642 | {TTI::SK_Splice, MVT::v2f32, 1}, |
| 5643 | {TTI::SK_Splice, MVT::v4f32, 1}, |
| 5644 | {TTI::SK_Splice, MVT::v2f64, 1}, |
| 5645 | {TTI::SK_Splice, MVT::v8f16, 1}, |
| 5646 | {TTI::SK_Splice, MVT::v8bf16, 1}, |
| 5647 | {TTI::SK_Splice, MVT::v8i16, 1}, |
| 5648 | {TTI::SK_Splice, MVT::v16i8, 1}, |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5649 | {TTI::SK_Splice, MVT::v4f16, 1}, |
David Green | 5c6453d | 2025-02-24 08:37:15 +0000 | [diff] [blame] | 5650 | {TTI::SK_Splice, MVT::v4bf16, 1}, |
liqinweng | 723245b | 2022-09-08 18:33:29 +0800 | [diff] [blame] | 5651 | {TTI::SK_Splice, MVT::v4i16, 1}, |
| 5652 | {TTI::SK_Splice, MVT::v8i8, 1}, |
| 5653 | // Broadcast shuffle kinds for scalable vectors |
| 5654 | {TTI::SK_Broadcast, MVT::nxv16i8, 1}, |
| 5655 | {TTI::SK_Broadcast, MVT::nxv8i16, 1}, |
| 5656 | {TTI::SK_Broadcast, MVT::nxv4i32, 1}, |
| 5657 | {TTI::SK_Broadcast, MVT::nxv2i64, 1}, |
| 5658 | {TTI::SK_Broadcast, MVT::nxv2f16, 1}, |
| 5659 | {TTI::SK_Broadcast, MVT::nxv4f16, 1}, |
| 5660 | {TTI::SK_Broadcast, MVT::nxv8f16, 1}, |
| 5661 | {TTI::SK_Broadcast, MVT::nxv2bf16, 1}, |
| 5662 | {TTI::SK_Broadcast, MVT::nxv4bf16, 1}, |
| 5663 | {TTI::SK_Broadcast, MVT::nxv8bf16, 1}, |
| 5664 | {TTI::SK_Broadcast, MVT::nxv2f32, 1}, |
| 5665 | {TTI::SK_Broadcast, MVT::nxv4f32, 1}, |
| 5666 | {TTI::SK_Broadcast, MVT::nxv2f64, 1}, |
| 5667 | {TTI::SK_Broadcast, MVT::nxv16i1, 1}, |
| 5668 | {TTI::SK_Broadcast, MVT::nxv8i1, 1}, |
| 5669 | {TTI::SK_Broadcast, MVT::nxv4i1, 1}, |
| 5670 | {TTI::SK_Broadcast, MVT::nxv2i1, 1}, |
| 5671 | // Handle the cases for vector.reverse with scalable vectors |
| 5672 | {TTI::SK_Reverse, MVT::nxv16i8, 1}, |
| 5673 | {TTI::SK_Reverse, MVT::nxv8i16, 1}, |
| 5674 | {TTI::SK_Reverse, MVT::nxv4i32, 1}, |
| 5675 | {TTI::SK_Reverse, MVT::nxv2i64, 1}, |
| 5676 | {TTI::SK_Reverse, MVT::nxv2f16, 1}, |
| 5677 | {TTI::SK_Reverse, MVT::nxv4f16, 1}, |
| 5678 | {TTI::SK_Reverse, MVT::nxv8f16, 1}, |
| 5679 | {TTI::SK_Reverse, MVT::nxv2bf16, 1}, |
| 5680 | {TTI::SK_Reverse, MVT::nxv4bf16, 1}, |
| 5681 | {TTI::SK_Reverse, MVT::nxv8bf16, 1}, |
| 5682 | {TTI::SK_Reverse, MVT::nxv2f32, 1}, |
| 5683 | {TTI::SK_Reverse, MVT::nxv4f32, 1}, |
| 5684 | {TTI::SK_Reverse, MVT::nxv2f64, 1}, |
| 5685 | {TTI::SK_Reverse, MVT::nxv16i1, 1}, |
| 5686 | {TTI::SK_Reverse, MVT::nxv8i1, 1}, |
| 5687 | {TTI::SK_Reverse, MVT::nxv4i1, 1}, |
| 5688 | {TTI::SK_Reverse, MVT::nxv2i1, 1}, |
Matthew Simpson | b4096eb | 2018-04-26 13:48:33 +0000 | [diff] [blame] | 5689 | }; |
Simon Pilgrim | 9c8f937 | 2018-06-22 09:45:31 +0000 | [diff] [blame] | 5690 | if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) |
Matthew Simpson | b4096eb | 2018-04-26 13:48:33 +0000 | [diff] [blame] | 5691 | return LT.first * Entry->Cost; |
| 5692 | } |
David Green | fa784f6 | 2022-04-07 19:27:41 +0100 | [diff] [blame] | 5693 | |
Caroline Concatto | a2c5c56 | 2021-06-18 15:39:03 +0100 | [diff] [blame] | 5694 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) |
David Green | 2ba455f | 2025-04-21 06:31:03 +0100 | [diff] [blame] | 5695 | return getSpliceCost(Tp, Index, CostKind); |
David Green | fa784f6 | 2022-04-07 19:27:41 +0100 | [diff] [blame] | 5696 | |
| 5697 | // Inserting a subvector can often be done with either a D, S or H register |
| 5698 | // move, so long as the inserted vector is "aligned". |
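| | // For example (illustrative): inserting a <2 x i32> subvector at index 0 or 2 |
| | // of a <4 x i32> is "aligned" and can be done with a single D-register mov. |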
| 5699 | if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && |
| 5700 | LT.second.getSizeInBits() <= 128 && SubTp) { |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5701 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); |
David Green | fa784f6 | 2022-04-07 19:27:41 +0100 | [diff] [blame] | 5702 | if (SubLT.second.isVector()) { |
| 5703 | int NumElts = LT.second.getVectorNumElements(); |
| 5704 | int NumSubElts = SubLT.second.getVectorNumElements(); |
| 5705 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
| 5706 | return SubLT.first; |
| 5707 | } |
| 5708 | } |
| 5709 | |
Alexey Bataev | 7bc079c | 2024-02-12 07:09:49 -0500 | [diff] [blame] | 5710 | // Restore optimal kind. |
| 5711 | if (IsExtractSubvector) |
| 5712 | Kind = TTI::SK_ExtractSubvector; |
David Green | 4ac2721 | 2024-04-09 16:36:08 +0100 | [diff] [blame] | 5713 | return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args, |
| 5714 | CxtI); |
Matthew Simpson | b4096eb | 2018-04-26 13:48:33 +0000 | [diff] [blame] | 5715 | } |
David Sherwood | f15b6b2 | 2022-07-12 12:03:39 +0100 | [diff] [blame] | 5716 | |
David Sherwood | 636efd2 | 2023-03-14 18:15:03 +0000 | [diff] [blame] | 5717 | static bool containsDecreasingPointers(Loop *TheLoop, |
| 5718 | PredicatedScalarEvolution *PSE) { |
Philip Reames | e41dce4 | 2023-05-11 09:47:37 -0700 | [diff] [blame] | 5719 | const auto &Strides = DenseMap<Value *, const SCEV *>(); |
David Sherwood | 636efd2 | 2023-03-14 18:15:03 +0000 | [diff] [blame] | 5720 | for (BasicBlock *BB : TheLoop->blocks()) { |
| 5721 | // Scan the instructions in the block and look for addresses that are |
| 5722 | // consecutive and decreasing. |
| 5723 | for (Instruction &I : *BB) { |
| 5724 | if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { |
| 5725 | Value *Ptr = getLoadStorePointerOperand(&I); |
| 5726 | Type *AccessTy = getLoadStoreType(&I); |
| 5727 | if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, |
| 5728 | /*ShouldCheckWrap=*/false) |
| 5729 | .value_or(0) < 0) |
| 5730 | return true; |
| 5731 | } |
| 5732 | } |
| 5733 | } |
| 5734 | return false; |
| 5735 | } |
| 5736 | |
David Green | b35d345 | 2024-12-31 11:07:42 +0000 | [diff] [blame] | 5737 | bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const { |
| 5738 | if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences()) |
| 5739 | return SVEPreferFixedOverScalableIfEqualCost; |
| 5740 | return ST->useFixedOverScalableIfEqualCost(); |
| 5741 | } |
| 5742 | |
Sjoerd Meijer | 9bccf61 | 2024-11-20 09:33:39 +0000 | [diff] [blame] | 5743 | unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const { |
| 5744 | return ST->getEpilogueVectorizationMinVF(); |
| 5745 | } |
| 5746 | |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 5747 | bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { |
David Sherwood | 7beb2ca | 2023-04-20 12:34:55 +0000 | [diff] [blame] | 5748 | if (!ST->hasSVE()) |
David Sherwood | f15b6b2 | 2022-07-12 12:03:39 +0100 | [diff] [blame] | 5749 | return false; |
| 5750 | |
David Sherwood | 4ef9cb6 | 2022-07-18 10:36:11 +0100 | [diff] [blame] | 5751 | // We don't currently support vectorisation with interleaving for SVE - with |
| 5752 | // such loops we're better off not using tail-folding. This gives us a chance |
| 5753 | // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. |
David Sherwood | b4089cf | 2023-04-04 13:58:58 +0000 | [diff] [blame] | 5754 | if (TFI->IAI->hasGroups()) |
David Sherwood | 4ef9cb6 | 2022-07-18 10:36:11 +0100 | [diff] [blame] | 5755 | return false; |
| 5756 | |
David Sherwood | 7beb2ca | 2023-04-20 12:34:55 +0000 | [diff] [blame] | 5757 | TailFoldingOpts Required = TailFoldingOpts::Disabled; |
David Sherwood | b4089cf | 2023-04-04 13:58:58 +0000 | [diff] [blame] | 5758 | if (TFI->LVL->getReductionVars().size()) |
David Sherwood | 7beb2ca | 2023-04-20 12:34:55 +0000 | [diff] [blame] | 5759 | Required |= TailFoldingOpts::Reductions; |
David Sherwood | b4089cf | 2023-04-04 13:58:58 +0000 | [diff] [blame] | 5760 | if (TFI->LVL->getFixedOrderRecurrences().size()) |
David Sherwood | 7beb2ca | 2023-04-20 12:34:55 +0000 | [diff] [blame] | 5761 | Required |= TailFoldingOpts::Recurrences; |
David Sherwood | 636efd2 | 2023-03-14 18:15:03 +0000 | [diff] [blame] | 5762 | |
| 5763 | // We call this to discover whether any load/store pointers in the loop have |
| 5764 | // negative strides. This will require extra work to reverse the loop |
| 5765 | // predicate, which may be expensive. |
David Sherwood | b4089cf | 2023-04-04 13:58:58 +0000 | [diff] [blame] | 5766 | if (containsDecreasingPointers(TFI->LVL->getLoop(), |
| 5767 | TFI->LVL->getPredicatedScalarEvolution())) |
David Sherwood | 7beb2ca | 2023-04-20 12:34:55 +0000 | [diff] [blame] | 5768 | Required |= TailFoldingOpts::Reverse; |
| 5769 | if (Required == TailFoldingOpts::Disabled) |
| 5770 | Required |= TailFoldingOpts::Simple; |
David Sherwood | f15b6b2 | 2022-07-12 12:03:39 +0100 | [diff] [blame] | 5771 | |
David Sherwood | c7dbe32 | 2023-04-25 08:46:41 +0000 | [diff] [blame] | 5772 | if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(), |
| 5773 | Required)) |
| 5774 | return false; |
| 5775 | |
| 5776 | // Don't tail-fold for tight loops where we would be better off interleaving |
| 5777 | // with an unpredicated loop. |
| 5778 | unsigned NumInsns = 0; |
| 5779 | for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) { |
| 5780 | NumInsns += BB->sizeWithoutDebug(); |
| 5781 | } |
| 5782 | |
| 5783 | // We expect 4 of these to be an IV PHI, IV add, IV compare and branch. |
| 5784 | return NumInsns >= SVETailFoldInsnThreshold; |
David Sherwood | f15b6b2 | 2022-07-12 12:03:39 +0100 | [diff] [blame] | 5785 | } |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5786 | |
| 5787 | InstructionCost |
| 5788 | AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
Graham Hunter | 2e8d815 | 2024-05-10 11:22:11 +0100 | [diff] [blame] | 5789 | StackOffset BaseOffset, bool HasBaseReg, |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5790 | int64_t Scale, unsigned AddrSpace) const { |
| 5791 | // Scaling factors are not free at all. |
| 5792 | // Operands | Rt Latency |
| 5793 | // ------------------------------------------- |
| 5794 | // Rt, [Xn, Xm] | 4 |
| 5795 | // ------------------------------------------- |
| 5796 | // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 |
| 5797 | // Rt, [Xn, Wm, <extend> #imm] | |
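| | // For example (illustrative): "ldr w0, [x1, x2, lsl #2]" uses Scale == 4 and |
| | // is costed as 1 below, while "ldr w0, [x1, x2]" (Scale == 1) is treated as free. |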
| 5798 | TargetLoweringBase::AddrMode AM; |
| 5799 | AM.BaseGV = BaseGV; |
Graham Hunter | 2e8d815 | 2024-05-10 11:22:11 +0100 | [diff] [blame] | 5800 | AM.BaseOffs = BaseOffset.getFixed(); |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5801 | AM.HasBaseReg = HasBaseReg; |
| 5802 | AM.Scale = Scale; |
Graham Hunter | 2e8d815 | 2024-05-10 11:22:11 +0100 | [diff] [blame] | 5803 | AM.ScalableOffset = BaseOffset.getScalable(); |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5804 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) |
| 5805 | // Scale represents reg2 * scale, thus charge a cost of 1 if |
| 5806 | // it is not equal to 0 or 1. |
| 5807 | return AM.Scale != 0 && AM.Scale != 1; |
Craig Topper | 39c454a | 2025-03-05 09:10:45 -0800 | [diff] [blame] | 5808 | return InstructionCost::getInvalid(); |
Daniil Fukalov | 7ed3d81 | 2022-08-18 00:38:34 +0300 | [diff] [blame] | 5809 | } |
David Green | a2d68b4 | 2024-01-22 23:46:58 +0000 | [diff] [blame] | 5810 | |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 5811 | bool AArch64TTIImpl::shouldTreatInstructionLikeSelect( |
| 5812 | const Instruction *I) const { |
Florian Hahn | 9a0f251 | 2024-11-30 21:05:41 +0000 | [diff] [blame] | 5813 | if (EnableOrLikeSelectOpt) { |
| 5814 | // For the binary operators (e.g. or) we need to be more careful than with |
| 5815 | // selects; here we only transform them if they are already at a natural |
| 5816 | // break point in the code - the end of a block with an unconditional |
| 5817 | // terminator. |
| 5818 | if (I->getOpcode() == Instruction::Or && |
| 5819 | isa<BranchInst>(I->getNextNode()) && |
| 5820 | cast<BranchInst>(I->getNextNode())->isUnconditional()) |
| 5821 | return true; |
| 5822 | |
| 5823 | if (I->getOpcode() == Instruction::Add || |
| 5824 | I->getOpcode() == Instruction::Sub) |
| 5825 | return true; |
| 5826 | } |
David Green | a2d68b4 | 2024-01-22 23:46:58 +0000 | [diff] [blame] | 5827 | return BaseT::shouldTreatInstructionLikeSelect(I); |
Sander de Smalen | 3abf55a | 2024-01-31 11:38:29 +0000 | [diff] [blame] | 5828 | } |
Graham Hunter | e16f2f5 | 2024-06-06 14:45:36 +0100 | [diff] [blame] | 5829 | |
Sergei Barannikov | 0014b49 | 2025-04-22 06:27:29 +0300 | [diff] [blame] | 5830 | bool AArch64TTIImpl::isLSRCostLess( |
| 5831 | const TargetTransformInfo::LSRCost &C1, |
| 5832 | const TargetTransformInfo::LSRCost &C2) const { |
Graham Hunter | e16f2f5 | 2024-06-06 14:45:36 +0100 | [diff] [blame] | 5833 | // What is AArch64-specific here is adding the number of instructions to the |
| 5834 | // comparison (though not as the first consideration, as some targets do) |
| 5835 | // along with changing the priority of the base additions. |
| 5836 | // TODO: Maybe a more nuanced tradeoff between instruction count |
| 5837 | // and number of registers? To be investigated at a later date. |
| 5838 | if (EnableLSRCostOpt) |
| 5839 | return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost, |
| 5840 | C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < |
| 5841 | std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost, |
| 5842 | C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost); |
| 5843 | |
| 5844 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
Sander de Smalen | 738533c | 2024-06-24 11:06:16 +0100 | [diff] [blame] | 5845 | } |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 5846 | |
| 5847 | static bool isSplatShuffle(Value *V) { |
| 5848 | if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) |
| 5849 | return all_equal(Shuf->getShuffleMask()); |
| 5850 | return false; |
| 5851 | } |
| 5852 | |
| 5853 | /// Check if both Op1 and Op2 are shufflevector extracts of either the lower |
| 5854 | /// or upper half of the vector elements. |
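| | /// For example (illustrative): |
| | ///   shufflevector <16 x i8> %v, <16 x i8> poison, |
| | ///     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| | /// extracts the upper half of %v. |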
| 5855 | static bool areExtractShuffleVectors(Value *Op1, Value *Op2, |
| 5856 | bool AllowSplat = false) { |
| 5857 | // Scalable types can't be extract shuffle vectors. |
| 5858 | if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) |
| 5859 | return false; |
| 5860 | |
| 5861 | auto areTypesHalfed = [](Value *FullV, Value *HalfV) { |
| 5862 | auto *FullTy = FullV->getType(); |
| 5863 | auto *HalfTy = HalfV->getType(); |
| 5864 | return FullTy->getPrimitiveSizeInBits().getFixedValue() == |
| 5865 | 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); |
| 5866 | }; |
| 5867 | |
| 5868 | auto extractHalf = [](Value *FullV, Value *HalfV) { |
| 5869 | auto *FullVT = cast<FixedVectorType>(FullV->getType()); |
| 5870 | auto *HalfVT = cast<FixedVectorType>(HalfV->getType()); |
| 5871 | return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); |
| 5872 | }; |
| 5873 | |
| 5874 | ArrayRef<int> M1, M2; |
| 5875 | Value *S1Op1 = nullptr, *S2Op1 = nullptr; |
| 5876 | if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) || |
| 5877 | !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2)))) |
| 5878 | return false; |
| 5879 | |
| 5880 | // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that |
| 5881 | // it is not checked as an extract below. |
| 5882 | if (AllowSplat && isSplatShuffle(Op1)) |
| 5883 | S1Op1 = nullptr; |
| 5884 | if (AllowSplat && isSplatShuffle(Op2)) |
| 5885 | S2Op1 = nullptr; |
| 5886 | |
| 5887 | // Check that the operands are half as wide as the result and we extract |
| 5888 | // half of the elements of the input vectors. |
| 5889 | if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || |
| 5890 | (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) |
| 5891 | return false; |
| 5892 | |
| 5893 | // Check that the mask extracts either the lower or upper half of the vector |
| 5894 | // elements. |
| 5895 | int M1Start = 0; |
| 5896 | int M2Start = 0; |
| 5897 | int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2; |
| 5898 | if ((S1Op1 && |
| 5899 | !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) || |
| 5900 | (S2Op1 && |
| 5901 | !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start))) |
| 5902 | return false; |
| 5903 | |
| 5904 | if ((M1Start != 0 && M1Start != (NumElements / 2)) || |
| 5905 | (M2Start != 0 && M2Start != (NumElements / 2))) |
| 5906 | return false; |
| 5907 | if (S1Op1 && S2Op1 && M1Start != M2Start) |
| 5908 | return false; |
| 5909 | |
| 5910 | return true; |
| 5911 | } |
| 5912 | |
| 5913 | /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth |
| 5914 | /// of the vector elements. |
| 5915 | static bool areExtractExts(Value *Ext1, Value *Ext2) { |
| 5916 | auto areExtDoubled = [](Instruction *Ext) { |
| 5917 | return Ext->getType()->getScalarSizeInBits() == |
| 5918 | 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); |
| 5919 | }; |
| 5920 | |
| 5921 | if (!match(Ext1, m_ZExtOrSExt(m_Value())) || |
| 5922 | !match(Ext2, m_ZExtOrSExt(m_Value())) || |
| 5923 | !areExtDoubled(cast<Instruction>(Ext1)) || |
| 5924 | !areExtDoubled(cast<Instruction>(Ext2))) |
| 5925 | return false; |
| 5926 | |
| 5927 | return true; |
| 5928 | } |
| 5929 | |
| 5930 | /// Check if Op could be used with vmull_high_p64 intrinsic. |
| 5931 | static bool isOperandOfVmullHighP64(Value *Op) { |
| 5932 | Value *VectorOperand = nullptr; |
| 5933 | ConstantInt *ElementIndex = nullptr; |
| 5934 | return match(Op, m_ExtractElt(m_Value(VectorOperand), |
| 5935 | m_ConstantInt(ElementIndex))) && |
| 5936 | ElementIndex->getValue() == 1 && |
| 5937 | isa<FixedVectorType>(VectorOperand->getType()) && |
| 5938 | cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2; |
| 5939 | } |
| 5940 | |
| 5941 | /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. |
| 5942 | static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { |
| 5943 | return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); |
| 5944 | } |
| 5945 | |
| 5946 | static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) { |
| 5947 | // Restrict ourselves to the form CodeGenPrepare typically constructs. |
| 5948 | auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs); |
| 5949 | if (!GEP || GEP->getNumOperands() != 2) |
| 5950 | return false; |
| 5951 | |
| 5952 | Value *Base = GEP->getOperand(0); |
| 5953 | Value *Offsets = GEP->getOperand(1); |
| 5954 | |
| 5955 | // We only care about scalar_base+vector_offsets. |
| 5956 | if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) |
| 5957 | return false; |
| 5958 | |
| 5959 | // Sink extends that would allow us to use 32-bit offset vectors. |
| 5960 | if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) { |
| 5961 | auto *OffsetsInst = cast<Instruction>(Offsets); |
| 5962 | if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && |
| 5963 | OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32) |
| 5964 | Ops.push_back(&GEP->getOperandUse(1)); |
| 5965 | } |
| 5966 | |
| 5967 | // Sink the GEP. |
| 5968 | return true; |
| 5969 | } |
| 5970 | |
| 5971 | /// We want to sink following cases: |
| 5972 | /// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; |
| 5973 | /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); |
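| | /// For example (illustrative): a GEP whose offset is mul(llvm.vscale.i64(), 16) |
| | /// has the vscale and mul sunk next to it, letting ISel fold the scalable |
| | /// offset into an addressing mode such as [Xn, #imm, MUL VL] or an ADDVL. |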
| 5974 | static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) { |
| 5975 | if (match(Op, m_VScale())) |
| 5976 | return true; |
| 5977 | if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) || |
| 5978 | match(Op, m_Mul(m_VScale(), m_ConstantInt()))) { |
| 5979 | Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0)); |
| 5980 | return true; |
| 5981 | } |
| 5982 | if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) || |
| 5983 | match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) { |
| 5984 | Value *ZExtOp = cast<Instruction>(Op)->getOperand(0); |
| 5985 | Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0)); |
| 5986 | Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0)); |
| 5987 | return true; |
| 5988 | } |
| 5989 | return false; |
| 5990 | } |
| 5991 | |
| 5992 | /// Check if sinking \p I's operands to I's basic block is profitable, because |
| 5993 | /// the operands can be folded into a target instruction, e.g. |
| 5994 | /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). |
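| | /// For example (illustrative): once the sexts and high-half extracts are sunk |
| | /// into the block of the sub, sub(sext(extract_high a), sext(extract_high b)) |
| | /// can select to a single ssubl2. |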
| 5995 | bool AArch64TTIImpl::isProfitableToSinkOperands( |
| 5996 | Instruction *I, SmallVectorImpl<Use *> &Ops) const { |
| 5997 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { |
| 5998 | switch (II->getIntrinsicID()) { |
| 5999 | case Intrinsic::aarch64_neon_smull: |
| 6000 | case Intrinsic::aarch64_neon_umull: |
| 6001 | if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1), |
| 6002 | /*AllowSplat=*/true)) { |
| 6003 | Ops.push_back(&II->getOperandUse(0)); |
| 6004 | Ops.push_back(&II->getOperandUse(1)); |
| 6005 | return true; |
| 6006 | } |
| 6007 | [[fallthrough]]; |
| 6008 | |
| 6009 | case Intrinsic::fma: |
| 6010 | case Intrinsic::fmuladd: |
| 6011 | if (isa<VectorType>(I->getType()) && |
| 6012 | cast<VectorType>(I->getType())->getElementType()->isHalfTy() && |
| 6013 | !ST->hasFullFP16()) |
| 6014 | return false; |
| 6015 | [[fallthrough]]; |
| 6016 | case Intrinsic::aarch64_neon_sqdmull: |
| 6017 | case Intrinsic::aarch64_neon_sqdmulh: |
| 6018 | case Intrinsic::aarch64_neon_sqrdmulh: |
| 6019 | // Sink splats for index lane variants |
| 6020 | if (isSplatShuffle(II->getOperand(0))) |
| 6021 | Ops.push_back(&II->getOperandUse(0)); |
| 6022 | if (isSplatShuffle(II->getOperand(1))) |
| 6023 | Ops.push_back(&II->getOperandUse(1)); |
| 6024 | return !Ops.empty(); |
| 6025 | case Intrinsic::aarch64_neon_fmlal: |
| 6026 | case Intrinsic::aarch64_neon_fmlal2: |
| 6027 | case Intrinsic::aarch64_neon_fmlsl: |
| 6028 | case Intrinsic::aarch64_neon_fmlsl2: |
| 6029 | // Sink splats for index lane variants |
| 6030 | if (isSplatShuffle(II->getOperand(1))) |
| 6031 | Ops.push_back(&II->getOperandUse(1)); |
| 6032 | if (isSplatShuffle(II->getOperand(2))) |
| 6033 | Ops.push_back(&II->getOperandUse(2)); |
| 6034 | return !Ops.empty(); |
| 6035 | case Intrinsic::aarch64_sve_ptest_first: |
| 6036 | case Intrinsic::aarch64_sve_ptest_last: |
| 6037 | if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0))) |
| 6038 | if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) |
| 6039 | Ops.push_back(&II->getOperandUse(0)); |
| 6040 | return !Ops.empty(); |
| 6041 | case Intrinsic::aarch64_sme_write_horiz: |
| 6042 | case Intrinsic::aarch64_sme_write_vert: |
| 6043 | case Intrinsic::aarch64_sme_writeq_horiz: |
| 6044 | case Intrinsic::aarch64_sme_writeq_vert: { |
| 6045 | auto *Idx = dyn_cast<Instruction>(II->getOperand(1)); |
| 6046 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
| 6047 | return false; |
| 6048 | Ops.push_back(&II->getOperandUse(1)); |
| 6049 | return true; |
| 6050 | } |
| 6051 | case Intrinsic::aarch64_sme_read_horiz: |
| 6052 | case Intrinsic::aarch64_sme_read_vert: |
| 6053 | case Intrinsic::aarch64_sme_readq_horiz: |
| 6054 | case Intrinsic::aarch64_sme_readq_vert: |
| 6055 | case Intrinsic::aarch64_sme_ld1b_vert: |
| 6056 | case Intrinsic::aarch64_sme_ld1h_vert: |
| 6057 | case Intrinsic::aarch64_sme_ld1w_vert: |
| 6058 | case Intrinsic::aarch64_sme_ld1d_vert: |
| 6059 | case Intrinsic::aarch64_sme_ld1q_vert: |
| 6060 | case Intrinsic::aarch64_sme_st1b_vert: |
| 6061 | case Intrinsic::aarch64_sme_st1h_vert: |
| 6062 | case Intrinsic::aarch64_sme_st1w_vert: |
| 6063 | case Intrinsic::aarch64_sme_st1d_vert: |
| 6064 | case Intrinsic::aarch64_sme_st1q_vert: |
| 6065 | case Intrinsic::aarch64_sme_ld1b_horiz: |
| 6066 | case Intrinsic::aarch64_sme_ld1h_horiz: |
| 6067 | case Intrinsic::aarch64_sme_ld1w_horiz: |
| 6068 | case Intrinsic::aarch64_sme_ld1d_horiz: |
| 6069 | case Intrinsic::aarch64_sme_ld1q_horiz: |
| 6070 | case Intrinsic::aarch64_sme_st1b_horiz: |
| 6071 | case Intrinsic::aarch64_sme_st1h_horiz: |
| 6072 | case Intrinsic::aarch64_sme_st1w_horiz: |
| 6073 | case Intrinsic::aarch64_sme_st1d_horiz: |
| 6074 | case Intrinsic::aarch64_sme_st1q_horiz: { |
| 6075 | auto *Idx = dyn_cast<Instruction>(II->getOperand(3)); |
| 6076 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
| 6077 | return false; |
| 6078 | Ops.push_back(&II->getOperandUse(3)); |
| 6079 | return true; |
| 6080 | } |
| 6081 | case Intrinsic::aarch64_neon_pmull: |
| 6082 | if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) |
| 6083 | return false; |
| 6084 | Ops.push_back(&II->getOperandUse(0)); |
| 6085 | Ops.push_back(&II->getOperandUse(1)); |
| 6086 | return true; |
| 6087 | case Intrinsic::aarch64_neon_pmull64: |
| 6088 | if (!areOperandsOfVmullHighP64(II->getArgOperand(0), |
| 6089 | II->getArgOperand(1))) |
| 6090 | return false; |
| 6091 | Ops.push_back(&II->getArgOperandUse(0)); |
| 6092 | Ops.push_back(&II->getArgOperandUse(1)); |
| 6093 | return true; |
| 6094 | case Intrinsic::masked_gather: |
| 6095 | if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops)) |
| 6096 | return false; |
| 6097 | Ops.push_back(&II->getArgOperandUse(0)); |
| 6098 | return true; |
| 6099 | case Intrinsic::masked_scatter: |
| 6100 | if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops)) |
| 6101 | return false; |
| 6102 | Ops.push_back(&II->getArgOperandUse(1)); |
| 6103 | return true; |
| 6104 | default: |
| 6105 | return false; |
| 6106 | } |
| 6107 | } |
| 6108 | |
David Sherwood | 346185c | 2025-01-06 13:17:14 +0000 | [diff] [blame] | 6109 | auto ShouldSinkCondition = [](Value *Cond) -> bool { |
| 6110 | auto *II = dyn_cast<IntrinsicInst>(Cond); |
| 6111 | return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or && |
| 6112 | isa<ScalableVectorType>(II->getOperand(0)->getType()); |
| 6113 | }; |
| 6114 | |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6115 | switch (I->getOpcode()) { |
| 6116 | case Instruction::GetElementPtr: |
| 6117 | case Instruction::Add: |
| 6118 | case Instruction::Sub: |
David Sherwood | 346185c | 2025-01-06 13:17:14 +0000 | [diff] [blame] | 6119 | // Sink vscales closer to uses for better isel |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6120 | for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { |
| 6121 | if (shouldSinkVScale(I->getOperand(Op), Ops)) { |
| 6122 | Ops.push_back(&I->getOperandUse(Op)); |
| 6123 | return true; |
| 6124 | } |
| 6125 | } |
| 6126 | break; |
David Sherwood | 346185c | 2025-01-06 13:17:14 +0000 | [diff] [blame] | 6127 | case Instruction::Select: { |
| 6128 | if (!ShouldSinkCondition(I->getOperand(0))) |
| 6129 | return false; |
| 6130 | |
| 6131 | Ops.push_back(&I->getOperandUse(0)); |
| 6132 | return true; |
| 6133 | } |
| 6134 | case Instruction::Br: { |
| 6135 | if (cast<BranchInst>(I)->isUnconditional()) |
| 6136 | return false; |
| 6137 | |
| 6138 | if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition())) |
| 6139 | return false; |
| 6140 | |
| 6141 | Ops.push_back(&I->getOperandUse(0)); |
| 6142 | return true; |
| 6143 | } |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6144 | default: |
| 6145 | break; |
| 6146 | } |
| 6147 | |
| 6148 | if (!I->getType()->isVectorTy()) |
| 6149 | return false; |
| 6150 | |
| 6151 | switch (I->getOpcode()) { |
| 6152 | case Instruction::Sub: |
| 6153 | case Instruction::Add: { |
| 6154 | if (!areExtractExts(I->getOperand(0), I->getOperand(1))) |
| 6155 | return false; |
| 6156 | |
| 6157 | // If the exts' operands extract either the lower or upper elements, we |
| 6158 | // can sink them too. |
| 6159 | auto Ext1 = cast<Instruction>(I->getOperand(0)); |
| 6160 | auto Ext2 = cast<Instruction>(I->getOperand(1)); |
| 6161 | if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { |
| 6162 | Ops.push_back(&Ext1->getOperandUse(0)); |
| 6163 | Ops.push_back(&Ext2->getOperandUse(0)); |
| 6164 | } |
| 6165 | |
| 6166 | Ops.push_back(&I->getOperandUse(0)); |
| 6167 | Ops.push_back(&I->getOperandUse(1)); |
| 6168 | |
| 6169 | return true; |
| 6170 | } |
| 6171 | case Instruction::Or: { |
| 6172 | // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> |
| 6173 | // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) |
| 6174 | if (ST->hasNEON()) { |
| 6175 | Instruction *OtherAnd, *IA, *IB; |
| 6176 | Value *MaskValue; |
| 6177 | // MainAnd refers to the And instruction that has 'Not' as one of its operands. |
| 6178 | if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), |
| 6179 | m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), |
| 6180 | m_Instruction(IA)))))) { |
| 6181 | if (match(OtherAnd, |
| 6182 | m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { |
| 6183 | Instruction *MainAnd = I->getOperand(0) == OtherAnd |
| 6184 | ? cast<Instruction>(I->getOperand(1)) |
| 6185 | : cast<Instruction>(I->getOperand(0)); |
| 6186 | |
| 6187 | // Both Ands should be in the same basic block as the Or. |
| 6188 | if (I->getParent() != MainAnd->getParent() || |
| 6189 | I->getParent() != OtherAnd->getParent()) |
| 6190 | return false; |
| 6191 | |
| 6192 | // Non-mask operands of both Ands should also be in the same basic block. |
| 6193 | if (I->getParent() != IA->getParent() || |
| 6194 | I->getParent() != IB->getParent()) |
| 6195 | return false; |
| 6196 | |
| 6197 | Ops.push_back( |
| 6198 | &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0)); |
| 6199 | Ops.push_back(&I->getOperandUse(0)); |
| 6200 | Ops.push_back(&I->getOperandUse(1)); |
| 6201 | |
| 6202 | return true; |
| 6203 | } |
| 6204 | } |
| 6205 | } |
| 6206 | |
| 6207 | return false; |
| 6208 | } |
| 6209 | case Instruction::Mul: { |
Hari Limaye | 8bc9551 | 2024-12-06 12:45:18 +0000 | [diff] [blame] | 6210 | auto ShouldSinkSplatForIndexedVariant = [](Value *V) { |
| 6211 | auto *Ty = cast<VectorType>(V->getType()); |
| 6212 | // For SVE the lane-indexing is within 128-bits, so we can't fold splats. |
| 6213 | if (Ty->isScalableTy()) |
| 6214 | return false; |
| 6215 | |
| 6216 | // Indexed variants of Mul exist for i16 and i32 element types only. |
| 6217 | return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32; |
| 6218 | }; |
| 6219 | |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6220 | int NumZExts = 0, NumSExts = 0; |
| 6221 | for (auto &Op : I->operands()) { |
| 6222 | // Make sure we are not already sinking this operand |
| 6223 | if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) |
| 6224 | continue; |
| 6225 | |
Hari Limaye | 8bc9551 | 2024-12-06 12:45:18 +0000 | [diff] [blame] | 6226 | if (match(&Op, m_ZExtOrSExt(m_Value()))) { |
| 6227 | auto *Ext = cast<Instruction>(Op); |
| 6228 | auto *ExtOp = Ext->getOperand(0); |
| 6229 | if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp)) |
| 6230 | Ops.push_back(&Ext->getOperandUse(0)); |
| 6231 | Ops.push_back(&Op); |
| 6232 | |
| 6233 | if (isa<SExtInst>(Ext)) |
| 6234 | NumSExts++; |
| 6235 | else |
| 6236 | NumZExts++; |
| 6237 | |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6238 | continue; |
| 6239 | } |
| 6240 | |
| 6241 | ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); |
Hari Limaye | 8bc9551 | 2024-12-06 12:45:18 +0000 | [diff] [blame] | 6242 | if (!Shuffle) |
| 6243 | continue; |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6244 | |
| 6245 | // If the Shuffle is a splat and the operand is a zext/sext, sinking the |
| 6246 | // operand and the s/zext can help create indexed s/umull. This is |
| 6247 | // especially useful to prevent an i64 mul from being scalarized. |
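| | // For example (illustrative): a splat of a sign-extended lane feeding one |
| | // operand can select to an indexed "smull v0.4s, v1.4h, v2.h[0]". |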
Hari Limaye | 8bc9551 | 2024-12-06 12:45:18 +0000 | [diff] [blame] | 6248 | if (isSplatShuffle(Shuffle) && |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6249 | match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { |
| 6250 | Ops.push_back(&Shuffle->getOperandUse(0)); |
| 6251 | Ops.push_back(&Op); |
| 6252 | if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) |
| 6253 | NumSExts++; |
| 6254 | else |
| 6255 | NumZExts++; |
| 6256 | continue; |
| 6257 | } |
| 6258 | |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6259 | Value *ShuffleOperand = Shuffle->getOperand(0); |
| 6260 | InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); |
| 6261 | if (!Insert) |
| 6262 | continue; |
| 6263 | |
| 6264 | Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); |
| 6265 | if (!OperandInstr) |
| 6266 | continue; |
| 6267 | |
| 6268 | ConstantInt *ElementConstant = |
| 6269 | dyn_cast<ConstantInt>(Insert->getOperand(2)); |
| 6270 | // Check that the insertelement is inserting into element 0 |
| 6271 | if (!ElementConstant || !ElementConstant->isZero()) |
| 6272 | continue; |
| 6273 | |
| 6274 | unsigned Opcode = OperandInstr->getOpcode(); |
| 6275 | if (Opcode == Instruction::SExt) |
| 6276 | NumSExts++; |
| 6277 | else if (Opcode == Instruction::ZExt) |
| 6278 | NumZExts++; |
| 6279 | else { |
| 6280 | // If we find that the top bits are known 0, then we can sink and allow |
| 6281 | // the backend to generate a umull. |
| 6282 | unsigned Bitwidth = I->getType()->getScalarSizeInBits(); |
| 6283 | APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); |
| 6284 | const DataLayout &DL = I->getDataLayout(); |
| 6285 | if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) |
| 6286 | continue; |
| 6287 | NumZExts++; |
| 6288 | } |
| 6289 | |
David Green | 5a069ea | 2025-01-10 11:54:46 +0000 | [diff] [blame] | 6290 | // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking |
| 6291 | // the And, just to hoist it again back to the load. |
| 6292 | if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value()))) |
| 6293 | Ops.push_back(&Insert->getOperandUse(1)); |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6294 | Ops.push_back(&Shuffle->getOperandUse(0)); |
| 6295 | Ops.push_back(&Op); |
| 6296 | } |
| 6297 | |
Hari Limaye | 8bc9551 | 2024-12-06 12:45:18 +0000 | [diff] [blame] | 6298 | // It is profitable to sink if we found two of the same type of extends. |
| 6299 | if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2)) |
| 6300 | return true; |
| 6301 | |
| 6302 | // Otherwise, see if we should sink splats for indexed variants. |
| 6303 | if (!ShouldSinkSplatForIndexedVariant(I)) |
| 6304 | return false; |
| 6305 | |
| 6306 | Ops.clear(); |
| 6307 | if (isSplatShuffle(I->getOperand(0))) |
| 6308 | Ops.push_back(&I->getOperandUse(0)); |
| 6309 | if (isSplatShuffle(I->getOperand(1))) |
| 6310 | Ops.push_back(&I->getOperandUse(1)); |
| 6311 | |
| 6312 | return !Ops.empty(); |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6313 | } |
Hari Limaye | 4f0403f | 2024-11-19 12:59:22 +0000 | [diff] [blame] | 6314 | case Instruction::FMul: { |
| 6315 | // For SVE the lane-indexing is within 128-bits, so we can't fold splats. |
| 6316 | if (I->getType()->isScalableTy()) |
| 6317 | return false; |
| 6318 | |
| 6319 | if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() && |
| 6320 | !ST->hasFullFP16()) |
| 6321 | return false; |
| 6322 | |
| 6323 | // Sink splats for index lane variants |
| 6324 | if (isSplatShuffle(I->getOperand(0))) |
| 6325 | Ops.push_back(&I->getOperandUse(0)); |
| 6326 | if (isSplatShuffle(I->getOperand(1))) |
| 6327 | Ops.push_back(&I->getOperandUse(1)); |
| 6328 | return !Ops.empty(); |
| 6329 | } |
Jeffrey Byrnes | 853c43d | 2024-10-09 14:30:09 -0700 | [diff] [blame] | 6330 | default: |
| 6331 | return false; |
| 6332 | } |
| 6333 | return false; |
| 6334 | } |