//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

static cl::opt<unsigned> DMBLookaheadThreshold(
    "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
    cl::desc("The number of instructions to search for a redundant dmb"));

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits the flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
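  //
  // For example (illustrative): -sve-tail-folding=all+noreverse sets
  // InitialBits to TailFoldingOpts::All and DisableBits to
  // TailFoldingOpts::Reverse, whereas -sve-tail-folding=default+reductions
  // keeps NeedsDefault true and sets EnableBits to TailFoldingOpts::Reductions.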
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
  StringRef AttributeStr =
      isMultiversionedFunction(F) ? "fmv-features" : "target-features";
  StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
  SmallVector<StringRef, 8> Features;
  FeatureStr.split(Features, ",");
  return AArch64::getFMVPriority(Features);
}

bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
  return F.hasFnAttribute("fmv-features");
}

const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
    AArch64::FeatureExecuteOnly,
};

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
      CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();
  // Adjust the feature bitsets by inverting some of the bits. This is needed
  // for target features that represent restrictions rather than capabilities,
  // for example a "+execute-only" callee can be inlined into a caller without
  // "+execute-only", but not vice versa.
  FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
  FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;

  return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
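  //
  // For example (illustrative), a pointer-to-<8 x float> argument (256 bits of
  // data) is rejected by the check below, while a pointer-to-<4 x float>
  // argument (128 bits) is accepted since it matches a NEON-sized vector.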
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //     call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function).
  //
  // (2) F:
  //     call from F -> G (the call here is not Call)
  //     G:
  //     call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is a benefit to doing the streaming-mode
  // change only once and avoiding the inlining of G into F.
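  //
  // With the default values of the options defined above, case (1) multiplies
  // DefaultCallPenalty by CallPenaltyChangeSM (5) and case (2) by
  // InlineCallPenaltyChangeSM (10).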
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost
AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
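  // For example (illustrative), a 96-bit constant is sign-extended to 128 bits
  // and costed as two 64-bit chunks.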
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
414
Sander de Smalenf9a50f02021-01-27 15:01:16 +0000415InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
416 const APInt &Imm, Type *Ty,
417 TTI::TargetCostKind CostKind,
Sergei Barannikov0014b492025-04-22 06:27:29 +0300418 Instruction *Inst) const {
Tim Northover3b0846e2014-05-24 12:50:23 +0000419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 // There is no cost model for constants with a bit size of 0. Return TCC_Free
423 // here, so that constant hoisting will ignore this constant.
424 if (BitSize == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +0000425 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000426
427 unsigned ImmIdx = ~0U;
428 switch (Opcode) {
429 default:
Chandler Carruth705b1852015-01-31 03:43:40 +0000430 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000431 case Instruction::GetElementPtr:
432 // Always hoist the base address of a GetElementPtr.
433 if (Idx == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +0000434 return 2 * TTI::TCC_Basic;
435 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000436 case Instruction::Store:
437 ImmIdx = 0;
438 break;
439 case Instruction::Add:
440 case Instruction::Sub:
441 case Instruction::Mul:
442 case Instruction::UDiv:
443 case Instruction::SDiv:
444 case Instruction::URem:
445 case Instruction::SRem:
446 case Instruction::And:
447 case Instruction::Or:
448 case Instruction::Xor:
449 case Instruction::ICmp:
450 ImmIdx = 1;
451 break;
452 // Always return TCC_Free for the shift value of a shift instruction.
453 case Instruction::Shl:
454 case Instruction::LShr:
455 case Instruction::AShr:
456 if (Idx == 1)
Chandler Carruth705b1852015-01-31 03:43:40 +0000457 return TTI::TCC_Free;
Tim Northover3b0846e2014-05-24 12:50:23 +0000458 break;
459 case Instruction::Trunc:
460 case Instruction::ZExt:
461 case Instruction::SExt:
462 case Instruction::IntToPtr:
463 case Instruction::PtrToInt:
464 case Instruction::BitCast:
465 case Instruction::PHI:
466 case Instruction::Call:
467 case Instruction::Select:
468 case Instruction::Ret:
469 case Instruction::Load:
470 break;
471 }
472
473 if (Idx == ImmIdx) {
Chandler Carruth93205eb2015-08-05 18:08:10 +0000474 int NumConstants = (BitSize + 63) / 64;
Sander de Smalenf9a50f02021-01-27 15:01:16 +0000475 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
Chandler Carruth705b1852015-01-31 03:43:40 +0000476 return (Cost <= NumConstants * TTI::TCC_Basic)
Chandler Carruth93205eb2015-08-05 18:08:10 +0000477 ? static_cast<int>(TTI::TCC_Free)
Chandler Carruth705b1852015-01-31 03:43:40 +0000478 : Cost;
Tim Northover3b0846e2014-05-24 12:50:23 +0000479 }
Sam Parker40574fe2020-04-28 14:11:27 +0100480 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
Tim Northover3b0846e2014-05-24 12:50:23 +0000481}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}

static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements
  unsigned TotalHistCnts = 1;

  unsigned EltSize = EltTy->getScalarSizeInBits();
  // Only allow (up to 64b) integers or pointers
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
    return InstructionCost::getInvalid();

  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
    unsigned EC = VTy->getElementCount().getKnownMinValue();
    if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
      return InstructionCost::getInvalid();

    // HistCnt only supports 32b and 64b element types
    unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;

    if (EC == 2 || (LegalEltSize == 32 && EC == 4))
      return InstructionCost(BaseHistCntCost);

    unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
    TotalHistCnts = EC / NaturalVectorWidth;
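    // For example (illustrative), <vscale x 8 x ptr> buckets of i32 elements
    // give LegalEltSize = 32, NaturalVectorWidth = 4 and TotalHistCnts = 2,
    // i.e. an overall cost of 2 * BaseHistCntCost.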
  }

  return InstructionCost(BaseHistCntCost * TotalHistCnts);
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
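    // For example (illustrative), <vscale x 8 x i64> legalises to four
    // <vscale x 2 x i64> parts (LT.first == 4), i.e. one index instruction
    // plus three vector adds.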
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are converted
      // to; add 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64,   4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8,  1},
        {ISD::CTPOP, MVT::i32,   5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64)) {
      if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
           (LT.second == MVT::f64 && MTy == MVT::i32) ||
           (LT.second == MVT::f32 && MTy == MVT::i64)))
        return LT.first;
      // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
      if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
          MTy.getScalarSizeInBits() == 64)
        return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
    }
    // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
    // f32.
    if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
      return LT.first + getIntrinsicInstrCost(
                            {ICA.getID(),
                             RetTy,
                             {ICA.getArgTypes()[0]->getWithNewType(
                                 Type::getFloatTy(RetTy->getContext()))}},
                            CostKind);
    if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
        (LT.second == MVT::f16 && MTy == MVT::i64) ||
        ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
         (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
      return LT.first;
    // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
        MTy.getScalarSizeInBits() == 32)
      return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
    // Extending vector types v8f16->v8i64. These currently scalarize but the
    // codegen could be better.
    if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
        MTy.getScalarSizeInBits() == 64)
      return MTy.getVectorNumElements() * 3;

    // If we can, use a legal convert followed by a min+max.
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         LT.second.getScalarType() == MVT::f16) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost +
             ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
                                                                           : 1);
    }
    // Otherwise we need to follow the default expansion that clamps the value
    // using a float min/max with a fcmp+sel for NaN handling when signed.
    Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
    RetTy = RetTy->getScalarType();
    if (LT.second.isVector()) {
      FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
      RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
    }
    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
    Cost +=
        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
    if (IsSigned) {
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
    }
    return LT.first * Cost;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // TODO: Add handling for fshl where third argument is not a constant.
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
          {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
          {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
          {Intrinsic::fshl, MVT::v8i8, 2},  {Intrinsic::fshl, MVT::v4i16, 2}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
  case Intrinsic::get_active_lane_mask: {
    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
    if (RetTy) {
      EVT RetVT = getTLI()->getValueType(DL, RetTy);
      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
          !getTLI()->isTypeLegal(RetVT)) {
        // We don't have enough context at this point to determine if the mask
        // is going to be kept live after the block, which will force the vXi1
        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
        // For now, we just assume the vectorizer created this intrinsic and
        // the result will be the input for a PHI. In this case the cost will
        // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic for the actual generated codegen. In reality there are
        // two instructions generated per lane.
        return RetTy->getNumElements() * 2;
      }
    }
    break;
  }
  case Intrinsic::experimental_vector_match: {
    auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
    EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    unsigned SearchSize = NeedleTy->getNumElements();
    if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
      // Base cost for MATCH instructions. At least on the Neoverse V2 and
      // Neoverse V3, these are cheap operations with the same latency as a
      // vector ADD. In most cases, however, we also need to do an extra DUP.
      // For fixed-length vectors we currently need an extra five to six
      // instructions besides the MATCH.
      InstructionCost Cost = 4;
      if (isa<FixedVectorType>(RetTy))
        Cost += 10;
      return Cost;
    }
    break;
  }
  case Intrinsic::experimental_cttz_elts: {
    EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
      // This will consist of a SVE brkb and a cntp instruction. These
      // typically have the same latency and half the throughput as a vector
      // add instruction.
      return 4;
    }
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Remove redundant reinterpret casts (convert_to_svbool) feeding a phi node
/// in the presence of control flow.
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}

// A collection of properties common to SVE intrinsics that allow for combines
// to be written without needing to know the specific intrinsic.
struct SVEIntrinsicInfo {
  //
  // Helper routines for common intrinsic definitions.
  //

  // e.g. llvm.aarch64.sve.add pg, op1, op2
  //        with IID ==> llvm.aarch64.sve.add_u
  static SVEIntrinsicInfo
  defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setOperandIdxInactiveLanesTakenFrom(1)
        .setMatchingUndefIntrinsic(IID);
  }

  // e.g. llvm.aarch64.sve.neg inactive, pg, op
  static SVEIntrinsicInfo defaultMergingUnaryOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(1)
        .setOperandIdxInactiveLanesTakenFrom(0)
        .setOperandIdxWithNoActiveLanes(0);
  }

  // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
  static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(1)
        .setOperandIdxInactiveLanesTakenFrom(0);
  }

  // e.g. llvm.aarch64.sve.add_u pg, op1, op2
  static SVEIntrinsicInfo defaultUndefOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setInactiveLanesAreNotDefined();
  }

  // e.g. llvm.aarch64.sve.prf pg, ptr        (GPIndex = 0)
  //      llvm.aarch64.sve.st1 data, pg, ptr  (GPIndex = 1)
  static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(GPIndex)
        .setInactiveLanesAreUnused();
  }

  // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
  //      llvm.aarch64.sve.ld1 pg, ptr
  static SVEIntrinsicInfo defaultZeroingOp() {
    return SVEIntrinsicInfo()
        .setGoverningPredicateOperandIdx(0)
        .setInactiveLanesAreUnused()
        .setResultIsZeroInitialized();
  }

  // All properties relate to predication and thus having a general predicate
  // is the minimum requirement to say there is intrinsic info to act on.
  explicit operator bool() const { return hasGoverningPredicate(); }

  //
  // Properties relating to the governing predicate.
  //

  bool hasGoverningPredicate() const {
    return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
  }

  unsigned getGoverningPredicateOperandIdx() const {
    assert(hasGoverningPredicate() && "Property not set!");
    return GoverningPredicateIdx;
  }

  SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
    assert(!hasGoverningPredicate() && "Cannot set property twice!");
    GoverningPredicateIdx = Index;
    return *this;
  }

  //
  // Properties relating to operations the intrinsic could be transformed into.
  // NOTE: This does not mean such a transformation is always possible, but the
  // knowledge makes it possible to reuse existing optimisations without needing
  // to embed specific handling for each intrinsic. For example, instruction
  // simplification can be used to optimise an intrinsic's active lanes.
  //

  bool hasMatchingUndefIntrinsic() const {
    return UndefIntrinsic != Intrinsic::not_intrinsic;
  }

  Intrinsic::ID getMatchingUndefIntrinsic() const {
    assert(hasMatchingUndefIntrinsic() && "Property not set!");
    return UndefIntrinsic;
  }

  SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
    assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
    UndefIntrinsic = IID;
    return *this;
  }

  bool hasMatchingIROpode() const { return IROpcode != 0; }

  unsigned getMatchingIROpode() const {
    assert(hasMatchingIROpode() && "Property not set!");
    return IROpcode;
  }

  SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
    assert(!hasMatchingIROpode() && "Cannot set property twice!");
    IROpcode = Opcode;
    return *this;
  }

  //
  // Properties relating to the result of inactive lanes.
  //

  bool inactiveLanesTakenFromOperand() const {
    return ResultLanes == InactiveLanesTakenFromOperand;
  }

  unsigned getOperandIdxInactiveLanesTakenFrom() const {
    assert(inactiveLanesTakenFromOperand() && "Property not set!");
    return OperandIdxForInactiveLanes;
  }

  SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
    ResultLanes = InactiveLanesTakenFromOperand;
    OperandIdxForInactiveLanes = Index;
    return *this;
  }

  bool inactiveLanesAreNotDefined() const {
    return ResultLanes == InactiveLanesAreNotDefined;
  }

  SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
    ResultLanes = InactiveLanesAreNotDefined;
    return *this;
  }

1158 bool inactiveLanesAreUnused() const {
1159 return ResultLanes == InactiveLanesAreUnused;
1160 }
1161
1162 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1163 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1164 ResultLanes = InactiveLanesAreUnused;
1165 return *this;
1166 }
1167
1168 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1169 // inactiveLanesAreZerod =
1170 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1171 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1172
1173 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1174 ResultIsZeroInitialized = true;
1175 return *this;
1176 }
1177
1178 //
1179 // The first operand of unary merging operations is typically only used to
1180 // set the result for inactive lanes. Knowing this allows us to dead-code the
1181 // operand when we can prove there are no inactive lanes.
1182 //
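// For example (illustrative): for a merging unary operation of the form
// op_m(inactive, pg, a), the 'inactive' operand only supplies results for
// predicated-off lanes, so with an all-active predicate it can be replaced
// with undef. ('op_m' here is a placeholder, not a specific intrinsic.)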
1183
1184 bool hasOperandWithNoActiveLanes() const {
1185 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1186 }
1187
1188 unsigned getOperandIdxWithNoActiveLanes() const {
1189 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1190 return OperandIdxWithNoActiveLanes;
1191 }
1192
1193 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1194 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1195 OperandIdxWithNoActiveLanes = Index;
1196 return *this;
1197 }
1198
1199private:
1200 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1201
1202 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
Paul Walker19970732025-04-08 11:38:27 +01001203 unsigned IROpcode = 0;
Paul Walkerc1927372025-04-01 13:27:46 +01001204
1205 enum PredicationStyle {
1206 Uninitialized,
1207 InactiveLanesTakenFromOperand,
1208 InactiveLanesAreNotDefined,
1209 InactiveLanesAreUnused
1210 } ResultLanes = Uninitialized;
1211
1212 bool ResultIsZeroInitialized = false;
1213 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1214 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1215};
1216
1217static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1218 // Some SVE intrinsics do not use scalable vector types. Such intrinsics carry
1219 // no information relevant to SVEIntrinsicInfo and are therefore ignored here.
1220 if (!isa<ScalableVectorType>(II.getType()) &&
1221 all_of(II.args(), [&](const Value *V) {
1222 return !isa<ScalableVectorType>(V->getType());
1223 }))
1224 return SVEIntrinsicInfo();
1225
1226 Intrinsic::ID IID = II.getIntrinsicID();
1227 switch (IID) {
1228 default:
1229 break;
1230 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1231 case Intrinsic::aarch64_sve_fcvt_f16f32:
1232 case Intrinsic::aarch64_sve_fcvt_f16f64:
1233 case Intrinsic::aarch64_sve_fcvt_f32f16:
1234 case Intrinsic::aarch64_sve_fcvt_f32f64:
1235 case Intrinsic::aarch64_sve_fcvt_f64f16:
1236 case Intrinsic::aarch64_sve_fcvt_f64f32:
1237 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1238 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1239 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1240 case Intrinsic::aarch64_sve_fcvtzs:
1241 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1242 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1243 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1244 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1245 case Intrinsic::aarch64_sve_fcvtzu:
1246 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1247 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1248 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1249 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1250 case Intrinsic::aarch64_sve_scvtf:
1251 case Intrinsic::aarch64_sve_scvtf_f16i32:
1252 case Intrinsic::aarch64_sve_scvtf_f16i64:
1253 case Intrinsic::aarch64_sve_scvtf_f32i64:
1254 case Intrinsic::aarch64_sve_scvtf_f64i32:
1255 case Intrinsic::aarch64_sve_ucvtf:
1256 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1257 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1258 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1259 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1260 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1261
1262 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1263 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1264 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1265 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1266 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1267
1268 case Intrinsic::aarch64_sve_fabd:
1269 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1270 case Intrinsic::aarch64_sve_fadd:
Paul Walker96ec17d2025-04-25 11:30:03 +01001271 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1272 .setMatchingIROpcode(Instruction::FAdd);
Paul Walkerc1927372025-04-01 13:27:46 +01001273 case Intrinsic::aarch64_sve_fdiv:
Paul Walker96ec17d2025-04-25 11:30:03 +01001274 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1275 .setMatchingIROpcode(Instruction::FDiv);
Paul Walkerc1927372025-04-01 13:27:46 +01001276 case Intrinsic::aarch64_sve_fmax:
1277 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1278 case Intrinsic::aarch64_sve_fmaxnm:
1279 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1280 case Intrinsic::aarch64_sve_fmin:
1281 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1282 case Intrinsic::aarch64_sve_fminnm:
1283 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1284 case Intrinsic::aarch64_sve_fmla:
1285 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1286 case Intrinsic::aarch64_sve_fmls:
1287 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1288 case Intrinsic::aarch64_sve_fmul:
Paul Walker19970732025-04-08 11:38:27 +01001289 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1290 .setMatchingIROpcode(Instruction::FMul);
Paul Walkerc1927372025-04-01 13:27:46 +01001291 case Intrinsic::aarch64_sve_fmulx:
1292 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1293 case Intrinsic::aarch64_sve_fnmla:
1294 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1295 case Intrinsic::aarch64_sve_fnmls:
1296 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1297 case Intrinsic::aarch64_sve_fsub:
Paul Walker96ec17d2025-04-25 11:30:03 +01001298 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1299 .setMatchingIROpcode(Instruction::FSub);
Paul Walkerc1927372025-04-01 13:27:46 +01001300 case Intrinsic::aarch64_sve_add:
Paul Walker96ec17d2025-04-25 11:30:03 +01001301 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1302 .setMatchingIROpcode(Instruction::Add);
Paul Walkerc1927372025-04-01 13:27:46 +01001303 case Intrinsic::aarch64_sve_mla:
1304 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1305 case Intrinsic::aarch64_sve_mls:
1306 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1307 case Intrinsic::aarch64_sve_mul:
Paul Walker19970732025-04-08 11:38:27 +01001308 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1309 .setMatchingIROpcode(Instruction::Mul);
Paul Walkerc1927372025-04-01 13:27:46 +01001310 case Intrinsic::aarch64_sve_sabd:
1311 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
Paul Walker149d7952025-05-01 13:20:05 +01001312 case Intrinsic::aarch64_sve_sdiv:
1313 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1314 .setMatchingIROpcode(Instruction::SDiv);
Paul Walkerc1927372025-04-01 13:27:46 +01001315 case Intrinsic::aarch64_sve_smax:
1316 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1317 case Intrinsic::aarch64_sve_smin:
1318 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1319 case Intrinsic::aarch64_sve_smulh:
1320 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1321 case Intrinsic::aarch64_sve_sub:
Paul Walker96ec17d2025-04-25 11:30:03 +01001322 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1323 .setMatchingIROpcode(Instruction::Sub);
Paul Walkerc1927372025-04-01 13:27:46 +01001324 case Intrinsic::aarch64_sve_uabd:
1325 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
Paul Walker149d7952025-05-01 13:20:05 +01001326 case Intrinsic::aarch64_sve_udiv:
1327 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1328 .setMatchingIROpcode(Instruction::UDiv);
Paul Walkerc1927372025-04-01 13:27:46 +01001329 case Intrinsic::aarch64_sve_umax:
1330 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1331 case Intrinsic::aarch64_sve_umin:
1332 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1333 case Intrinsic::aarch64_sve_umulh:
1334 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1335 case Intrinsic::aarch64_sve_asr:
Paul Walker8dc89e32025-04-30 13:21:46 +01001336 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1337 .setMatchingIROpcode(Instruction::AShr);
Paul Walkerc1927372025-04-01 13:27:46 +01001338 case Intrinsic::aarch64_sve_lsl:
Paul Walker8dc89e32025-04-30 13:21:46 +01001339 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1340 .setMatchingIROpcode(Instruction::Shl);
Paul Walkerc1927372025-04-01 13:27:46 +01001341 case Intrinsic::aarch64_sve_lsr:
Paul Walker8dc89e32025-04-30 13:21:46 +01001342 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1343 .setMatchingIROpcode(Instruction::LShr);
Paul Walkerc1927372025-04-01 13:27:46 +01001344 case Intrinsic::aarch64_sve_and:
Paul Walker96ec17d2025-04-25 11:30:03 +01001345 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1346 .setMatchingIROpcode(Instruction::And);
Paul Walkerc1927372025-04-01 13:27:46 +01001347 case Intrinsic::aarch64_sve_bic:
1348 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1349 case Intrinsic::aarch64_sve_eor:
Paul Walker96ec17d2025-04-25 11:30:03 +01001350 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1351 .setMatchingIROpcode(Instruction::Xor);
Paul Walkerc1927372025-04-01 13:27:46 +01001352 case Intrinsic::aarch64_sve_orr:
Paul Walker96ec17d2025-04-25 11:30:03 +01001353 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1354 .setMatchingIROpcode(Instruction::Or);
Paul Walkerc1927372025-04-01 13:27:46 +01001355 case Intrinsic::aarch64_sve_sqsub:
1356 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1357 case Intrinsic::aarch64_sve_uqsub:
1358 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1359
Paul Walker96ec17d2025-04-25 11:30:03 +01001360 case Intrinsic::aarch64_sve_add_u:
1361 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1362 Instruction::Add);
1363 case Intrinsic::aarch64_sve_and_u:
1364 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1365 Instruction::And);
Paul Walker8dc89e32025-04-30 13:21:46 +01001366 case Intrinsic::aarch64_sve_asr_u:
1367 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1368 Instruction::AShr);
Paul Walker96ec17d2025-04-25 11:30:03 +01001369 case Intrinsic::aarch64_sve_eor_u:
1370 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1371 Instruction::Xor);
1372 case Intrinsic::aarch64_sve_fadd_u:
1373 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1374 Instruction::FAdd);
1375 case Intrinsic::aarch64_sve_fdiv_u:
1376 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1377 Instruction::FDiv);
Paul Walker19970732025-04-08 11:38:27 +01001378 case Intrinsic::aarch64_sve_fmul_u:
1379 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1380 Instruction::FMul);
Paul Walker96ec17d2025-04-25 11:30:03 +01001381 case Intrinsic::aarch64_sve_fsub_u:
1382 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1383 Instruction::FSub);
Paul Walker8dc89e32025-04-30 13:21:46 +01001384 case Intrinsic::aarch64_sve_lsl_u:
1385 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1386 Instruction::Shl);
1387 case Intrinsic::aarch64_sve_lsr_u:
1388 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1389 Instruction::LShr);
Paul Walker19970732025-04-08 11:38:27 +01001390 case Intrinsic::aarch64_sve_mul_u:
1391 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1392 Instruction::Mul);
Paul Walker96ec17d2025-04-25 11:30:03 +01001393 case Intrinsic::aarch64_sve_orr_u:
1394 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1395 Instruction::Or);
Paul Walker149d7952025-05-01 13:20:05 +01001396 case Intrinsic::aarch64_sve_sdiv_u:
1397 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1398 Instruction::SDiv);
Paul Walker96ec17d2025-04-25 11:30:03 +01001399 case Intrinsic::aarch64_sve_sub_u:
1400 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1401 Instruction::Sub);
Paul Walker149d7952025-05-01 13:20:05 +01001402 case Intrinsic::aarch64_sve_udiv_u:
1403 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1404 Instruction::UDiv);
Paul Walker19970732025-04-08 11:38:27 +01001405
Paul Walkerc1927372025-04-01 13:27:46 +01001406 case Intrinsic::aarch64_sve_addqv:
1407 case Intrinsic::aarch64_sve_and_z:
1408 case Intrinsic::aarch64_sve_bic_z:
1409 case Intrinsic::aarch64_sve_brka_z:
1410 case Intrinsic::aarch64_sve_brkb_z:
1411 case Intrinsic::aarch64_sve_brkn_z:
1412 case Intrinsic::aarch64_sve_brkpa_z:
1413 case Intrinsic::aarch64_sve_brkpb_z:
1414 case Intrinsic::aarch64_sve_cntp:
1415 case Intrinsic::aarch64_sve_compact:
1416 case Intrinsic::aarch64_sve_eor_z:
1417 case Intrinsic::aarch64_sve_eorv:
1418 case Intrinsic::aarch64_sve_eorqv:
1419 case Intrinsic::aarch64_sve_nand_z:
1420 case Intrinsic::aarch64_sve_nor_z:
1421 case Intrinsic::aarch64_sve_orn_z:
1422 case Intrinsic::aarch64_sve_orr_z:
1423 case Intrinsic::aarch64_sve_orv:
1424 case Intrinsic::aarch64_sve_orqv:
1425 case Intrinsic::aarch64_sve_pnext:
1426 case Intrinsic::aarch64_sve_rdffr_z:
1427 case Intrinsic::aarch64_sve_saddv:
1428 case Intrinsic::aarch64_sve_uaddv:
1429 case Intrinsic::aarch64_sve_umaxv:
1430 case Intrinsic::aarch64_sve_umaxqv:
1431 case Intrinsic::aarch64_sve_cmpeq:
1432 case Intrinsic::aarch64_sve_cmpeq_wide:
1433 case Intrinsic::aarch64_sve_cmpge:
1434 case Intrinsic::aarch64_sve_cmpge_wide:
1435 case Intrinsic::aarch64_sve_cmpgt:
1436 case Intrinsic::aarch64_sve_cmpgt_wide:
1437 case Intrinsic::aarch64_sve_cmphi:
1438 case Intrinsic::aarch64_sve_cmphi_wide:
1439 case Intrinsic::aarch64_sve_cmphs:
1440 case Intrinsic::aarch64_sve_cmphs_wide:
1441 case Intrinsic::aarch64_sve_cmple_wide:
1442 case Intrinsic::aarch64_sve_cmplo_wide:
1443 case Intrinsic::aarch64_sve_cmpls_wide:
1444 case Intrinsic::aarch64_sve_cmplt_wide:
1445 case Intrinsic::aarch64_sve_cmpne:
1446 case Intrinsic::aarch64_sve_cmpne_wide:
1447 case Intrinsic::aarch64_sve_facge:
1448 case Intrinsic::aarch64_sve_facgt:
1449 case Intrinsic::aarch64_sve_fcmpeq:
1450 case Intrinsic::aarch64_sve_fcmpge:
1451 case Intrinsic::aarch64_sve_fcmpgt:
1452 case Intrinsic::aarch64_sve_fcmpne:
1453 case Intrinsic::aarch64_sve_fcmpuo:
1454 case Intrinsic::aarch64_sve_ld1:
1455 case Intrinsic::aarch64_sve_ld1_gather:
1456 case Intrinsic::aarch64_sve_ld1_gather_index:
1457 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1458 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1459 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1460 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1461 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1462 case Intrinsic::aarch64_sve_ld1q_gather_index:
1463 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1464 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1465 case Intrinsic::aarch64_sve_ld1ro:
1466 case Intrinsic::aarch64_sve_ld1rq:
1467 case Intrinsic::aarch64_sve_ld1udq:
1468 case Intrinsic::aarch64_sve_ld1uwq:
1469 case Intrinsic::aarch64_sve_ld2_sret:
1470 case Intrinsic::aarch64_sve_ld2q_sret:
1471 case Intrinsic::aarch64_sve_ld3_sret:
1472 case Intrinsic::aarch64_sve_ld3q_sret:
1473 case Intrinsic::aarch64_sve_ld4_sret:
1474 case Intrinsic::aarch64_sve_ld4q_sret:
1475 case Intrinsic::aarch64_sve_ldff1:
1476 case Intrinsic::aarch64_sve_ldff1_gather:
1477 case Intrinsic::aarch64_sve_ldff1_gather_index:
1478 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1479 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1480 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1481 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1482 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1483 case Intrinsic::aarch64_sve_ldnf1:
1484 case Intrinsic::aarch64_sve_ldnt1:
1485 case Intrinsic::aarch64_sve_ldnt1_gather:
1486 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1487 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1488 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1489 return SVEIntrinsicInfo::defaultZeroingOp();
1490
1491 case Intrinsic::aarch64_sve_prf:
1492 case Intrinsic::aarch64_sve_prfb_gather_index:
1493 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1494 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1495 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1496 case Intrinsic::aarch64_sve_prfd_gather_index:
1497 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1498 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1499 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1500 case Intrinsic::aarch64_sve_prfh_gather_index:
1501 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1502 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1503 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1504 case Intrinsic::aarch64_sve_prfw_gather_index:
1505 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1506 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1507 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1508 return SVEIntrinsicInfo::defaultVoidOp(0);
1509
1510 case Intrinsic::aarch64_sve_st1_scatter:
1511 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1512 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1513 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1514 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1515 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1516 case Intrinsic::aarch64_sve_st1dq:
1517 case Intrinsic::aarch64_sve_st1q_scatter_index:
1518 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1519 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1520 case Intrinsic::aarch64_sve_st1wq:
1521 case Intrinsic::aarch64_sve_stnt1:
1522 case Intrinsic::aarch64_sve_stnt1_scatter:
1523 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1524 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1525 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1526 return SVEIntrinsicInfo::defaultVoidOp(1);
1527 case Intrinsic::aarch64_sve_st2:
1528 case Intrinsic::aarch64_sve_st2q:
1529 return SVEIntrinsicInfo::defaultVoidOp(2);
1530 case Intrinsic::aarch64_sve_st3:
1531 case Intrinsic::aarch64_sve_st3q:
1532 return SVEIntrinsicInfo::defaultVoidOp(3);
1533 case Intrinsic::aarch64_sve_st4:
1534 case Intrinsic::aarch64_sve_st4q:
1535 return SVEIntrinsicInfo::defaultVoidOp(4);
1536 }
1537
1538 return SVEIntrinsicInfo();
1539}
1540
1541static bool isAllActivePredicate(Value *Pred) {
1542 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1543 Value *UncastedPred;
1544 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1545 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1546 m_Value(UncastedPred)))))
1547 // If the predicate has the same or fewer lanes than the uncasted
1548 // predicate then we know the casting has no effect.
1549 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1550 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1551 Pred = UncastedPred;
Matthew Devereau91a20562025-04-13 20:40:51 +01001552 auto *C = dyn_cast<Constant>(Pred);
1553 return (C && C->isAllOnesValue());
Paul Walkerc1927372025-04-01 13:27:46 +01001554}
1555
Paul Walkera7999f32025-04-17 15:58:39 +01001556// Simplify `V` by only considering the operations that affect active lanes.
1557// This function should only return existing Values or newly created Constants.
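// For example (illustrative): sve.dup(inactive, pg, 2) only defines the lanes
// governed by pg, so when simplifying the active lanes of a user it can be
// treated as splat(2).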
1558static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1559 auto *Dup = dyn_cast<IntrinsicInst>(V);
1560 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1561 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1562 return ConstantVector::getSplat(
1563 cast<VectorType>(V->getType())->getElementCount(),
1564 cast<Constant>(Dup->getOperand(2)));
1565
1566 return V;
1567}
1568
1569static std::optional<Instruction *>
1570simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1571 const SVEIntrinsicInfo &IInfo) {
1572 const unsigned Opc = IInfo.getMatchingIROpode();
1573 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1574
1575 Value *Pg = II.getOperand(0);
1576 Value *Op1 = II.getOperand(1);
1577 Value *Op2 = II.getOperand(2);
1578 const DataLayout &DL = II.getDataLayout();
1579
1580 // Canonicalise constants to the RHS.
1581 if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
1582 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1583 IC.replaceOperand(II, 1, Op2);
1584 IC.replaceOperand(II, 2, Op1);
1585 return &II;
1586 }
1587
1588 // Only active lanes matter when simplifying the operation.
1589 Op1 = stripInactiveLanes(Op1, Pg);
1590 Op2 = stripInactiveLanes(Op2, Pg);
1591
1592 Value *SimpleII;
1593 if (auto FII = dyn_cast<FPMathOperator>(&II))
1594 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1595 else
1596 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1597
Paul Walker8dc89e32025-04-30 13:21:46 +01001598 // An SVE intrinsic's result is always defined. However, this is not the case
1599 // for its equivalent IR instruction (e.g. when shifting by at least the
1600 // data's bitwidth). Simplifications to an undefined result must be
1601 // ignored to preserve the intrinsic's expected behaviour.
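// For example (illustrative): as plain IR, lshr %x, 64 on i64 elements folds
// to poison, whereas the corresponding SVE shift produces zero, so the fold
// must be rejected here.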
1602 if (!SimpleII || isa<UndefValue>(SimpleII))
Paul Walkera7999f32025-04-17 15:58:39 +01001603 return std::nullopt;
1604
1605 if (IInfo.inactiveLanesAreNotDefined())
1606 return IC.replaceInstUsesWith(II, SimpleII);
1607
1608 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1609
1610 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1611 if (SimpleII == Inactive)
1612 return IC.replaceInstUsesWith(II, SimpleII);
1613
1614 // Inactive lanes must be preserved.
1615 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1616 return IC.replaceInstUsesWith(II, SimpleII);
1617}
1618
Paul Walkerc1927372025-04-01 13:27:46 +01001619// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1620// to operations with less strict inactive lane requirements.
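// For example (illustrative): with an all-active governing predicate,
// sve.fadd(pg, a, b) can be rewritten as sve.fadd.u(pg, a, b), whose inactive
// lanes are unconstrained.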
1621static std::optional<Instruction *>
1622simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1623 const SVEIntrinsicInfo &IInfo) {
1624 if (!IInfo.hasGoverningPredicate())
1625 return std::nullopt;
1626
1627 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1628
1629 // If there are no active lanes.
1630 if (match(OpPredicate, m_ZeroInt())) {
1631 if (IInfo.inactiveLanesTakenFromOperand())
1632 return IC.replaceInstUsesWith(
1633 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1634
1635 if (IInfo.inactiveLanesAreUnused()) {
1636 if (IInfo.resultIsZeroInitialized())
1637 IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1638
1639 return IC.eraseInstFromFunction(II);
1640 }
1641 }
1642
1643 // If there are no inactive lanes.
1644 if (isAllActivePredicate(OpPredicate)) {
1645 if (IInfo.hasOperandWithNoActiveLanes()) {
1646 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1647 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1648 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1649 }
1650
1651 if (IInfo.hasMatchingUndefIntrinsic()) {
1652 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1653 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1654 II.setCalledFunction(NewDecl);
1655 return &II;
1656 }
1657 }
1658
Paul Walkera7999f32025-04-17 15:58:39 +01001659 // Operation specific simplifications.
1660 if (IInfo.hasMatchingIROpode() &&
1661 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1662 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1663
Paul Walkerc1927372025-04-01 13:27:46 +01001664 return std::nullopt;
1665}
1666
Matt Devereaucee8b252022-01-05 13:42:01 +00001667// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1668// => (binop (pred) (from_svbool _) (from_svbool _))
1669//
1670// The above transformation eliminates a `to_svbool` in the predicate
1671// operand of bitwise operation `binop` by narrowing the vector width of
1672// the operation. For example, it would convert a `<vscale x 16 x i1>
1673// and` into a `<vscale x 4 x i1> and`. This is profitable because
1674// to_svbool must zero the new lanes during widening, whereas
1675// from_svbool is free.
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001676static std::optional<Instruction *>
1677tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
Matt Devereaucee8b252022-01-05 13:42:01 +00001678 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1679 if (!BinOp)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001680 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001681
1682 auto IntrinsicID = BinOp->getIntrinsicID();
1683 switch (IntrinsicID) {
1684 case Intrinsic::aarch64_sve_and_z:
1685 case Intrinsic::aarch64_sve_bic_z:
1686 case Intrinsic::aarch64_sve_eor_z:
1687 case Intrinsic::aarch64_sve_nand_z:
1688 case Intrinsic::aarch64_sve_nor_z:
1689 case Intrinsic::aarch64_sve_orn_z:
1690 case Intrinsic::aarch64_sve_orr_z:
1691 break;
1692 default:
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001693 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001694 }
1695
1696 auto BinOpPred = BinOp->getOperand(0);
1697 auto BinOpOp1 = BinOp->getOperand(1);
1698 auto BinOpOp2 = BinOp->getOperand(2);
1699
1700 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1701 if (!PredIntr ||
1702 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001703 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001704
1705 auto PredOp = PredIntr->getOperand(0);
1706 auto PredOpTy = cast<VectorType>(PredOp->getType());
1707 if (PredOpTy != II.getType())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001708 return std::nullopt;
Matt Devereaucee8b252022-01-05 13:42:01 +00001709
Matt Devereaucee8b252022-01-05 13:42:01 +00001710 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
Nikita Popov724f4a52023-05-16 18:11:17 +02001711 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
Matt Devereaucee8b252022-01-05 13:42:01 +00001712 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1713 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1714 if (BinOpOp1 == BinOpOp2)
1715 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1716 else
Nikita Popov724f4a52023-05-16 18:11:17 +02001717 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
Matt Devereaucee8b252022-01-05 13:42:01 +00001718 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1719
1720 auto NarrowedBinOp =
Nikita Popov724f4a52023-05-16 18:11:17 +02001721 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
Matt Devereaucee8b252022-01-05 13:42:01 +00001722 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1723}
1724
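// Remove redundant to_svbool/from_svbool round trips. For example
// (illustrative):
//   %w = convert.to.svbool(%p)    ; <vscale x 4 x i1> -> <vscale x 16 x i1>
//   %n = convert.from.svbool(%w)  ; back to <vscale x 4 x i1>
// can be replaced with %p, provided no step in the chain has fewer lanes than
// the final type (which would imply zeroing).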
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001725static std::optional<Instruction *>
1726instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
Bradley Smithc8f20ed2021-04-26 16:19:25 +01001727 // If the reinterpret instruction operand is a PHI Node
1728 if (isa<PHINode>(II.getArgOperand(0)))
1729 return processPhiNode(IC, II);
1730
Matt Devereaucee8b252022-01-05 13:42:01 +00001731 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1732 return BinOpCombine;
1733
Sander de Smalen11926e62023-05-22 13:52:18 +00001734 // Ignore converts to/from svcount_t.
1735 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1736 isa<TargetExtType>(II.getType()))
1737 return std::nullopt;
1738
Bradley Smithc8f20ed2021-04-26 16:19:25 +01001739 SmallVector<Instruction *, 32> CandidatesForRemoval;
1740 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1741
1742 const auto *IVTy = cast<VectorType>(II.getType());
1743
1744 // Walk the chain of conversions.
1745 while (Cursor) {
1746 // If the type of the cursor has fewer lanes than the final result, zeroing
1747 // must take place, which breaks the equivalence chain.
1748 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1749 if (CursorVTy->getElementCount().getKnownMinValue() <
1750 IVTy->getElementCount().getKnownMinValue())
1751 break;
1752
1753 // If the cursor has the same type as I, it is a viable replacement.
1754 if (Cursor->getType() == IVTy)
1755 EarliestReplacement = Cursor;
1756
1757 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1758
1759 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1760 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1761 Intrinsic::aarch64_sve_convert_to_svbool ||
1762 IntrinsicCursor->getIntrinsicID() ==
1763 Intrinsic::aarch64_sve_convert_from_svbool))
1764 break;
1765
1766 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1767 Cursor = IntrinsicCursor->getOperand(0);
1768 }
1769
1770 // If no viable replacement in the conversion chain was found, there is
1771 // nothing to do.
1772 if (!EarliestReplacement)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001773 return std::nullopt;
Bradley Smithc8f20ed2021-04-26 16:19:25 +01001774
1775 return IC.replaceInstUsesWith(II, EarliestReplacement);
1776}
1777
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001778static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1779 IntrinsicInst &II) {
zhongyunde 00443407bf90ffb2023-09-27 22:42:43 -04001780 // svsel(ptrue, x, y) => x
1781 auto *OpPredicate = II.getOperand(0);
1782 if (isAllActivePredicate(OpPredicate))
1783 return IC.replaceInstUsesWith(II, II.getOperand(1));
1784
1785 auto Select =
1786 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
Matt Devereaua9e08bc2022-03-16 11:41:14 +00001787 return IC.replaceInstUsesWith(II, Select);
1788}
1789
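// sve.dup writes only the lanes selected by its predicate. When the predicate
// is ptrue(vl1), just lane 0 is written, so the operation can be modelled as
// an insertelement. For example (illustrative):
//   sve.dup(%v, ptrue(vl1), %s) --> insertelement %v, %s, i64 0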
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001790static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1791 IntrinsicInst &II) {
Bradley Smith89085bc2021-04-23 13:55:42 +01001792 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1793 if (!Pg)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001794 return std::nullopt;
Bradley Smith89085bc2021-04-23 13:55:42 +01001795
1796 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001797 return std::nullopt;
Bradley Smith89085bc2021-04-23 13:55:42 +01001798
1799 const auto PTruePattern =
1800 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1801 if (PTruePattern != AArch64SVEPredPattern::vl1)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001802 return std::nullopt;
Bradley Smith89085bc2021-04-23 13:55:42 +01001803
1804 // The intrinsic is inserting into lane zero so use an insert instead.
1805 auto *IdxTy = Type::getInt64Ty(II.getContext());
1806 auto *Insert = InsertElementInst::Create(
1807 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
Jeremy Morse8e702732025-01-24 10:53:11 +00001808 Insert->insertBefore(II.getIterator());
Bradley Smith89085bc2021-04-23 13:55:42 +01001809 Insert->takeName(&II);
1810
1811 return IC.replaceInstUsesWith(II, Insert);
1812}
1813
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001814static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1815 IntrinsicInst &II) {
Usman Nadeemab111e92021-09-10 17:57:29 -07001816 // Replace DupX with a regular IR splat.
Usman Nadeemab111e92021-09-10 17:57:29 -07001817 auto *RetTy = cast<ScalableVectorType>(II.getType());
Nikita Popov724f4a52023-05-16 18:11:17 +02001818 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1819 II.getArgOperand(0));
Usman Nadeemab111e92021-09-10 17:57:29 -07001820 Splat->takeName(&II);
1821 return IC.replaceInstUsesWith(II, Splat);
1822}
1823
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001824static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1825 IntrinsicInst &II) {
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001826 LLVMContext &Ctx = II.getContext();
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001827
Matthew Devereau91a20562025-04-13 20:40:51 +01001828 if (!isAllActivePredicate(II.getArgOperand(0)))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001829 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001830
1831 // Check that we have a compare of zero..
Usman Nadeemab111e92021-09-10 17:57:29 -07001832 auto *SplatValue =
1833 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1834 if (!SplatValue || !SplatValue->isZero())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001835 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001836
1837 // ..against a dupq
1838 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1839 if (!DupQLane ||
1840 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001841 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001842
1843 // Where the dupq is a lane 0 replicate of a vector insert
cceerczw67a90932024-08-23 22:30:51 +08001844 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1845 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001846 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001847
1848 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
Bradley Smitha83aa332022-06-16 14:45:28 +00001849 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001850 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001851
1852 // Where the vector insert is a fixed constant vector insert into undef at
1853 // index zero
1854 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001855 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001856
1857 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001858 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001859
1860 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1861 if (!ConstVec)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001862 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001863
1864 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1865 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1866 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001867 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001868
1869 unsigned NumElts = VecTy->getNumElements();
1870 unsigned PredicateBits = 0;
1871
1872 // Expand intrinsic operands to a 16-bit byte level predicate
1873 for (unsigned I = 0; I < NumElts; ++I) {
1874 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1875 if (!Arg)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001876 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001877 if (!Arg->isZero())
1878 PredicateBits |= 1 << (I * (16 / NumElts));
1879 }
1880
1881 // If all bits are zero bail early with an empty predicate
1882 if (PredicateBits == 0) {
1883 auto *PFalse = Constant::getNullValue(II.getType());
1884 PFalse->takeName(&II);
1885 return IC.replaceInstUsesWith(II, PFalse);
1886 }
1887
1888 // Calculate largest predicate type used (where byte predicate is largest)
1889 unsigned Mask = 8;
1890 for (unsigned I = 0; I < 16; ++I)
1891 if ((PredicateBits & (1 << I)) != 0)
1892 Mask |= (I % 8);
1893
1894 unsigned PredSize = Mask & -Mask;
1895 auto *PredType = ScalableVectorType::get(
1896 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1897
1898 // Ensure all relevant bits are set
1899 for (unsigned I = 0; I < 16; I += PredSize)
1900 if ((PredicateBits & (1 << I)) == 0)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001901 return std::nullopt;
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001902
1903 auto *PTruePat =
1904 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
Nikita Popov724f4a52023-05-16 18:11:17 +02001905 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1906 {PredType}, {PTruePat});
1907 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001908 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1909 auto *ConvertFromSVBool =
Nikita Popov724f4a52023-05-16 18:11:17 +02001910 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1911 {II.getType()}, {ConvertToSVBool});
Bradley Smith60c9b5f2021-05-20 11:13:34 +01001912
1913 ConvertFromSVBool->takeName(&II);
1914 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1915}
1916
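// lasta/lastb extract an element relative to the last active predicate lane.
// With a ptrue(vlN) predicate that lane is known at compile time, e.g.
// (illustrative): lastb(ptrue(vl4), %v) reads lane 3 and lasta(ptrue(vl4), %v)
// reads lane 4, so both can be lowered to extractelement when the index is
// within the known minimum vector length.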
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001917static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1918 IntrinsicInst &II) {
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001919 Value *Pg = II.getArgOperand(0);
1920 Value *Vec = II.getArgOperand(1);
Usman Nadeem85bbc052021-07-27 21:02:32 -07001921 auto IntrinsicID = II.getIntrinsicID();
1922 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001923
Sander de Smaleneb1a5122021-07-19 10:48:42 +01001924 // lastX(splat(X)) --> X
1925 if (auto *SplatVal = getSplatValue(Vec))
1926 return IC.replaceInstUsesWith(II, SplatVal);
1927
Usman Nadeem85bbc052021-07-27 21:02:32 -07001928 // If x and/or y is a splat value then:
1929 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1930 Value *LHS, *RHS;
1931 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1932 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1933 auto *OldBinOp = cast<BinaryOperator>(Vec);
1934 auto OpC = OldBinOp->getOpcode();
1935 auto *NewLHS =
Nikita Popov724f4a52023-05-16 18:11:17 +02001936 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
Usman Nadeem85bbc052021-07-27 21:02:32 -07001937 auto *NewRHS =
Nikita Popov724f4a52023-05-16 18:11:17 +02001938 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
Usman Nadeem85bbc052021-07-27 21:02:32 -07001939 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
Jeremy Morseb9d83ef2024-03-19 16:36:29 +00001940 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
Usman Nadeem85bbc052021-07-27 21:02:32 -07001941 return IC.replaceInstUsesWith(II, NewBinOp);
1942 }
1943 }
1944
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001945 auto *C = dyn_cast<Constant>(Pg);
1946 if (IsAfter && C && C->isNullValue()) {
1947 // The intrinsic is extracting lane 0 so use an extract instead.
1948 auto *IdxTy = Type::getInt64Ty(II.getContext());
1949 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
Jeremy Morse8e702732025-01-24 10:53:11 +00001950 Extract->insertBefore(II.getIterator());
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001951 Extract->takeName(&II);
1952 return IC.replaceInstUsesWith(II, Extract);
1953 }
1954
1955 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1956 if (!IntrPG)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001957 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001958
1959 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001960 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001961
1962 const auto PTruePattern =
1963 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1964
1965 // Can the intrinsic's predicate be converted to a known constant index?
Jun Ma8c471032021-08-25 17:25:39 +08001966 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1967 if (!MinNumElts)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001968 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001969
Jun Ma8c471032021-08-25 17:25:39 +08001970 unsigned Idx = MinNumElts - 1;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001971 // Increment the index if extracting the element after the last active
1972 // predicate element.
1973 if (IsAfter)
1974 ++Idx;
1975
1976 // Ignore extracts whose index is larger than the known minimum vector
1977 // length. NOTE: This is an artificial constraint where we prefer to
1978 // maintain what the user asked for until an alternative is proven faster.
1979 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1980 if (Idx >= PgVTy->getMinNumElements())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001981 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001982
1983 // The intrinsic is extracting a fixed lane so use an extract instead.
1984 auto *IdxTy = Type::getInt64Ty(II.getContext());
1985 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
Jeremy Morse8e702732025-01-24 10:53:11 +00001986 Extract->insertBefore(II.getIterator());
Joe Ellisc91cd4f2021-04-16 10:05:05 +00001987 Extract->takeName(&II);
1988 return IC.replaceInstUsesWith(II, Extract);
1989}
1990
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08001991static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1992 IntrinsicInst &II) {
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00001993 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1994 // integer variant across a variety of micro-architectures. Replace scalar
1995 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1996 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1997 // depending on the micro-architecture, but has been observed as generally
1998 // being faster, particularly when the CLAST[AB] op is a loop-carried
1999 // dependency.
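// For example (illustrative), for 32-bit integer elements:
//   %r = clasta(%pg, i32 %fallback, <vscale x 4 x i32> %v)
// becomes
//   %f = clasta(%pg, bitcast %fallback to float, bitcast %v to <vscale x 4 x float>)
//   %r = bitcast %f to i32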
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002000 Value *Pg = II.getArgOperand(0);
2001 Value *Fallback = II.getArgOperand(1);
2002 Value *Vec = II.getArgOperand(2);
2003 Type *Ty = II.getType();
2004
2005 if (!Ty->isIntegerTy())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002006 return std::nullopt;
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002007
2008 Type *FPTy;
2009 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2010 default:
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002011 return std::nullopt;
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002012 case 16:
Nikita Popov724f4a52023-05-16 18:11:17 +02002013 FPTy = IC.Builder.getHalfTy();
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002014 break;
2015 case 32:
Nikita Popov724f4a52023-05-16 18:11:17 +02002016 FPTy = IC.Builder.getFloatTy();
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002017 break;
2018 case 64:
Nikita Popov724f4a52023-05-16 18:11:17 +02002019 FPTy = IC.Builder.getDoubleTy();
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002020 break;
2021 }
2022
Nikita Popov724f4a52023-05-16 18:11:17 +02002023 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002024 auto *FPVTy = VectorType::get(
2025 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
Nikita Popov724f4a52023-05-16 18:11:17 +02002026 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2027 auto *FPII = IC.Builder.CreateIntrinsic(
2028 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2029 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002030 return IC.replaceInstUsesWith(II, FPIItoInt);
2031}
2032
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002033static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2034 IntrinsicInst &II) {
Peter Waller2d574a12021-05-12 14:47:22 +00002035 LLVMContext &Ctx = II.getContext();
Peter Waller2d574a12021-05-12 14:47:22 +00002036 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2037 // can work with RDFFR_PP for ptest elimination.
2038 auto *AllPat =
2039 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
Nikita Popov724f4a52023-05-16 18:11:17 +02002040 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2041 {II.getType()}, {AllPat});
Peter Waller2d574a12021-05-12 14:47:22 +00002042 auto *RDFFR =
Rahul Joshi74b7abf2025-03-31 08:10:34 -07002043 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
Peter Waller2d574a12021-05-12 14:47:22 +00002044 RDFFR->takeName(&II);
2045 return IC.replaceInstUsesWith(II, RDFFR);
2046}
2047
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002048static std::optional<Instruction *>
Jun Maae5433942021-06-18 11:55:01 +08002049instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2050 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2051
2052 if (Pattern == AArch64SVEPredPattern::all) {
Jun Maae5433942021-06-18 11:55:01 +08002053 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
Nikita Popov724f4a52023-05-16 18:11:17 +02002054 auto *VScale = IC.Builder.CreateVScale(StepVal);
Jun Maae5433942021-06-18 11:55:01 +08002055 VScale->takeName(&II);
2056 return IC.replaceInstUsesWith(II, VScale);
2057 }
2058
Jun Ma8c471032021-08-25 17:25:39 +08002059 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
Jun Maae5433942021-06-18 11:55:01 +08002060
Jun Ma8c471032021-08-25 17:25:39 +08002061 return MinNumElts && NumElts >= MinNumElts
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002062 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
Jun Maae5433942021-06-18 11:55:01 +08002063 II, ConstantInt::get(II.getType(), MinNumElts)))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002064 : std::nullopt;
Jun Maae5433942021-06-18 11:55:01 +08002065}
2066
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002067static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2068 IntrinsicInst &II) {
Bradley Smithdaf1a1f2022-11-11 15:24:57 +00002069 Value *PgVal = II.getArgOperand(0);
2070 Value *OpVal = II.getArgOperand(1);
2071
Bradley Smithdaf1a1f2022-11-11 15:24:57 +00002072 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2073 // Later optimizations prefer this form.
2074 if (PgVal == OpVal &&
2075 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2076 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2077 Value *Ops[] = {PgVal, OpVal};
2078 Type *Tys[] = {PgVal->getType()};
2079
2080 auto *PTest =
Nikita Popov724f4a52023-05-16 18:11:17 +02002081 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
Bradley Smithdaf1a1f2022-11-11 15:24:57 +00002082 PTest->takeName(&II);
2083
2084 return IC.replaceInstUsesWith(II, PTest);
2085 }
2086
2087 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
2088 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002089
Cullen Rhodes50621162022-11-04 08:40:18 +00002090 if (!Pg || !Op)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002091 return std::nullopt;
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002092
Cullen Rhodes50621162022-11-04 08:40:18 +00002093 Intrinsic::ID OpIID = Op->getIntrinsicID();
2094
Cullen Rhodes50621162022-11-04 08:40:18 +00002095 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2096 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2097 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2098 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2099 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
Bradley Smith191f9fa2021-07-13 14:42:36 +00002100
Nikita Popov724f4a52023-05-16 18:11:17 +02002101 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002102
2103 PTest->takeName(&II);
2104 return IC.replaceInstUsesWith(II, PTest);
2105 }
2106
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002107 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2108 // Later optimizations may rewrite sequence to use the flag-setting variant
2109 // of instruction X to remove PTEST.
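// For example (illustrative):
//   %x = brka.z(%pg, %a)
//   %t = ptest.any(%x, %x)
// becomes ptest.any(%pg, %x), letting later passes select the flag-setting
// BRKAS form and drop the PTEST.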
Cullen Rhodes50621162022-11-04 08:40:18 +00002110 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2111 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2112 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2113 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2114 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2115 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2116 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2117 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2118 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2119 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2120 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2121 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2122 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2123 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2124 Type *Tys[] = {Pg->getType()};
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002125
Nikita Popov724f4a52023-05-16 18:11:17 +02002126 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
Cullen Rhodes388cacb2022-10-12 08:36:03 +00002127 PTest->takeName(&II);
2128
2129 return IC.replaceInstUsesWith(II, PTest);
2130 }
2131
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002132 return std::nullopt;
Bradley Smith191f9fa2021-07-13 14:42:36 +00002133}
2134
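// Fold a predicated multiply that feeds a predicated add/sub into a single
// fused multiply-accumulate intrinsic. For example (illustrative):
//   sve.fadd(%pg, %a, sve.fmul(%pg, %b, %c)) --> sve.fmla(%pg, %a, %b, %c)
// provided the multiply has no other uses and, for floating point, the
// fast-math flags of both operations match and allow contraction.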
Matt Devereaua107cf02022-12-15 16:09:13 +00002135template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002136static std::optional<Instruction *>
Matt Devereaua107cf02022-12-15 16:09:13 +00002137instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2138 bool MergeIntoAddendOp) {
Matt4a596942021-11-03 11:31:41 +00002139 Value *P = II.getOperand(0);
Matt Devereaua107cf02022-12-15 16:09:13 +00002140 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2141 if (MergeIntoAddendOp) {
2142 AddendOp = II.getOperand(1);
2143 Mul = II.getOperand(2);
2144 } else {
2145 AddendOp = II.getOperand(2);
2146 Mul = II.getOperand(1);
2147 }
2148
2149 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2150 m_Value(MulOp1))))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002151 return std::nullopt;
Matt4a596942021-11-03 11:31:41 +00002152
Matt Devereaua107cf02022-12-15 16:09:13 +00002153 if (!Mul->hasOneUse())
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002154 return std::nullopt;
Matt4a596942021-11-03 11:31:41 +00002155
Matt Devereaua107cf02022-12-15 16:09:13 +00002156 Instruction *FMFSource = nullptr;
2157 if (II.getType()->isFPOrFPVectorTy()) {
2158 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2159 // Stop the combine when the flags on the inputs differ in case dropping
2160 // flags would lead to us missing out on more beneficial optimizations.
2161 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2162 return std::nullopt;
2163 if (!FAddFlags.allowContract())
2164 return std::nullopt;
2165 FMFSource = &II;
2166 }
Matt4a596942021-11-03 11:31:41 +00002167
Matt Devereaua107cf02022-12-15 16:09:13 +00002168 CallInst *Res;
2169 if (MergeIntoAddendOp)
Nikita Popov724f4a52023-05-16 18:11:17 +02002170 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2171 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
Matt Devereaua107cf02022-12-15 16:09:13 +00002172 else
Nikita Popov724f4a52023-05-16 18:11:17 +02002173 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2174 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
Matt Devereaua107cf02022-12-15 16:09:13 +00002175
2176 return IC.replaceInstUsesWith(II, Res);
Matt4a596942021-11-03 11:31:41 +00002177}
2178
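// Lower sve.ld1(%pg, %ptr) to a plain load when %pg is known to be all active,
// and to a generic masked load with a zero passthru otherwise, preserving
// ld1's zeroing of inactive lanes. For example (illustrative):
//   sve.ld1(ptrue(all), %ptr) --> load <vscale x 4 x i32>, ptr %ptr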
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002179static std::optional<Instruction *>
Matt Devereauf526c602021-11-04 16:10:55 +00002180instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Matt Devereauf526c602021-11-04 16:10:55 +00002181 Value *Pred = II.getOperand(0);
2182 Value *PtrOp = II.getOperand(1);
2183 Type *VecTy = II.getType();
Matt Devereauf526c602021-11-04 16:10:55 +00002184
Paul Walker01bc67e2021-12-03 14:36:54 +00002185 if (isAllActivePredicate(Pred)) {
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002186 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
Sander de Smalen0b412382022-02-11 07:53:20 +00002187 Load->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002188 return IC.replaceInstUsesWith(II, Load);
2189 }
2190
2191 CallInst *MaskedLoad =
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002192 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
Nikita Popov724f4a52023-05-16 18:11:17 +02002193 Pred, ConstantAggregateZero::get(VecTy));
Sander de Smalen0b412382022-02-11 07:53:20 +00002194 MaskedLoad->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002195 return IC.replaceInstUsesWith(II, MaskedLoad);
2196}
2197
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002198static std::optional<Instruction *>
Matt Devereauf526c602021-11-04 16:10:55 +00002199instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Matt Devereauf526c602021-11-04 16:10:55 +00002200 Value *VecOp = II.getOperand(0);
2201 Value *Pred = II.getOperand(1);
2202 Value *PtrOp = II.getOperand(2);
Matt Devereauf526c602021-11-04 16:10:55 +00002203
Paul Walker01bc67e2021-12-03 14:36:54 +00002204 if (isAllActivePredicate(Pred)) {
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002205 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
Sander de Smalen0b412382022-02-11 07:53:20 +00002206 Store->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002207 return IC.eraseInstFromFunction(II);
2208 }
2209
Nikita Popov724f4a52023-05-16 18:11:17 +02002210 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
Youngsuk Kimf69b9b72023-07-08 13:05:58 -04002211 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
Sander de Smalen0b412382022-02-11 07:53:20 +00002212 MaskedStore->copyMetadata(II);
Matt Devereauf526c602021-11-04 16:10:55 +00002213 return IC.eraseInstFromFunction(II);
2214}
2215
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002216static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2217 switch (Intrinsic) {
Jolanta Jensendc63b352023-05-17 09:21:40 +00002218 case Intrinsic::aarch64_sve_fmul_u:
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002219 return Instruction::BinaryOps::FMul;
Jolanta Jensendc63b352023-05-17 09:21:40 +00002220 case Intrinsic::aarch64_sve_fadd_u:
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002221 return Instruction::BinaryOps::FAdd;
Jolanta Jensendc63b352023-05-17 09:21:40 +00002222 case Intrinsic::aarch64_sve_fsub_u:
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002223 return Instruction::BinaryOps::FSub;
2224 default:
2225 return Instruction::BinaryOpsEnd;
2226 }
2227}
2228
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002229static std::optional<Instruction *>
2230instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
Paul Walker65031c12023-04-04 12:51:25 +00002231 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2232 if (II.isStrictFP())
2233 return std::nullopt;
2234
Matthew Devereau2ac19992021-10-04 16:56:56 +01002235 auto *OpPredicate = II.getOperand(0);
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002236 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2237 if (BinOpCode == Instruction::BinaryOpsEnd ||
Matthew Devereau91a20562025-04-13 20:40:51 +01002238 !isAllActivePredicate(OpPredicate))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002239 return std::nullopt;
Yingwei Zhenga77346b2025-01-06 14:37:04 +08002240 auto BinOp = IC.Builder.CreateBinOpFMF(
2241 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
Matthew Devereau2ac19992021-10-04 16:56:56 +01002242 return IC.replaceInstUsesWith(II, BinOp);
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002243}
2244
Matt Devereaua107cf02022-12-15 16:09:13 +00002245static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2246 IntrinsicInst &II) {
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002247 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2248 Intrinsic::aarch64_sve_mla>(
2249 IC, II, true))
2250 return MLA;
2251 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2252 Intrinsic::aarch64_sve_mad>(
2253 IC, II, false))
2254 return MAD;
2255 return std::nullopt;
2256}
2257
2258static std::optional<Instruction *>
2259instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
Matt Devereaua107cf02022-12-15 16:09:13 +00002260 if (auto FMLA =
2261 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2262 Intrinsic::aarch64_sve_fmla>(IC, II,
2263 true))
Matt4a596942021-11-03 11:31:41 +00002264 return FMLA;
Matt Devereaua107cf02022-12-15 16:09:13 +00002265 if (auto FMAD =
2266 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2267 Intrinsic::aarch64_sve_fmad>(IC, II,
2268 false))
2269 return FMAD;
Paul Walkerb7287a82023-06-17 17:51:49 +01002270 if (auto FMLA =
Jolanta Jensendc63b352023-05-17 09:21:40 +00002271 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Paul Walkerb7287a82023-06-17 17:51:49 +01002272 Intrinsic::aarch64_sve_fmla>(IC, II,
2273 true))
2274 return FMLA;
Jolanta Jensen5cd16e22023-06-20 12:51:41 +00002275 return std::nullopt;
Matt Devereaua107cf02022-12-15 16:09:13 +00002276}
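// The fusions above have roughly this shape (operand order is illustrative,
// not normative):
//   sve.fadd(pg, A, sve.fmul(pg, B, C))  =>  sve.fmla(pg, A, B, C)
// with sve.fmad tried as the alternative destructive form, and the fmul.u
// variant also accepted as the multiply.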
2277
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002278static std::optional<Instruction *>
2279instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2280 if (auto FMLA =
2281 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2282 Intrinsic::aarch64_sve_fmla>(IC, II,
2283 true))
2284 return FMLA;
2285 if (auto FMAD =
2286 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2287 Intrinsic::aarch64_sve_fmad>(IC, II,
2288 false))
2289 return FMAD;
2290 if (auto FMLA_U =
2291 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2292 Intrinsic::aarch64_sve_fmla_u>(
2293 IC, II, true))
2294 return FMLA_U;
2295 return instCombineSVEVectorBinOp(IC, II);
2296}
2297
2298static std::optional<Instruction *>
2299instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
Matt Devereaua107cf02022-12-15 16:09:13 +00002300 if (auto FMLS =
2301 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2302 Intrinsic::aarch64_sve_fmls>(IC, II,
2303 true))
2304 return FMLS;
Matt Devereaua107cf02022-12-15 16:09:13 +00002305 if (auto FMSB =
2306 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2307 Intrinsic::aarch64_sve_fnmsb>(
2308 IC, II, false))
2309 return FMSB;
Paul Walkerb7287a82023-06-17 17:51:49 +01002310 if (auto FMLS =
Jolanta Jensendc63b352023-05-17 09:21:40 +00002311 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Paul Walkerb7287a82023-06-17 17:51:49 +01002312 Intrinsic::aarch64_sve_fmls>(IC, II,
2313 true))
2314 return FMLS;
Jolanta Jensen5cd16e22023-06-20 12:51:41 +00002315 return std::nullopt;
Matt4a596942021-11-03 11:31:41 +00002316}
2317
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002318static std::optional<Instruction *>
2319instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2320 if (auto FMLS =
2321 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2322 Intrinsic::aarch64_sve_fmls>(IC, II,
2323 true))
2324 return FMLS;
2325 if (auto FMSB =
2326 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2327 Intrinsic::aarch64_sve_fnmsb>(
2328 IC, II, false))
2329 return FMSB;
2330 if (auto FMLS_U =
2331 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2332 Intrinsic::aarch64_sve_fmls_u>(
2333 IC, II, true))
2334 return FMLS_U;
2335 return instCombineSVEVectorBinOp(IC, II);
2336}
2337
2338static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2339 IntrinsicInst &II) {
2340 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2341 Intrinsic::aarch64_sve_mls>(
2342 IC, II, true))
2343 return MLS;
2344 return std::nullopt;
2345}
2346
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002347static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2348 IntrinsicInst &II) {
Usman Nadeem5420fc42021-08-05 17:23:01 -07002349 Value *UnpackArg = II.getArgOperand(0);
2350 auto *RetTy = cast<ScalableVectorType>(II.getType());
2351 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2352 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2353
2354 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2355 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2356 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2357 ScalarArg =
Nikita Popov724f4a52023-05-16 18:11:17 +02002358 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
Usman Nadeem5420fc42021-08-05 17:23:01 -07002359 Value *NewVal =
Nikita Popov724f4a52023-05-16 18:11:17 +02002360 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
Usman Nadeem5420fc42021-08-05 17:23:01 -07002361 NewVal->takeName(&II);
2362 return IC.replaceInstUsesWith(II, NewVal);
2363 }
2364
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002365 return std::nullopt;
Usman Nadeem5420fc42021-08-05 17:23:01 -07002366}
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002367static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2368 IntrinsicInst &II) {
Bradley Smith191f9fa2021-07-13 14:42:36 +00002369 auto *OpVal = II.getOperand(0);
2370 auto *OpIndices = II.getOperand(1);
2371 VectorType *VTy = cast<VectorType>(II.getType());
2372
Usman Nadeemab111e92021-09-10 17:57:29 -07002373 // Check whether OpIndices is a constant splat value < minimal element count
2374 // of result.
2375 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
Bradley Smith191f9fa2021-07-13 14:42:36 +00002376 if (!SplatValue ||
2377 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002378 return std::nullopt;
Bradley Smith191f9fa2021-07-13 14:42:36 +00002379
2380 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2381 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
Nikita Popov724f4a52023-05-16 18:11:17 +02002382 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002383 auto *VectorSplat =
Nikita Popov724f4a52023-05-16 18:11:17 +02002384 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002385
2386 VectorSplat->takeName(&II);
2387 return IC.replaceInstUsesWith(II, VectorSplat);
2388}
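// For example, with a splatted constant index that is known to be in range:
//   sve.tbl(V, sve.dup.x(2))  =>  splat(extractelement(V, 2))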
2389
Usman Nadeem267d6b52024-02-15 10:40:09 -08002390static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2391 IntrinsicInst &II) {
2392 Value *A, *B;
2393 Type *RetTy = II.getType();
2394 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2395 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2396
2397 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2398 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2399 if ((match(II.getArgOperand(0),
2400 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2401 match(II.getArgOperand(1),
2402 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2403 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2404 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2405 auto *TyA = cast<ScalableVectorType>(A->getType());
2406 if (TyA == B->getType() &&
2407 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2408 auto *SubVec = IC.Builder.CreateInsertVector(
Craig Topper123758b2025-05-02 16:10:18 -07002409 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2410 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2411 TyA->getMinNumElements());
Usman Nadeem267d6b52024-02-15 10:40:09 -08002412 ConcatVec->takeName(&II);
2413 return IC.replaceInstUsesWith(II, ConcatVec);
2414 }
2415 }
2416
2417 return std::nullopt;
2418}
2419
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002420static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2421 IntrinsicInst &II) {
Usman Nadeem757384a2021-09-12 15:53:26 -07002422 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2423 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2424 Value *A, *B;
2425 if (match(II.getArgOperand(0),
2426 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2427 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2428 m_Specific(A), m_Specific(B))))
2429 return IC.replaceInstUsesWith(
2430 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2431
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002432 return std::nullopt;
Usman Nadeem757384a2021-09-12 15:53:26 -07002433}
2434
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002435static std::optional<Instruction *>
2436instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
Peter Waller7a341452021-11-03 13:40:22 +00002437 Value *Mask = II.getOperand(0);
2438 Value *BasePtr = II.getOperand(1);
2439 Value *Index = II.getOperand(2);
2440 Type *Ty = II.getType();
Peter Waller7a341452021-11-03 13:40:22 +00002441 Value *PassThru = ConstantAggregateZero::get(Ty);
2442
2443 // Contiguous gather => masked load.
2444 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2445 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2446 Value *IndexBase;
2447 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2448 m_Value(IndexBase), m_SpecificInt(1)))) {
Peter Waller7a341452021-11-03 13:40:22 +00002449 Align Alignment =
Nikita Popov2d209d92024-06-27 16:38:15 +02002450 BasePtr->getPointerAlignment(II.getDataLayout());
Peter Waller7a341452021-11-03 13:40:22 +00002451
Nikita Popov724f4a52023-05-16 18:11:17 +02002452 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2453 BasePtr, IndexBase);
Peter Waller7a341452021-11-03 13:40:22 +00002454 CallInst *MaskedLoad =
Nikita Popov724f4a52023-05-16 18:11:17 +02002455 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
Peter Waller7a341452021-11-03 13:40:22 +00002456 MaskedLoad->takeName(&II);
2457 return IC.replaceInstUsesWith(II, MaskedLoad);
2458 }
2459
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002460 return std::nullopt;
Peter Waller7a341452021-11-03 13:40:22 +00002461}
2462
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002463static std::optional<Instruction *>
2464instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
Peter Waller7a341452021-11-03 13:40:22 +00002465 Value *Val = II.getOperand(0);
2466 Value *Mask = II.getOperand(1);
2467 Value *BasePtr = II.getOperand(2);
2468 Value *Index = II.getOperand(3);
2469 Type *Ty = Val->getType();
Peter Waller7a341452021-11-03 13:40:22 +00002470
2471 // Contiguous scatter => masked store.
Nikita Popov3196ef82022-02-08 15:16:16 +01002472 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
Peter Waller7a341452021-11-03 13:40:22 +00002473 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2474 Value *IndexBase;
2475 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2476 m_Value(IndexBase), m_SpecificInt(1)))) {
Peter Waller7a341452021-11-03 13:40:22 +00002477 Align Alignment =
Nikita Popov2d209d92024-06-27 16:38:15 +02002478 BasePtr->getPointerAlignment(II.getDataLayout());
Peter Waller7a341452021-11-03 13:40:22 +00002479
Nikita Popov724f4a52023-05-16 18:11:17 +02002480 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2481 BasePtr, IndexBase);
Nikita Popov724f4a52023-05-16 18:11:17 +02002482 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
Peter Waller7a341452021-11-03 13:40:22 +00002483
2484 return IC.eraseInstFromFunction(II);
2485 }
2486
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002487 return std::nullopt;
Peter Waller7a341452021-11-03 13:40:22 +00002488}
2489
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002490static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2491 IntrinsicInst &II) {
Nikita Popov724f4a52023-05-16 18:11:17 +02002492 Type *Int32Ty = IC.Builder.getInt32Ty();
Matt Devereaufb477252021-12-09 15:32:35 +00002493 Value *Pred = II.getOperand(0);
2494 Value *Vec = II.getOperand(1);
2495 Value *DivVec = II.getOperand(2);
2496
2497 Value *SplatValue = getSplatValue(DivVec);
2498 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2499 if (!SplatConstantInt)
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002500 return std::nullopt;
Matthew Devereau1808fc12024-09-20 13:53:02 +01002501
Matt Devereaufb477252021-12-09 15:32:35 +00002502 APInt Divisor = SplatConstantInt->getValue();
Matthew Devereau1808fc12024-09-20 13:53:02 +01002503 const int64_t DivisorValue = Divisor.getSExtValue();
2504 if (DivisorValue == -1)
2505 return std::nullopt;
2506 if (DivisorValue == 1)
2507 IC.replaceInstUsesWith(II, Vec);
Matt Devereaufb477252021-12-09 15:32:35 +00002508
2509 if (Divisor.isPowerOf2()) {
2510 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
Nikita Popov724f4a52023-05-16 18:11:17 +02002511 auto ASRD = IC.Builder.CreateIntrinsic(
Matt Devereaufb477252021-12-09 15:32:35 +00002512 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2513 return IC.replaceInstUsesWith(II, ASRD);
2514 }
2515 if (Divisor.isNegatedPowerOf2()) {
2516 Divisor.negate();
2517 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
Nikita Popov724f4a52023-05-16 18:11:17 +02002518 auto ASRD = IC.Builder.CreateIntrinsic(
Matt Devereaufb477252021-12-09 15:32:35 +00002519 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
Nikita Popov724f4a52023-05-16 18:11:17 +02002520 auto NEG = IC.Builder.CreateIntrinsic(
2521 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
Matt Devereaufb477252021-12-09 15:32:35 +00002522 return IC.replaceInstUsesWith(II, NEG);
2523 }
2524
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002525 return std::nullopt;
Matt Devereaufb477252021-12-09 15:32:35 +00002526}
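// For example, for splatted divisors that are (possibly negated) powers of two
// this becomes an arithmetic-shift-right-for-divide, roughly:
//   sve.sdiv(pg, X, splat(8))   =>  sve.asrd(pg, X, 3)
//   sve.sdiv(pg, X, splat(-8))  =>  sve.neg(pg, sve.asrd(pg, X, 3))
// A divisor of 1 folds to X, and a divisor of -1 is left alone.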
2527
Matt Devereau48df06f2023-01-16 14:21:18 +00002528bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
Matt Devereaue18b9712022-12-16 11:19:28 +00002529 size_t VecSize = Vec.size();
2530 if (VecSize == 1)
2531 return true;
2532 if (!isPowerOf2_64(VecSize))
2533 return false;
2534 size_t HalfVecSize = VecSize / 2;
2535
2536 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2537 RHS != Vec.end(); LHS++, RHS++) {
Matt Devereau48df06f2023-01-16 14:21:18 +00002538 if (*LHS != nullptr && *RHS != nullptr) {
2539 if (*LHS == *RHS)
2540 continue;
2541 else
2542 return false;
2543 }
2544 if (!AllowPoison)
2545 return false;
2546 if (*LHS == nullptr && *RHS != nullptr)
2547 *LHS = *RHS;
Matt Devereaue18b9712022-12-16 11:19:28 +00002548 }
2549
2550 Vec.resize(HalfVecSize);
Matt Devereau48df06f2023-01-16 14:21:18 +00002551 SimplifyValuePattern(Vec, AllowPoison);
Matt Devereaue18b9712022-12-16 11:19:28 +00002552 return true;
2553}
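// SimplifyValuePattern repeatedly halves a power-of-two-sized sequence while
// both halves match, e.g. (A, B, A, B) reduces to (A, B). When AllowPoison is
// set, unwritten (poison) lanes act as wildcards, so (A, poison, A, B) also
// reduces to (A, B).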
2554
2555// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2556// to dupqlane(f64(C)) where C is A concatenated with B
2557static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2558 IntrinsicInst &II) {
2559 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2560 if (!match(II.getOperand(0),
2561 m_Intrinsic<Intrinsic::vector_insert>(
2562 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2563 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2564 return std::nullopt;
2565 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2566
2567 // Insert the scalars into a container ordered by InsertElement index
2568 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2569 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2570 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2571 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2572 CurrentInsertElt = InsertElt->getOperand(0);
2573 }
2574
Matt Devereau48df06f2023-01-16 14:21:18 +00002575 bool AllowPoison =
2576 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2577 if (!SimplifyValuePattern(Elts, AllowPoison))
Matt Devereaue18b9712022-12-16 11:19:28 +00002578 return std::nullopt;
2579
2580  // Rebuild the simplified chain of InsertElements, e.g. (a, b, a, b) as (a, b).
Matt Devereaue18b9712022-12-16 11:19:28 +00002581 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2582 for (size_t I = 0; I < Elts.size(); I++) {
Matt Devereau48df06f2023-01-16 14:21:18 +00002583 if (Elts[I] == nullptr)
2584 continue;
Nikita Popov724f4a52023-05-16 18:11:17 +02002585 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2586 IC.Builder.getInt64(I));
Matt Devereaue18b9712022-12-16 11:19:28 +00002587 }
Matt Devereau48df06f2023-01-16 14:21:18 +00002588 if (InsertEltChain == nullptr)
2589 return std::nullopt;
Matt Devereaue18b9712022-12-16 11:19:28 +00002590
2591 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2592 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2593 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2594 // be narrowed back to the original type.
2595 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2596 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2597 IIScalableTy->getMinNumElements() /
2598 PatternWidth;
2599
Nikita Popov724f4a52023-05-16 18:11:17 +02002600 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
Matt Devereaue18b9712022-12-16 11:19:28 +00002601 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2602 auto *WideShuffleMaskTy =
Nikita Popov724f4a52023-05-16 18:11:17 +02002603 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
Matt Devereaue18b9712022-12-16 11:19:28 +00002604
Nikita Popov724f4a52023-05-16 18:11:17 +02002605 auto InsertSubvector = IC.Builder.CreateInsertVector(
Craig Topper123758b2025-05-02 16:10:18 -07002606 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2607 uint64_t(0));
Matt Devereaue18b9712022-12-16 11:19:28 +00002608 auto WideBitcast =
Nikita Popov724f4a52023-05-16 18:11:17 +02002609 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
Matt Devereaue18b9712022-12-16 11:19:28 +00002610 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
Nikita Popov724f4a52023-05-16 18:11:17 +02002611 auto WideShuffle = IC.Builder.CreateShuffleVector(
Matt Devereaue18b9712022-12-16 11:19:28 +00002612 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2613 auto NarrowBitcast =
Nikita Popov724f4a52023-05-16 18:11:17 +02002614 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
Matt Devereaue18b9712022-12-16 11:19:28 +00002615
2616 return IC.replaceInstUsesWith(II, NarrowBitcast);
2617}
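// End to end, a dupq_lane whose lanes follow the pattern (f32 A, f32 B, f32 A,
// f32 B) is rebuilt as the two-element chain (A, B), bitcast to a wide integer
// (i64 here), splatted across the vector with a zero shuffle mask, and bitcast
// back to the original scalable type.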
2618
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002619static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2620 IntrinsicInst &II) {
Florian Hahn17a73992022-05-10 19:57:43 +01002621 Value *A = II.getArgOperand(0);
2622 Value *B = II.getArgOperand(1);
2623 if (A == B)
2624 return IC.replaceInstUsesWith(II, A);
2625
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002626 return std::nullopt;
Florian Hahn17a73992022-05-10 19:57:43 +01002627}
2628
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002629static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2630 IntrinsicInst &II) {
Bradley Smith5f4541f2022-05-06 14:45:56 +00002631 Value *Pred = II.getOperand(0);
2632 Value *Vec = II.getOperand(1);
2633 Value *Shift = II.getOperand(2);
2634
2635 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2636 Value *AbsPred, *MergedValue;
2637 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2638 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2639 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2640 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002642 return std::nullopt;
Bradley Smith5f4541f2022-05-06 14:45:56 +00002643
2644  // The transform is valid if any of the following are true:
2645 // * The ABS merge value is an undef or non-negative
2646 // * The ABS predicate is all active
2647 // * The ABS predicate and the SRSHL predicates are the same
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002648 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
Bradley Smith5f4541f2022-05-06 14:45:56 +00002649 AbsPred != Pred && !isAllActivePredicate(AbsPred))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002650 return std::nullopt;
Bradley Smith5f4541f2022-05-06 14:45:56 +00002651
2652 // Only valid when the shift amount is non-negative, otherwise the rounding
2653 // behaviour of SRSHL cannot be ignored.
2654 if (!match(Shift, m_NonNegative()))
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002655 return std::nullopt;
Bradley Smith5f4541f2022-05-06 14:45:56 +00002656
Nikita Popov724f4a52023-05-16 18:11:17 +02002657 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2658 {II.getType()}, {Pred, Vec, Shift});
Bradley Smith5f4541f2022-05-06 14:45:56 +00002659
2660 return IC.replaceInstUsesWith(II, LSL);
2661}
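// For example, once the conditions above hold (non-negative input via abs and
// a non-negative shift amount), the rounding behaviour of SRSHL is irrelevant:
//   sve.srshl(pg, sve.abs(pg, X), splat(2))  =>  sve.lsl(pg, sve.abs(pg, X), splat(2))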
2662
Paul Walker622ae7f2024-09-24 15:11:36 +01002663static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2664 IntrinsicInst &II) {
2665 Value *Vec = II.getOperand(0);
2666
2667 if (getSplatValue(Vec) == II.getOperand(1))
2668 return IC.replaceInstUsesWith(II, Vec);
2669
2670 return std::nullopt;
2671}
2672
Danila Malyutin1a609052024-10-17 21:04:04 +04002673static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2674 IntrinsicInst &II) {
2675  // If this barrier is post-dominated by an identical one, we can remove it.
2676 auto *NI = II.getNextNonDebugInstruction();
2677 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2678 auto CanSkipOver = [](Instruction *I) {
2679 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2680 };
2681 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2682 auto *NIBB = NI->getParent();
2683 NI = NI->getNextNonDebugInstruction();
2684 if (!NI) {
2685 if (auto *SuccBB = NIBB->getUniqueSuccessor())
Jeremy Morse81d18ad82025-01-27 16:27:54 +00002686 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
Danila Malyutin1a609052024-10-17 21:04:04 +04002687 else
2688 break;
2689 }
2690 }
2691 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2692 if (NextII && II.isIdenticalTo(NextII))
2693 return IC.eraseInstFromFunction(II);
2694
2695 return std::nullopt;
2696}
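// For example, two identical barriers separated only by instructions that
// neither touch memory nor have other side effects collapse to one:
//   call void @llvm.aarch64.dmb(i32 11)        ; erased
//   %p = getelementptr i8, ptr %base, i64 4    ; skippable
//   call void @llvm.aarch64.dmb(i32 11)        ; kept
// (the i32 operand shown is illustrative; any identical barrier kind works)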
2697
Matthew Devereau91a20562025-04-13 20:40:51 +01002698static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2699 IntrinsicInst &II) {
2700 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2701 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2702 return std::nullopt;
2703}
2704
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002705std::optional<Instruction *>
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002706AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2707 IntrinsicInst &II) const {
Paul Walkerc1927372025-04-01 13:27:46 +01002708 const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
2709 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2710 return I;
2711
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002712 Intrinsic::ID IID = II.getIntrinsicID();
2713 switch (IID) {
2714 default:
2715 break;
Danila Malyutin1a609052024-10-17 21:04:04 +04002716 case Intrinsic::aarch64_dmb:
2717 return instCombineDMB(IC, II);
Florian Hahn17a73992022-05-10 19:57:43 +01002718 case Intrinsic::aarch64_neon_fmaxnm:
2719 case Intrinsic::aarch64_neon_fminnm:
2720 return instCombineMaxMinNM(IC, II);
Bradley Smithc8f20ed2021-04-26 16:19:25 +01002721 case Intrinsic::aarch64_sve_convert_from_svbool:
2722 return instCombineConvertFromSVBool(IC, II);
Bradley Smith89085bc2021-04-23 13:55:42 +01002723 case Intrinsic::aarch64_sve_dup:
2724 return instCombineSVEDup(IC, II);
Usman Nadeemab111e92021-09-10 17:57:29 -07002725 case Intrinsic::aarch64_sve_dup_x:
2726 return instCombineSVEDupX(IC, II);
Bradley Smith60c9b5f2021-05-20 11:13:34 +01002727 case Intrinsic::aarch64_sve_cmpne:
2728 case Intrinsic::aarch64_sve_cmpne_wide:
2729 return instCombineSVECmpNE(IC, II);
Peter Waller2d574a12021-05-12 14:47:22 +00002730 case Intrinsic::aarch64_sve_rdffr:
2731 return instCombineRDFFR(IC, II);
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002732 case Intrinsic::aarch64_sve_lasta:
2733 case Intrinsic::aarch64_sve_lastb:
2734 return instCombineSVELast(IC, II);
Cullen Rhodes7c3cda52022-07-08 15:18:27 +00002735 case Intrinsic::aarch64_sve_clasta_n:
2736 case Intrinsic::aarch64_sve_clastb_n:
2737 return instCombineSVECondLast(IC, II);
Jun Maae5433942021-06-18 11:55:01 +08002738 case Intrinsic::aarch64_sve_cntd:
2739 return instCombineSVECntElts(IC, II, 2);
2740 case Intrinsic::aarch64_sve_cntw:
2741 return instCombineSVECntElts(IC, II, 4);
2742 case Intrinsic::aarch64_sve_cnth:
2743 return instCombineSVECntElts(IC, II, 8);
2744 case Intrinsic::aarch64_sve_cntb:
2745 return instCombineSVECntElts(IC, II, 16);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002746 case Intrinsic::aarch64_sve_ptest_any:
2747 case Intrinsic::aarch64_sve_ptest_first:
2748 case Intrinsic::aarch64_sve_ptest_last:
2749 return instCombineSVEPTest(IC, II);
Matthew Devereauf085a9d2021-09-01 16:41:42 +01002750 case Intrinsic::aarch64_sve_fadd:
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002751 return instCombineSVEVectorFAdd(IC, II);
Jolanta Jensendc63b352023-05-17 09:21:40 +00002752 case Intrinsic::aarch64_sve_fadd_u:
Paul Walkerc7c71aa2023-06-17 16:48:09 +01002753 return instCombineSVEVectorFAddU(IC, II);
Jolanta Jensenecb07f42023-05-17 09:21:40 +00002754 case Intrinsic::aarch64_sve_fmul_u:
Paul Walkera7999f32025-04-17 15:58:39 +01002755 return instCombineSVEVectorBinOp(IC, II);
Jolanta Jensenecb07f42023-05-17 09:21:40 +00002756 case Intrinsic::aarch64_sve_fsub:
2757 return instCombineSVEVectorFSub(IC, II);
2758 case Intrinsic::aarch64_sve_fsub_u:
2759 return instCombineSVEVectorFSubU(IC, II);
Matt Devereaua107cf02022-12-15 16:09:13 +00002760 case Intrinsic::aarch64_sve_add:
2761 return instCombineSVEVectorAdd(IC, II);
Jolanta Jensen105d63a2023-05-12 13:00:55 +00002762 case Intrinsic::aarch64_sve_add_u:
2763 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2764 Intrinsic::aarch64_sve_mla_u>(
2765 IC, II, true);
Matt Devereaua107cf02022-12-15 16:09:13 +00002766 case Intrinsic::aarch64_sve_sub:
2767 return instCombineSVEVectorSub(IC, II);
Jolanta Jensen105d63a2023-05-12 13:00:55 +00002768 case Intrinsic::aarch64_sve_sub_u:
2769 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2770 Intrinsic::aarch64_sve_mls_u>(
2771 IC, II, true);
Bradley Smith191f9fa2021-07-13 14:42:36 +00002772 case Intrinsic::aarch64_sve_tbl:
2773 return instCombineSVETBL(IC, II);
Usman Nadeem5420fc42021-08-05 17:23:01 -07002774 case Intrinsic::aarch64_sve_uunpkhi:
2775 case Intrinsic::aarch64_sve_uunpklo:
2776 case Intrinsic::aarch64_sve_sunpkhi:
2777 case Intrinsic::aarch64_sve_sunpklo:
2778 return instCombineSVEUnpack(IC, II);
Usman Nadeem267d6b52024-02-15 10:40:09 -08002779 case Intrinsic::aarch64_sve_uzp1:
2780 return instCombineSVEUzp1(IC, II);
Usman Nadeem757384a2021-09-12 15:53:26 -07002781 case Intrinsic::aarch64_sve_zip1:
2782 case Intrinsic::aarch64_sve_zip2:
2783 return instCombineSVEZip(IC, II);
Peter Waller7a341452021-11-03 13:40:22 +00002784 case Intrinsic::aarch64_sve_ld1_gather_index:
2785 return instCombineLD1GatherIndex(IC, II);
2786 case Intrinsic::aarch64_sve_st1_scatter_index:
2787 return instCombineST1ScatterIndex(IC, II);
Matt Devereauf526c602021-11-04 16:10:55 +00002788 case Intrinsic::aarch64_sve_ld1:
2789 return instCombineSVELD1(IC, II, DL);
2790 case Intrinsic::aarch64_sve_st1:
2791 return instCombineSVEST1(IC, II, DL);
Matt Devereaufb477252021-12-09 15:32:35 +00002792 case Intrinsic::aarch64_sve_sdiv:
2793 return instCombineSVESDIV(IC, II);
Matt Devereaua9e08bc2022-03-16 11:41:14 +00002794 case Intrinsic::aarch64_sve_sel:
2795 return instCombineSVESel(IC, II);
Bradley Smith5f4541f2022-05-06 14:45:56 +00002796 case Intrinsic::aarch64_sve_srshl:
2797 return instCombineSVESrshl(IC, II);
Matt Devereaue18b9712022-12-16 11:19:28 +00002798 case Intrinsic::aarch64_sve_dupq_lane:
2799 return instCombineSVEDupqLane(IC, II);
Paul Walker622ae7f2024-09-24 15:11:36 +01002800 case Intrinsic::aarch64_sve_insr:
2801 return instCombineSVEInsr(IC, II);
Matthew Devereau91a20562025-04-13 20:40:51 +01002802 case Intrinsic::aarch64_sve_ptrue:
2803 return instCombinePTrue(IC, II);
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002804 }
2805
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002806 return std::nullopt;
Joe Ellisc91cd4f2021-04-16 10:05:05 +00002807}
2808
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002809std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
David Green61888d92022-01-13 11:53:12 +00002810 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2811 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2812 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2813 SimplifyAndSetOp) const {
2814 switch (II.getIntrinsicID()) {
2815 default:
2816 break;
2817 case Intrinsic::aarch64_neon_fcvtxn:
2818 case Intrinsic::aarch64_neon_rshrn:
2819 case Intrinsic::aarch64_neon_sqrshrn:
2820 case Intrinsic::aarch64_neon_sqrshrun:
2821 case Intrinsic::aarch64_neon_sqshrn:
2822 case Intrinsic::aarch64_neon_sqshrun:
2823 case Intrinsic::aarch64_neon_sqxtn:
2824 case Intrinsic::aarch64_neon_sqxtun:
2825 case Intrinsic::aarch64_neon_uqrshrn:
2826 case Intrinsic::aarch64_neon_uqshrn:
2827 case Intrinsic::aarch64_neon_uqxtn:
2828 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2829 break;
2830 }
2831
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08002832 return std::nullopt;
David Green61888d92022-01-13 11:53:12 +00002833}
2834
Paul Walker7775a482024-08-05 11:25:44 +01002835bool AArch64TTIImpl::enableScalableVectorization() const {
2836 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2837 EnableScalableAutovecInStreamingMode);
2838}
2839
Sander de Smalen137459a2022-10-19 14:14:00 +00002840TypeSize
2841AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2842 switch (K) {
2843 case TargetTransformInfo::RGK_Scalar:
Sander de Smalen81b7f112023-11-22 08:52:53 +00002844 return TypeSize::getFixed(64);
Sander de Smalen137459a2022-10-19 14:14:00 +00002845 case TargetTransformInfo::RGK_FixedWidthVector:
Sander de Smalen738533c2024-06-24 11:06:16 +01002846 if (ST->useSVEForFixedLengthVectors() &&
2847 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
Sander de Smalen81b7f112023-11-22 08:52:53 +00002848 return TypeSize::getFixed(
2849 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
Sander de Smalen738533c2024-06-24 11:06:16 +01002850 else if (ST->isNeonAvailable())
2851 return TypeSize::getFixed(128);
2852 else
2853 return TypeSize::getFixed(0);
Sander de Smalen137459a2022-10-19 14:14:00 +00002854 case TargetTransformInfo::RGK_ScalableVector:
Sander de Smalen738533c2024-06-24 11:06:16 +01002855 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2856 EnableScalableAutovecInStreamingMode))
2857 return TypeSize::getScalable(128);
2858 else
Sander de Smalen81b7f112023-11-22 08:52:53 +00002859 return TypeSize::getScalable(0);
Sander de Smalen137459a2022-10-19 14:14:00 +00002860 }
2861 llvm_unreachable("Unsupported register kind");
2862}
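// For example, with SVE available and a known minimum SVE vector size of 256
// bits (e.g. set via -msve-vector-bits), RGK_FixedWidthVector reports 256,
// while a NEON-only subtarget reports the fixed 128-bit width and
// RGK_ScalableVector reports a scalable 128 bits.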
2863
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002864bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
David Green1712ae62023-07-12 13:13:06 +01002865 ArrayRef<const Value *> Args,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03002866 Type *SrcOverrideTy) const {
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002867 // A helper that returns a vector type from the given type. The number of
David Kreitzer6918a152022-04-29 12:26:13 -07002868  // elements in DstTy determines the vector width.
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002869 auto toVectorTy = [&](Type *ArgTy) {
Caroline Concatto6c4d8f42020-11-11 14:41:01 +00002870 return VectorType::get(ArgTy->getScalarType(),
2871 cast<VectorType>(DstTy)->getElementCount());
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002872 };
2873
David Green1712ae62023-07-12 13:13:06 +01002874 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2875 // i32, i64]. SVE doesn't generally have the same set of instructions to
David Greenf2a92db2022-11-30 13:09:48 +00002876 // perform an extend with the add/sub/mul. There are SMULLB style
2877 // instructions, but they operate on top/bottom, requiring some sort of lane
2878 // interleaving to be used with zext/sext.
David Green1712ae62023-07-12 13:13:06 +01002879 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2880 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2881 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002882 return false;
2883
2884 // Determine if the operation has a widening variant. We consider both the
2885 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2886 // instructions.
2887 //
David Green2abaa022022-04-04 12:45:04 +01002888 // TODO: Add additional widening operations (e.g., shl, etc.) once we
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002889 // verify that their extending operands are eliminated during code
2890 // generation.
David Green1712ae62023-07-12 13:13:06 +01002891 Type *SrcTy = SrcOverrideTy;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002892 switch (Opcode) {
2893 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2894 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
David Green1712ae62023-07-12 13:13:06 +01002895 // The second operand needs to be an extend
2896 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2897 if (!SrcTy)
2898 SrcTy =
2899 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2900 } else
2901 return false;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002902 break;
David Green1712ae62023-07-12 13:13:06 +01002903 case Instruction::Mul: { // SMULL(2), UMULL(2)
2904 // Both operands need to be extends of the same type.
2905 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2906 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2907 if (!SrcTy)
2908 SrcTy =
2909 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2910 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2911 // If one of the operands is a Zext and the other has enough zero bits to
2912    // be treated as unsigned, we can still generate a umull, meaning the zext
2913 // is free.
2914 KnownBits Known =
2915 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2916 if (Args[0]->getType()->getScalarSizeInBits() -
2917 Known.Zero.countLeadingOnes() >
2918 DstTy->getScalarSizeInBits() / 2)
2919 return false;
2920 if (!SrcTy)
2921 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2922 DstTy->getScalarSizeInBits() / 2));
2923 } else
2924 return false;
2925 break;
2926 }
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002927 default:
2928 return false;
2929 }
2930
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002931 // Legalize the destination type and ensure it can be used in a widening
2932 // operation.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03002933 auto DstTyL = getTypeLegalizationCost(DstTy);
David Green1712ae62023-07-12 13:13:06 +01002934 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002935 return false;
2936
2937 // Legalize the source type and ensure it can be used in a widening
2938 // operation.
David Green1712ae62023-07-12 13:13:06 +01002939 assert(SrcTy && "Expected some SrcTy");
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03002940 auto SrcTyL = getTypeLegalizationCost(SrcTy);
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002941 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2942 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2943 return false;
2944
2945 // Get the total number of vector elements in the legalized types.
Daniil Fukalov3489c2d2021-04-29 16:02:51 +03002946 InstructionCost NumDstEls =
2947 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2948 InstructionCost NumSrcEls =
2949 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002950
2951 // Return true if the legalized types have the same number of vector elements
2952 // and the destination element type size is twice that of the source type.
David Green1712ae62023-07-12 13:13:06 +01002953 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00002954}
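// For example, on NEON the extend feeding a widening add or multiply is free
// because the widening instruction performs it, roughly (illustrative IR):
//   add <4 x i32> %a, (zext <4 x i16> %b to <4 x i32>)        ; uaddw
//   mul (sext <8 x i8> %a to <8 x i16>), (sext <8 x i8> %b)   ; smull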
2955
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00002956// s/urhadd instructions implement the following pattern, making the
2957// extends free:
2958// %x = add ((zext i8 -> i16), 1)
2959// %y = (zext i8 -> i16)
2960// trunc i16 (lshr (add %x, %y), 1) -> i8
2961//
zhongyundef41223ee2023-09-01 23:40:21 +08002962bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03002963 Type *Src) const {
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00002964 // The source should be a legal vector type.
2965 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2966 (Src->isScalableTy() && !ST->hasSVE2()))
2967 return false;
2968
2969 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2970 return false;
2971
2972 // Look for trunc/shl/add before trying to match the pattern.
2973 const Instruction *Add = ExtUser;
2974 auto *AddUser =
2975 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2976 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2977 Add = AddUser;
2978
2979 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2980 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2981 return false;
2982
2983 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2984 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2985 Src->getScalarSizeInBits() !=
2986 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2987 return false;
2988
2989 // Try to match the whole pattern. Ext could be either the first or second
2990 // m_ZExtOrSExt matched.
2991 Instruction *Ex1, *Ex2;
2992 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2993 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2994 return false;
2995
2996 // Ensure both extends are of the same type
2997 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2998 Ex1->getOpcode() == Ex2->getOpcode())
2999 return true;
3000
3001 return false;
3002}
3003
Sander de Smalen92d84212021-01-21 13:40:22 +00003004InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3005 Type *Src,
3006 TTI::CastContextHint CCH,
3007 TTI::TargetCostKind CostKind,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003008 const Instruction *I) const {
Tim Northover3b0846e2014-05-24 12:50:23 +00003009 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3010 assert(ISD && "Invalid opcode");
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003011 // If the cast is observable, and it is used by a widening instruction (e.g.,
3012 // uaddl, saddw, etc.), it may be free.
David Green2abaa022022-04-04 12:45:04 +01003013 if (I && I->hasOneUser()) {
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003014 auto *SingleUser = cast<Instruction>(*I->user_begin());
3015 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
David Green1712ae62023-07-12 13:13:06 +01003016 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
3017 // For adds only count the second operand as free if both operands are
3018 // extends but not the same operation. (i.e both operands are not free in
3019 // add(sext, zext)).
3020 if (SingleUser->getOpcode() == Instruction::Add) {
3021 if (I == SingleUser->getOperand(1) ||
3022 (isa<CastInst>(SingleUser->getOperand(1)) &&
3023 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003024 return 0;
David Green1712ae62023-07-12 13:13:06 +01003025 } else // Others are free so long as isWideningInstruction returned true.
3026 return 0;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003027 }
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00003028
3029 // The cast will be free for the s/urhadd instructions
3030 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
zhongyundef41223ee2023-09-01 23:40:21 +08003031 isExtPartOfAvgExpr(SingleUser, Dst, Src))
Kerry McLaughlin9a98ab52023-08-29 08:15:29 +00003032 return 0;
Matthew Simpson78fd46b2017-05-09 20:18:12 +00003033 }
3034
Sam Parker8aaabad2020-05-26 11:27:57 +01003035 // TODO: Allow non-throughput costs that aren't binary.
Sander de Smalen92d84212021-01-21 13:40:22 +00003036 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
Sam Parker8aaabad2020-05-26 11:27:57 +01003037 if (CostKind != TTI::TCK_RecipThroughput)
3038 return Cost == 0 ? 0 : 1;
3039 return Cost;
3040 };
3041
Mehdi Amini44ede332015-07-09 02:09:04 +00003042 EVT SrcTy = TLI->getValueType(DL, Src);
3043 EVT DstTy = TLI->getValueType(DL, Dst);
Tim Northover3b0846e2014-05-24 12:50:23 +00003044
3045 if (!SrcTy.isSimple() || !DstTy.isSimple())
David Green60280e92020-07-29 13:32:53 +01003046 return AdjustCost(
3047 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
Tim Northover3b0846e2014-05-24 12:50:23 +00003048
David Green2db7b312025-01-07 09:39:08 +00003049 static const TypeConversionCostTblEntry BF16Tbl[] = {
3050 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3051 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3052 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3053 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3054 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3055 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3056 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3057 };
3058
3059 if (ST->hasBF16())
3060 if (const auto *Entry = ConvertCostTableLookup(
3061 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3062 return AdjustCost(Entry->Cost);
3063
Graham Hunterf737df72025-03-25 10:43:44 +00003064 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3065 // The cost of unpacking twice is artificially increased for now in order
3066 // to avoid regressions against NEON, which will use tbl instructions directly
3067 // instead of multiple layers of [s|u]unpk[lo|hi].
3068 // We use the unpacks in cases where the destination type is illegal and
3069 // requires splitting of the input, even if the input type itself is legal.
3070 const unsigned int SVE_EXT_COST = 1;
3071 const unsigned int SVE_FCVT_COST = 1;
3072 const unsigned int SVE_UNPACK_ONCE = 4;
3073 const unsigned int SVE_UNPACK_TWICE = 16;
3074
David Greenca884002024-12-09 23:41:18 +00003075 static const TypeConversionCostTblEntry ConversionTbl[] = {
3076 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3077 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3078 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3079 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3080 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3081 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3082 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3083 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3084 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3085 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3086 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3087 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3088 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3089 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3090 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3091 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3092 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3093 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3094 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3095 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
Silviu Barangab322aa62015-08-17 16:05:09 +00003096
David Greenca884002024-12-09 23:41:18 +00003097 // Truncations on nxvmiN
David Sherwoodeaf482f2024-12-19 10:07:41 +00003098 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3099 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3100 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3101 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3102 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3103 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3104 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3105 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3106 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3107 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3108 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3109 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3110 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3111 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3112 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3113 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3114 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3115 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3116 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3117 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3118 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3119 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3120 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3121 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3122 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3123 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3124 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3125 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3126 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3127 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3128 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3129 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3130 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003131
David Greenca884002024-12-09 23:41:18 +00003132 // The number of shll instructions for the extension.
3133 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3134 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3135 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3136 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3137 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3138 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3139 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3140 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3141 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3142 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3143 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3144 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3145 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3146 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3147 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3148 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
Silviu Barangab322aa62015-08-17 16:05:09 +00003149
David Green2f18b5e2024-12-11 06:26:41 +00003150 // FP Ext and trunc
3151 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3152 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3153 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3154 // FP16
3155 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3156 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3157 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3158 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3159 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3160 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3161 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
David Green2db7b312025-01-07 09:39:08 +00003162 // BF16 (uses shift)
3163 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3164 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3165 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3166 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3167 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3168 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3169 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
David Green2f18b5e2024-12-11 06:26:41 +00003170 // FP Ext and trunc
3171 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3172 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3173 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3174 // FP16
3175 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3176 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3177 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3178 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3179 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3180 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3181 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
David Green2db7b312025-01-07 09:39:08 +00003182 // BF16 (more complex, with +bf16 is handled above)
3183 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3184 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3185 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3186 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3187 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3188 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3189 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3190 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
David Green2f18b5e2024-12-11 06:26:41 +00003191
David Greenca884002024-12-09 23:41:18 +00003192 // LowerVectorINT_TO_FP:
3193 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3194 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3195 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3196 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3197 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3198 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
Tim Northoveref0d7602014-06-15 09:27:06 +00003199
Graham Hunterf737df72025-03-25 10:43:44 +00003200 // SVE: to nxv2f16
3201 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3202 SVE_EXT_COST + SVE_FCVT_COST},
3203 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3204 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3205 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3206 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3207 SVE_EXT_COST + SVE_FCVT_COST},
3208 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3209 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3210 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3211
3212 // SVE: to nxv4f16
3213 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3214 SVE_EXT_COST + SVE_FCVT_COST},
3215 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3216 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3217 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3218 SVE_EXT_COST + SVE_FCVT_COST},
3219 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3220 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3221
3222 // SVE: to nxv8f16
3223 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3224 SVE_EXT_COST + SVE_FCVT_COST},
3225 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3226 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3227 SVE_EXT_COST + SVE_FCVT_COST},
3228 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3229
3230 // SVE: to nxv16f16
3231 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3232 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3233 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3234 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3235
David Greenca884002024-12-09 23:41:18 +00003236 // Complex: to v2f32
3237 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3238 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
David Greenca884002024-12-09 23:41:18 +00003239 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3240 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
Tim Northoveref0d7602014-06-15 09:27:06 +00003241
Graham Hunterf737df72025-03-25 10:43:44 +00003242 // SVE: to nxv2f32
3243 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3244 SVE_EXT_COST + SVE_FCVT_COST},
3245 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3246 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3247 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3248 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3249 SVE_EXT_COST + SVE_FCVT_COST},
3250 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3251 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3252 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3253
David Greenca884002024-12-09 23:41:18 +00003254 // Complex: to v4f32
3255 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3256 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3257 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3258 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
Tim Northoveref0d7602014-06-15 09:27:06 +00003259
Graham Hunterf737df72025-03-25 10:43:44 +00003260 // SVE: to nxv4f32
3261 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3262 SVE_EXT_COST + SVE_FCVT_COST},
3263 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3264 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3265 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3266 SVE_EXT_COST + SVE_FCVT_COST},
3267 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3268      {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3269
David Greenca884002024-12-09 23:41:18 +00003270 // Complex: to v8f32
3271 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3272 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3273 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3274 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
Silviu Barangab322aa62015-08-17 16:05:09 +00003275
Graham Hunterf737df72025-03-25 10:43:44 +00003276 // SVE: to nxv8f32
3277 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3278 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3279 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3280 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3281 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3282 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3283 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3284 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3285
3286 // SVE: to nxv16f32
3287 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3288 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3289 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3290 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3291
David Greenca884002024-12-09 23:41:18 +00003292 // Complex: to v16f32
3293 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3294 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
Silviu Barangab322aa62015-08-17 16:05:09 +00003295
David Greenca884002024-12-09 23:41:18 +00003296 // Complex: to v2f64
3297 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3298 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3299 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3300 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3301 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3302 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
Tim Northoveref0d7602014-06-15 09:27:06 +00003303
Graham Hunterf737df72025-03-25 10:43:44 +00003304 // SVE: to nxv2f64
3305 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3306 SVE_EXT_COST + SVE_FCVT_COST},
3307 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3308 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3309 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3310 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3311 SVE_EXT_COST + SVE_FCVT_COST},
3312 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3313 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3314 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3315
David Greenca884002024-12-09 23:41:18 +00003316 // Complex: to v4f64
3317 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3318 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
Tim Northoveref0d7602014-06-15 09:27:06 +00003319
Graham Hunterf737df72025-03-25 10:43:44 +00003320 // SVE: to nxv4f64
3321 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3322 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3323 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3324 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3325 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3326 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3327 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3328 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3329 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3330 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3331 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3332 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3333
3334 // SVE: to nxv8f64
3335 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3336 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3337 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3338 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3339 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3340 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3341 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3342 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3343
David Greenca884002024-12-09 23:41:18 +00003344 // LowerVectorFP_TO_INT
3345 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3346 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3347 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3348 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3349 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3350 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
Tim Northoveref0d7602014-06-15 09:27:06 +00003351
David Greenca884002024-12-09 23:41:18 +00003352 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3353 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3354 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3355 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3356 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3357 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3358 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
Tim Northoverdbecc3b2014-06-15 09:27:15 +00003359
David Greenca884002024-12-09 23:41:18 +00003360 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3361 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3362 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3363 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3364 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
Tim Northoverdbecc3b2014-06-15 09:27:15 +00003365
David Greenca884002024-12-09 23:41:18 +00003366 // Complex, from nxv2f32.
3367 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3368 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3369 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3370 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3371 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3372 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3373 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3374 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003375
David Greenca884002024-12-09 23:41:18 +00003376 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3377 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3378 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3379 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3380 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3381 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3382 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003383
David Greenca884002024-12-09 23:41:18 +00003384 // Complex, from nxv2f64.
3385 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3386 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3387 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3388 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003389 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
David Greenca884002024-12-09 23:41:18 +00003390 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3391 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3392 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3393 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003394 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003395
David Greenca884002024-12-09 23:41:18 +00003396 // Complex, from nxv4f32.
3397 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3398 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3399 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3400 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003401 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
David Greenca884002024-12-09 23:41:18 +00003402 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3403 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3404 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3405 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003406 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003407
David Greenca884002024-12-09 23:41:18 +00003408 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3409 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3410 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3411 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3412 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003413
David Greenca884002024-12-09 23:41:18 +00003414 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3415 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3416 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3417 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3418 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3419 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3420 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003421
David Greenca884002024-12-09 23:41:18 +00003422 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3423 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3424 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3425 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3426 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
David Sherwood57ca65e2021-04-06 11:06:58 +01003427
David Greenca884002024-12-09 23:41:18 +00003428 // Complex, from nxv8f16.
3429 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3430 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3431 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3432 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003433 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
David Greenca884002024-12-09 23:41:18 +00003434 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3435 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3436 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3437 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
Paul Walkerda4cbec2025-03-04 11:34:44 +00003438 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
David Sherwood57ca65e2021-04-06 11:06:58 +01003439
David Greenca884002024-12-09 23:41:18 +00003440 // Complex, from nxv4f16.
3441 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3442 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3443 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3444 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3445 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3446 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3447 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3448 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
David Sherwood57ca65e2021-04-06 11:06:58 +01003449
David Greenca884002024-12-09 23:41:18 +00003450 // Complex, from nxv2f16.
3451 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3452 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3453 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3454 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3455 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3456 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3457 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3458 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003459
David Greenca884002024-12-09 23:41:18 +00003460 // Truncate from nxvmf32 to nxvmf16.
3461 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3462 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3463 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003464
David Greenca884002024-12-09 23:41:18 +00003465 // Truncate from nxvmf64 to nxvmf16.
3466 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3467 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3468 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003469
David Greenca884002024-12-09 23:41:18 +00003470 // Truncate from nxvmf64 to nxvmf32.
3471 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3472 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3473 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003474
David Greenca884002024-12-09 23:41:18 +00003475 // Extend from nxvmf16 to nxvmf32.
3476 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3477 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3478 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003479
David Greenca884002024-12-09 23:41:18 +00003480 // Extend from nxvmf16 to nxvmf64.
3481 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3482 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3483 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003484
David Greenca884002024-12-09 23:41:18 +00003485 // Extend from nxvmf32 to nxvmf64.
3486 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3487 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3488 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
Nashe Mncube19601a4c2021-03-17 12:00:31 +00003489
David Greenca884002024-12-09 23:41:18 +00003490 // Bitcasts from float to integer
3491 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3492 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3493 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
Alban Bridonneau2feddb32022-01-26 13:33:38 +00003494
David Greenca884002024-12-09 23:41:18 +00003495 // Bitcasts from integer to float
3496 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3497 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3498 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
Hassnaa Hamdi045eec62023-04-19 09:23:13 +00003499
David Greenca884002024-12-09 23:41:18 +00003500 // Add cost for extending to illegal (too wide) scalable vectors.
3501 // Zero/sign extends are implemented by multiple unpack operations,
3502 // where each operation has a cost of 1.
3503 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3504 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3505 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3506 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3507 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3508 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
Hassnaa Hamdi045eec62023-04-19 09:23:13 +00003509
David Greenca884002024-12-09 23:41:18 +00003510 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3511 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3512 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3513 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3514 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3515 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
Tim Northover3b0846e2014-05-24 12:50:23 +00003516 };
3517
Dinar Temirbulatov73668cc2023-05-15 16:18:45 +00003518 // We have to estimate the cost of a fixed-length operation performed on
3519 // SVE registers, using the number of SVE registers required to
3520 // represent the fixed-length type.
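  // For example, a fixed-length <8 x i64> -> <8 x double> cast handled via SVE
  // is costed as LT.first copies of the equivalent <vscale x 2 x i64> ->
  // <vscale x 2 x double> cast, since AArch64::SVEBitsPerBlock / 64 == 2.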
3521 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3522 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3523 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3524 ST->useSVEForFixedLengthVectors(WiderTy)) {
3525 std::pair<InstructionCost, MVT> LT =
3526 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
David Greenca884002024-12-09 23:41:18 +00003527 unsigned NumElements =
3528 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
Dinar Temirbulatov73668cc2023-05-15 16:18:45 +00003529 return AdjustCost(
3530 LT.first *
3531 getCastInstrCost(
3532 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3533 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3534 CostKind, I));
3535 }
3536
David Greenca884002024-12-09 23:41:18 +00003537 if (const auto *Entry = ConvertCostTableLookup(
3538 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
Sam Parker8aaabad2020-05-26 11:27:57 +01003539 return AdjustCost(Entry->Cost);
Tim Northover3b0846e2014-05-24 12:50:23 +00003540
David Green47f4cd92022-03-03 11:17:24 +00003541 static const TypeConversionCostTblEntry FP16Tbl[] = {
3542 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3543 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3544 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3545 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3546 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3547 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3548 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3549 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3550 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3551 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3552 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3553 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3554 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3555 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3556 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3557 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3558 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3559 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
Florian Hahnaa590e52022-03-11 10:27:17 +00003560 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3561 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3562 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushll(2) + 2 * ucvtf
3563 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshll(2) + 2 * scvtf
David Green47f4cd92022-03-03 11:17:24 +00003564 };
3565
3566 if (ST->hasFullFP16())
3567 if (const auto *Entry = ConvertCostTableLookup(
3568 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3569 return AdjustCost(Entry->Cost);
3570
David Greene2202b92025-03-26 07:26:17 +00003571 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3572 // double-rounding issues.
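  // For example, a <2 x i64> -> <2 x float> sitofp is costed as two scalar
  // i64 -> float conversions plus the scalarization overhead of extracting
  // both source elements and inserting both results.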
3573 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3574 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3575 isa<FixedVectorType>(Dst) && isa<FixedVectorType>(Src))
3576 return AdjustCost(
3577 cast<FixedVectorType>(Dst)->getNumElements() *
3578 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3579 CCH, CostKind) +
3580 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Src), false, true,
3581 CostKind) +
3582 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Dst), true, false,
3583 CostKind));
3584
David Sherwoodfad69a52023-10-02 10:50:56 +01003585 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
Sander de Smalenc4366492024-06-25 13:27:06 +01003586 CCH == TTI::CastContextHint::Masked &&
3587 ST->isSVEorStreamingSVEAvailable() &&
David Sherwoodfad69a52023-10-02 10:50:56 +01003588 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3589 TargetLowering::TypePromoteInteger &&
3590 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3591 TargetLowering::TypeSplitVector) {
3592 // The standard behaviour in the backend for these cases is to split the
3593 // extend up into two parts:
3594 // 1. Perform an extending load or masked load up to the legal type.
3595 // 2. Extend the loaded data to the final type.
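    // For example, a zero-extending masked load from <vscale x 4 x i8> to
    // <vscale x 4 x i64> is costed as an extending masked load up to the
    // legal <vscale x 4 x i32> type plus a <vscale x 4 x i32> ->
    // <vscale x 4 x i64> extend.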
3596 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3597 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3598 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3599 Opcode, LegalTy, Src, CCH, CostKind, I);
3600 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3601 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3602 return Part1 + Part2;
3603 }
3604
David Sherwoodafc2b7d2023-04-05 12:58:03 +00003605 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3606 // but we also want to include the TTI::CastContextHint::Masked case.
3607 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
Sander de Smalenc4366492024-06-25 13:27:06 +01003608 CCH == TTI::CastContextHint::Masked &&
3609 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
David Sherwoodafc2b7d2023-04-05 12:58:03 +00003610 CCH = TTI::CastContextHint::Normal;
3611
David Green60280e92020-07-29 13:32:53 +01003612 return AdjustCost(
3613 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
Tim Northover3b0846e2014-05-24 12:50:23 +00003614}
3615
David Greend20604e2025-04-22 15:09:43 +01003616InstructionCost
3617AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3618 VectorType *VecTy, unsigned Index,
3619 TTI::TargetCostKind CostKind) const {
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003620
3621 // Make sure we were given a valid extend opcode.
Matthew Simpson47bd3992016-04-27 16:25:04 +00003622 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3623 "Invalid opcode");
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003624
3625 // We are extending an element we extract from a vector, so the source type
3626 // of the extend is the element type of the vector.
3627 auto *Src = VecTy->getElementType();
3628
3629 // Sign- and zero-extends are for integer types only.
3630 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3631
3632 // Get the cost for the extract. We compute the cost (if any) for the extend
3633 // below.
Alexey Bataev9b5f6262022-12-21 13:38:38 -08003634 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
ShihPo Hung5fb3a572023-01-21 05:29:05 -08003635 CostKind, Index, nullptr, nullptr);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003636
3637 // Legalize the types.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03003638 auto VecLT = getTypeLegalizationCost(VecTy);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003639 auto DstVT = TLI->getValueType(DL, Dst);
3640 auto SrcVT = TLI->getValueType(DL, Src);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003641
3642 // If the resulting type is still a vector and the destination type is legal,
3643 // we may get the extension for free. If not, get the default cost for the
3644 // extend.
3645 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
David Green60280e92020-07-29 13:32:53 +01003646 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3647 CostKind);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003648
3649 // The destination type should be larger than the element type. If not, get
3650 // the default cost for the extend.
David Sherwoodd67d8f82020-10-09 12:03:20 +01003651 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
David Green60280e92020-07-29 13:32:53 +01003652 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3653 CostKind);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003654
3655 switch (Opcode) {
3656 default:
3657 llvm_unreachable("Opcode should be either SExt or ZExt");
3658
3659 // For sign-extends, we only need a smov, which performs the extension
3660 // automatically.
3661 case Instruction::SExt:
3662 return Cost;
3663
3664 // For zero-extends, the extend is performed automatically by a umov unless
3665 // the destination type is i64 and the element type is i8 or i16.
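  // For example, zero-extending an extracted i8 or i16 element to i32 is
  // treated as free, while extending the same element to i64 falls through
  // to the default extend cost below.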
3666 case Instruction::ZExt:
3667 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3668 return Cost;
3669 }
3670
3671 // If we are unable to perform the extend for free, get the default cost.
David Green60280e92020-07-29 13:32:53 +01003672 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3673 CostKind);
Matthew Simpsone5dfb082016-04-27 15:20:21 +00003674}
3675
Sander de Smalen14b934f2021-01-26 16:32:30 +00003676InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3677 TTI::TargetCostKind CostKind,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003678 const Instruction *I) const {
Florian Hahn1ccc4992020-06-30 10:39:23 +01003679 if (CostKind != TTI::TCK_RecipThroughput)
3680 return Opcode == Instruction::PHI ? 0 : 1;
Florian Hahnc30da982020-07-01 18:20:01 +01003681 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
Florian Hahn1ccc4992020-06-30 10:39:23 +01003682 // Branches are assumed to be predicted.
Florian Hahnc30da982020-07-01 18:20:01 +01003683 return 0;
Florian Hahn1ccc4992020-06-30 10:39:23 +01003684}
3685
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303686InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
David Greenc6406c82025-03-27 17:25:02 +00003687 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3688 bool HasRealUse, const Instruction *I, Value *Scalar,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003689 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
Tim Northover3b0846e2014-05-24 12:50:23 +00003690 assert(Val->isVectorTy() && "This must be a vector type");
3691
3692 if (Index != -1U) {
3693 // Legalize the type.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03003694 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
Tim Northover3b0846e2014-05-24 12:50:23 +00003695
3696 // This type is legalized to a scalar type.
3697 if (!LT.second.isVector())
3698 return 0;
3699
David Sherwoodef1ca4d2022-01-12 09:51:34 +00003700 // The type may be split. For fixed-width vectors we can normalize the
3701 // index to the new type.
3702 if (LT.second.isFixedLengthVector()) {
3703 unsigned Width = LT.second.getVectorNumElements();
3704 Index = Index % Width;
3705 }
Tim Northover3b0846e2014-05-24 12:50:23 +00003706
3707 // The element at index zero is already inside the vector.
Mingming Liu8aa80062022-06-21 13:38:30 -07003708 // - For a physical (HasRealUse==true) insert-element or extract-element
3709 // instruction that extracts integers, an explicit FPR -> GPR move is
3710 // needed. So it has non-zero cost.
3711 // - For the rest of cases (virtual instruction or element type is float),
3712 // consider the instruction free.
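    // For example, extracting lane 0 of a <4 x i32> into a general-purpose
    // register still requires an FPR -> GPR copy (e.g. fmov), so a real
    // integer extract at index 0 is not free.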
Sjoerd Meijer079c4882023-02-09 16:07:17 +00003713 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3714 return 0;
3715
3716 // This is recognising a LD1 single-element structure to one lane of one
3717 // register instruction. I.e., if this is an `insertelement` instruction,
3718 // and its second operand is a load, then we will generate a LD1, which
3719 // is an expensive instruction.
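    // For example, inserting a loaded i32 into lane 1 of a <4 x i32> is
    // typically selected as "ld1 { v0.s }[1], [x0]".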
3720 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
David Greenc6406c82025-03-27 17:25:02 +00003721 return CostKind == TTI::TCK_CodeSize
3722 ? 0
3723 : ST->getVectorInsertExtractBaseCost() + 1;
Sjoerd Meijer079c4882023-02-09 16:07:17 +00003724
David Greeneb764a72023-06-01 10:54:53 +01003725 // i1 inserts and extract will include an extra cset or cmp of the vector
3726 // value. Increase the cost by 1 to account.
3727 if (Val->getScalarSizeInBits() == 1)
David Greenc6406c82025-03-27 17:25:02 +00003728 return CostKind == TTI::TCK_CodeSize
3729 ? 2
3730 : ST->getVectorInsertExtractBaseCost() + 1;
David Greeneb764a72023-06-01 10:54:53 +01003731
Mingming Liu8aa80062022-06-21 13:38:30 -07003732 // FIXME:
3733 // If the extract-element and insert-element instructions could be
3734 // simplified away (e.g., could be combined into users by looking at use-def
3735 // context), they have no cost. This is not done in the first place for
3736 // compile-time considerations.
Tim Northover3b0846e2014-05-24 12:50:23 +00003737 }
3738
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303739 // In case of Neon, if there exists extractelement from lane != 0 such that
3740 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3741 // 2. extractelement result feeds into fmul.
3742 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3743 // equivalent to 0.
3744 // then the extractelement can be merged with fmul in the backend and it
3745 // incurs no cost.
3746 // e.g.
3747 // define double @foo(<2 x double> %a) {
3748 // %1 = extractelement <2 x double> %a, i32 0
3749 // %2 = extractelement <2 x double> %a, i32 1
3750 // %res = fmul double %1, %2
3751 // ret double %res
3752 // }
3753 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3754 auto ExtractCanFuseWithFmul = [&]() {
3755 // We bail out if the extract is from lane 0.
3756 if (Index == 0)
3757 return false;
3758
3759 // Check if the scalar element type of the vector operand of ExtractElement
3760 // instruction is one of the allowed types.
3761 auto IsAllowedScalarTy = [&](const Type *T) {
3762 return T->isFloatTy() || T->isDoubleTy() ||
3763 (T->isHalfTy() && ST->hasFullFP16());
3764 };
3765
3766 // Check if the extractelement user is scalar fmul.
3767 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3768 // Check if the user is scalar fmul.
David Greend106a392024-11-29 01:11:39 +00003769 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303770 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3771 !BO->getType()->isVectorTy();
3772 };
3773
3774 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3775 // certain scalar type and a certain vector register width.
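    // For example, with 128-bit vector registers and 64-bit elements, lane 2
    // of a <4 x double> starts at bit 128 and so maps to lane 0 of the second
    // 128-bit register, making it equivalent to lane 0 here.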
David Greend106a392024-11-29 01:11:39 +00003776 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303777 auto RegWidth =
3778 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3779 .getFixedValue();
David Greend714b222024-11-29 04:01:03 +00003780 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303781 };
3782
3783 // Check if the type constraints on input vector type and result scalar type
3784 // of extractelement instruction are satisfied.
3785 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3786 return false;
3787
3788 if (Scalar) {
3789 DenseMap<User *, unsigned> UserToExtractIdx;
3790 for (auto *U : Scalar->users()) {
3791 if (!IsUserFMulScalarTy(U))
3792 return false;
3793 // Recording entry for the user is important. Index value is not
3794 // important.
3795 UserToExtractIdx[U];
3796 }
David Greend106a392024-11-29 01:11:39 +00003797 if (UserToExtractIdx.empty())
3798 return false;
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303799 for (auto &[S, U, L] : ScalarUserAndIdx) {
3800 for (auto *U : S->users()) {
Kazu Hirata2d287f52025-05-03 21:55:36 -07003801 if (UserToExtractIdx.contains(U)) {
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303802 auto *FMul = cast<BinaryOperator>(U);
3803 auto *Op0 = FMul->getOperand(0);
3804 auto *Op1 = FMul->getOperand(1);
David Greend106a392024-11-29 01:11:39 +00003805 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303806 UserToExtractIdx[U] = L;
3807 break;
3808 }
3809 }
3810 }
3811 }
3812 for (auto &[U, L] : UserToExtractIdx) {
3813 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3814 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3815 return false;
3816 }
3817 } else {
3818 const auto *EE = cast<ExtractElementInst>(I);
3819
3820 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3821 if (!IdxOp)
3822 return false;
3823
3824 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3825 if (!IsUserFMulScalarTy(U))
3826 return false;
3827
3828 // Check if the other operand of extractelement is also extractelement
3829 // from lane equivalent to 0.
3830 const auto *BO = cast<BinaryOperator>(U);
3831 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3832 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3833 if (OtherEE) {
3834 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3835 if (!IdxOp)
3836 return false;
3837 return IsExtractLaneEquivalentToZero(
3838 cast<ConstantInt>(OtherEE->getIndexOperand())
3839 ->getValue()
3840 .getZExtValue(),
3841 OtherEE->getType()->getScalarSizeInBits());
3842 }
3843 return true;
3844 });
3845 }
3846 return true;
3847 };
3848
3849 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3850 ExtractCanFuseWithFmul())
3851 return 0;
3852
Tim Northover3b0846e2014-05-24 12:50:23 +00003853 // All other insert/extracts cost this much.
David Greenc6406c82025-03-27 17:25:02 +00003854 return CostKind == TTI::TCK_CodeSize ? 1
3855 : ST->getVectorInsertExtractBaseCost();
Tim Northover3b0846e2014-05-24 12:50:23 +00003856}
3857
Mingming Liu8aa80062022-06-21 13:38:30 -07003858InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
ShihPo Hung5fb3a572023-01-21 05:29:05 -08003859 TTI::TargetCostKind CostKind,
David Greenabd2c072025-05-01 15:55:08 +01003860 unsigned Index,
3861 const Value *Op0,
3862 const Value *Op1) const {
Alexey Bataev8cf02902023-04-14 09:35:03 -07003863 bool HasRealUse =
3864 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
David Greenc6406c82025-03-27 17:25:02 +00003865 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303866}
3867
3868InstructionCost AArch64TTIImpl::getVectorInstrCost(
3869 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3870 Value *Scalar,
Sergei Barannikov0014b492025-04-22 06:27:29 +03003871 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
David Greenc6406c82025-03-27 17:25:02 +00003872 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
3873 Scalar, ScalarUserAndIdx);
Mingming Liu8aa80062022-06-21 13:38:30 -07003874}
3875
3876InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
ShihPo Hung5fb3a572023-01-21 05:29:05 -08003877 Type *Val,
3878 TTI::TargetCostKind CostKind,
Sergei Barannikov0014b492025-04-22 06:27:29 +03003879 unsigned Index) const {
David Greenc6406c82025-03-27 17:25:02 +00003880 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
Sushant Gokhale9991ea22024-11-13 11:10:49 +05303881 true /* HasRealUse */, &I);
Mingming Liu8aa80062022-06-21 13:38:30 -07003882}
3883
David Green2a859b22023-07-28 21:26:50 +01003884InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3885 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
Jonas Paulssonf5c8c1e2025-04-30 17:11:27 +02003886 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
3887 ArrayRef<Value *> VL) const {
David Green2a859b22023-07-28 21:26:50 +01003888 if (isa<ScalableVectorType>(Ty))
3889 return InstructionCost::getInvalid();
3890 if (Ty->getElementType()->isFloatingPointTy())
3891 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3892 CostKind);
David Green052225d2025-04-11 20:18:26 +01003893 unsigned VecInstCost =
3894 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
3895 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
David Green2a859b22023-07-28 21:26:50 +01003896}
3897
Sander de Smalen4f42d872021-04-14 16:53:01 +01003898InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
Sam Parker40574fe2020-04-28 14:11:27 +01003899 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
Philip Reames104fa362022-08-20 08:07:28 -07003900 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03003901 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
Philip Reames478cf942022-08-22 12:03:36 -07003902
David Green0b745a12024-08-09 14:25:07 +01003903 // The code-generator is currently not able to handle scalable vectors
3904 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3905 // it. This change will be removed when code-generation for these types is
3906 // sufficiently reliable.
3907 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3908 if (VTy->getElementCount() == ElementCount::getScalable(1))
3909 return InstructionCost::getInvalid();
3910
Sam Parkerfa8bff02020-06-05 08:42:03 +01003911 // TODO: Handle more cost kinds.
3912 if (CostKind != TTI::TCK_RecipThroughput)
Philip Reames104fa362022-08-20 08:07:28 -07003913 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3914 Op2Info, Args, CxtI);
Sam Parkerfa8bff02020-06-05 08:42:03 +01003915
Tim Northover3b0846e2014-05-24 12:50:23 +00003916 // Legalize the type.
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03003917 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
Tim Northover3b0846e2014-05-24 12:50:23 +00003918 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3919
3920 switch (ISD) {
3921 default:
Philip Reames104fa362022-08-20 08:07:28 -07003922 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3923 Op2Info);
Sushant Gokhalec4808742025-03-09 22:26:39 -07003924 case ISD::SREM:
Evandro Menezesf9bd8712018-03-07 22:35:32 +00003925 case ISD::SDIV:
Sushant Gokhalec4808742025-03-09 22:26:39 -07003926 /*
3927 Notes for sdiv/srem specific costs:
3928 1. This only considers the cases where the divisor is constant and uniform
3929 (pow-of-2 or non-pow-of-2). Other cases are not important since they either
3930 result in some form of (ldr + adrp), corresponding to constant vectors, or
3931 scalarization of the division operation.
3932 2. Constant divisors, whether wholly or partially negative, don't result in
3933 significantly different codegen compared to positive constant divisors.
3934 So, we don't consider negative divisors separately.
3935 3. If the codegen is significantly different with SVE, it has been indicated
3936 using comments at appropriate places.
3937
3938 sdiv specific cases:
3939 -----------------------------------------------------------------------
3940 codegen | pow-of-2 | Type
3941 -----------------------------------------------------------------------
3942 add + cmp + csel + asr | Y | i64
3943 add + cmp + csel + asr | Y | i32
3944 -----------------------------------------------------------------------
3945
3946 srem specific cases:
3947 -----------------------------------------------------------------------
3948 codegen | pow-of-2 | Type
3949 -----------------------------------------------------------------------
3950 negs + and + and + csneg | Y | i64
3951 negs + and + and + csneg | Y | i32
3952 -----------------------------------------------------------------------
3953
3954 other sdiv/srem cases:
3955 -------------------------------------------------------------------------
3956 common codegen | + srem | + sdiv | pow-of-2 | Type
3957 -------------------------------------------------------------------------
3958 smulh + asr + add + add | - | - | N | i64
3959 smull + lsr + add + add | - | - | N | i32
3960 usra | and + sub | sshr | Y | <2 x i64>
3961 2 * (scalar code) | - | - | N | <2 x i64>
3962 usra | bic + sub | sshr + neg | Y | <4 x i32>
3963 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
3964 + sshr + usra | | | |
3965 -------------------------------------------------------------------------
3966 */
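    // For example, an i32 sdiv by a uniform power-of-2 constant is costed
    // below as 3 * AddCost + AsrCost (the add+cmp+csel+asr sequence above),
    // while a non-power-of-2 uniform constant divisor is costed as
    // MulCost + AsrCost + 2 * AddCost (the smull+lsr+add+add sequence).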
3967 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3968 InstructionCost AddCost =
3969 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3970 Op1Info.getNoProps(), Op2Info.getNoProps());
3971 InstructionCost AsrCost =
3972 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3973 Op1Info.getNoProps(), Op2Info.getNoProps());
3974 InstructionCost MulCost =
3975 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3976 Op1Info.getNoProps(), Op2Info.getNoProps());
3977 // add/cmp/csel/csneg are assumed to have similar costs, as are
3978 // asr/negs/and.
3979 auto VT = TLI->getValueType(DL, Ty);
David Green9c6eca22025-03-29 19:25:17 +00003980 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
Sushant Gokhalec4808742025-03-09 22:26:39 -07003981 if (Op2Info.isPowerOf2()) {
3982 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
3983 : (3 * AsrCost + AddCost);
3984 } else {
3985 return MulCost + AsrCost + 2 * AddCost;
3986 }
3987 } else if (VT.isVector()) {
3988 InstructionCost UsraCost = 2 * AsrCost;
3989 if (Op2Info.isPowerOf2()) {
3990 // Division with scalable types corresponds to native 'asrd'
3991 // instruction when SVE is available.
3992 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
3993 if (Ty->isScalableTy() && ST->hasSVE())
3994 return 2 * AsrCost;
3995 return UsraCost +
3996 (ISD == ISD::SDIV
3997 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
3998 AsrCost
3999 : 2 * AddCost);
4000 } else if (LT.second == MVT::v2i64) {
4001 return VT.getVectorNumElements() *
4002 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4003 Op1Info.getNoProps(),
4004 Op2Info.getNoProps());
4005 } else {
4006 // When SVE is available, we get:
4007 // smulh + lsr + add/sub + asr + add/sub.
4008 if (Ty->isScalableTy() && ST->hasSVE())
4009 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4010 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4011 }
4012 }
4013 }
4014 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4015 LT.second.isFixedLengthVector()) {
4016 // FIXME: When the constant vector is non-uniform, this may result in
4017 // loading the vector from the constant pool or, in some cases, in
4018 // scalarization. For now, we approximate this with the scalarization
4019 // cost.
4020 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4021 CostKind, -1, nullptr, nullptr);
4022 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4023 CostKind, -1, nullptr, nullptr);
4024 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4025 return ExtractCost + InsertCost +
4026 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4027 CostKind, Op1Info.getNoProps(),
4028 Op2Info.getNoProps());
Evandro Menezesf9bd8712018-03-07 22:35:32 +00004029 }
Fangrui Songde9d80c2022-08-08 11:24:15 -07004030 [[fallthrough]];
David Greena5d8b7a2025-02-26 13:49:48 +00004031 case ISD::UDIV:
4032 case ISD::UREM: {
Jon Roelofsbded3b32024-09-05 07:42:23 -07004033 auto VT = TLI->getValueType(DL, Ty);
David Greena5d8b7a2025-02-26 13:49:48 +00004034 if (Op2Info.isConstant()) {
4035 // If the operand is a power of 2 we can use the shift or and cost.
4036 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4037 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4038 Op1Info.getNoProps(),
4039 Op2Info.getNoProps());
4040 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4041 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4042 Op1Info.getNoProps(),
4043 Op2Info.getNoProps());
4044
4045 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4046 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4047 // The MULHU will be expanded to UMULL for the types not listed below,
4048 // and will become a pair of UMULL+MULL2 for 128bit vectors.
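        // For example, a v4i32 udiv by a non-power-of-2 uniform constant is
        // costed as 2 * MulCost (umull + umull2) + 2 * AddCost + 2 * ShrCost,
        // with an extra MulCost + AddCost for urem.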
4049 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4050 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4051 LT.second == MVT::nxv16i8;
4052 bool Is128bit = LT.second.is128BitVector();
4053
4054 InstructionCost MulCost =
4055 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4056 Op1Info.getNoProps(), Op2Info.getNoProps());
4057 InstructionCost AddCost =
4058 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4059 Op1Info.getNoProps(), Op2Info.getNoProps());
4060 InstructionCost ShrCost =
4061 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4062 Op1Info.getNoProps(), Op2Info.getNoProps());
4063 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4064 (HasMULH ? 0 : ShrCost) + // UMULL shift
4065 AddCost * 2 + ShrCost;
4066 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4067 }
Adhemerval Zanellaf384bc72018-05-09 12:48:22 +00004068 }
4069
Jon Roelofsbded3b32024-09-05 07:42:23 -07004070 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4071 // emitted by the backend even when those functions are not declared in the
4072 // module.
4073 if (!VT.isVector() && VT.getSizeInBits() > 64)
4074 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4075
David Green3c88ff42022-04-03 22:16:39 +01004076 InstructionCost Cost = BaseT::getArithmeticInstrCost(
Philip Reames104fa362022-08-20 08:07:28 -07004077 Opcode, Ty, CostKind, Op1Info, Op2Info);
David Greena5d8b7a2025-02-26 13:49:48 +00004078 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004079 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
Hassnaa Hamdi181f2002022-09-23 11:51:19 +00004080 // SDIV/UDIV operations are lowered using SVE, so the costs are
4081 // lower.
Guillaume Chatelet8fd55582023-01-11 16:48:35 +00004082 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
4083 ->getPrimitiveSizeInBits()
4084 .getFixedValue() < 128) {
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004085 EVT VT = TLI->getValueType(DL, Ty);
4086 static const CostTblEntry DivTbl[]{
4087 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4088 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4089 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4090 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4091 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4092 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4093
4094 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4095 if (nullptr != Entry)
4096 return Entry->Cost;
4097 }
4098 // For 8/16-bit elements, the cost is higher because the type
4099 // requires promotion and possibly splitting:
4100 if (LT.second.getScalarType() == MVT::i8)
4101 Cost *= 8;
4102 else if (LT.second.getScalarType() == MVT::i16)
4103 Cost *= 4;
4104 return Cost;
4105 } else {
Zain Jaffal6e4cea52022-11-28 10:37:31 +02004106 // If one of the operands is a uniform constant then the cost for each
4107 // element is Cost for insertion, extraction and division.
4108 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4109 // operation with scalar type
4110 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4111 (Op2Info.isConstant() && Op2Info.isUniform())) {
4112 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4113 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4114 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4115 return (4 + DivCost) * VTy->getNumElements();
4116 }
4117 }
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004118 // On AArch64, without SVE, vector divisions are expanded
4119 // into scalar divisions of each pair of elements.
David Greenc51b24c2025-04-02 14:51:22 +01004120 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4121 -1, nullptr, nullptr);
4122 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4123 nullptr, nullptr);
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004124 }
4125
Evandro Menezesf9bd8712018-03-07 22:35:32 +00004126 // TODO: if one of the arguments is scalar, then it's not necessary to
4127 // double the cost of handling the vector elements.
4128 Cost += Cost;
4129 }
4130 return Cost;
David Green3c88ff42022-04-03 22:16:39 +01004131 }
Tim Northover3b0846e2014-05-24 12:50:23 +00004132 case ISD::MUL:
Hassnaa Hamdif2072e02022-08-23 15:22:52 +00004133 // When SVE is available, we can lower the v2i64 operation using
4134 // the SVE mul instruction, which has a lower cost.
4135 if (LT.second == MVT::v2i64 && ST->hasSVE())
4136 return LT.first;
4137
4138 // When SVE is not available, there is no MUL.2d instruction,
4139 // which means mul <2 x i64> is expensive as elements are extracted
4140 // from the vectors and the muls scalarized.
4141 // As getScalarizationOverhead is a bit too pessimistic, we
4142 // estimate the cost for a i64 vector directly here, which is:
David Green750bf352022-04-04 17:42:20 +01004143 // - four 2-cost i64 extracts,
4144 // - two 2-cost i64 inserts, and
4145 // - two 1-cost muls.
4146 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4147 // LT.first = 2 the cost is 28. If both operands are extensions it will not
David Green2abaa022022-04-04 12:45:04 +01004148 // need to scalarize so the cost can be cheaper (smull or umull).
David Green1712ae62023-07-12 13:13:06 +01004150 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
David Green2abaa022022-04-04 12:45:04 +01004151 return LT.first;
David Green27a2d3d2025-01-20 11:43:57 +00004152 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4153 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4154 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4155 nullptr, nullptr) *
4156 2 +
4157 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4158 nullptr, nullptr));
Sjoerd Meijer5110ff02020-11-30 11:16:10 +00004159 case ISD::ADD:
Tim Northover3b0846e2014-05-24 12:50:23 +00004160 case ISD::XOR:
4161 case ISD::OR:
4162 case ISD::AND:
David Green65c0e452022-03-03 10:42:57 +00004163 case ISD::SRL:
4164 case ISD::SRA:
4165 case ISD::SHL:
Tim Northover3b0846e2014-05-24 12:50:23 +00004166 // These nodes are marked as 'custom' for combining purposes only.
4167 // We know that they are legal. See LowerAdd in ISelLowering.
David Green3c88ff42022-04-03 22:16:39 +01004168 return LT.first;
Paul Walker3a98d5d2020-06-20 20:23:31 +01004169
Sjoerd Meijerd8278652023-04-11 12:40:14 +01004170 case ISD::FNEG:
David Greenc61d5652024-08-21 18:10:16 +01004171 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
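    // For example, "%m = fmul double %a, %b" followed by "%n = fneg double %m"
    // can be selected as a single fnmul, so the fneg is modelled as free.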
4172 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4173 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4174 CxtI &&
4175 ((CxtI->hasOneUse() &&
4176 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4177 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4178 return 0;
4179 [[fallthrough]];
Paul Walker3a98d5d2020-06-20 20:23:31 +01004180 case ISD::FADD:
David Sherwoodd581d942021-08-31 14:07:50 +01004181 case ISD::FSUB:
Sjoerd Meijerd8278652023-04-11 12:40:14 +01004182 // Increase the cost for half and bfloat types if not architecturally
4183 // supported.
4184 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
4185 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
4186 return 2 * LT.first;
4187 if (!Ty->getScalarType()->isFP128Ty())
4188 return LT.first;
Craig Topper6006d432023-05-24 12:15:23 -07004189 [[fallthrough]];
David Sherwoodd581d942021-08-31 14:07:50 +01004190 case ISD::FMUL:
4191 case ISD::FDIV:
Paul Walker3a98d5d2020-06-20 20:23:31 +01004192 // These nodes are marked as 'custom' just to lower them to SVE.
4193 // We know said lowering will incur no additional cost.
David Sherwoodd581d942021-08-31 14:07:50 +01004194 if (!Ty->getScalarType()->isFP128Ty())
David Green3c88ff42022-04-03 22:16:39 +01004195 return 2 * LT.first;
Paul Walker3a98d5d2020-06-20 20:23:31 +01004196
Philip Reames104fa362022-08-20 08:07:28 -07004197 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4198 Op2Info);
Paschalis Mpeisbbdc62e2024-02-23 09:29:45 +00004199 case ISD::FREM:
4200 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4201 // those functions are not declared in the module.
4202 if (!Ty->isVectorTy())
4203 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4204 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4205 Op2Info);
Tim Northover3b0846e2014-05-24 12:50:23 +00004206 }
4207}
4208
Sergei Barannikov0014b492025-04-22 06:27:29 +03004209InstructionCost
4210AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
4211 const SCEV *Ptr) const {
Tim Northover3b0846e2014-05-24 12:50:23 +00004212 // Address computations in vectorized code with non-consecutive addresses will
4213 // likely result in more instructions compared to scalar code where the
4214 // computation can more often be merged into the index mode. The resulting
4215 // extra micro-ops can significantly decrease throughput.
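  // For example, a vectorized access whose pointer does not have a small
  // constant stride is charged NeonNonConstStrideOverhead below; all other
  // address computations are costed as 1.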
zhongyundedf19d872023-06-07 21:50:54 +08004216 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
Mohammed Agabaria23599ba2017-01-05 14:03:41 +00004217 int MaxMergeDistance = 64;
Tim Northover3b0846e2014-05-24 12:50:23 +00004218
Fangrui Songf78650a2018-07-30 19:41:25 +00004219 if (Ty->isVectorTy() && SE &&
Mohammed Agabaria23599ba2017-01-05 14:03:41 +00004220 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
Tim Northover3b0846e2014-05-24 12:50:23 +00004221 return NumVectorInstToHideOverhead;
4222
4223 // In many cases the address computation is not merged into the instruction
4224 // addressing mode.
4225 return 1;
4226}
4227
Philip Reamesd2885742024-09-25 07:25:57 -07004228InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4229 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4230 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004231 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
Sam Parker37289612020-05-26 14:28:34 +01004232 // TODO: Handle other cost kinds.
4233 if (CostKind != TTI::TCK_RecipThroughput)
Florian Hahnb3b993a2020-11-02 12:40:34 +00004234 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
Philip Reamesd2885742024-09-25 07:25:57 -07004235 Op1Info, Op2Info, I);
Tim Northover3b0846e2014-05-24 12:50:23 +00004236
4237 int ISD = TLI->InstructionOpcodeToISD(Opcode);
Silviu Barangaa3e27ed2015-09-09 15:35:02 +00004238 // We don't lower some vector selects well that are wider than the register
4239 // width.
David Sherwood2e080eb2021-01-19 15:38:03 +00004240 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
Tim Northover3b0846e2014-05-24 12:50:23 +00004241 // We would need this many instructions to hide the scalarization happening.
Chandler Carruth93205eb2015-08-05 18:08:10 +00004242 const int AmortizationCost = 20;
Florian Hahnb3b993a2020-11-02 12:40:34 +00004243
4244 // If VecPred is not set, check if we can get a predicate from the context
4245 // instruction, if its type matches the requested ValTy.
4246 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
Ramkumar Ramachandra4a0d53a2024-12-13 14:18:33 +00004247 CmpPredicate CurrentPred;
Florian Hahnb3b993a2020-11-02 12:40:34 +00004248 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4249 m_Value())))
4250 VecPred = CurrentPred;
4251 }
Florian Hahn17ebd682022-01-31 10:18:28 +00004252 // Check if we have a compare/select chain that can be lowered using
4253 // a (F)CMxx & BFI pair.
4254 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4255 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4256 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4257 VecPred == CmpInst::FCMP_UNE) {
4258 static const auto ValidMinMaxTys = {
4259 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4260 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4261 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4262
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004263 auto LT = getTypeLegalizationCost(ValTy);
Florian Hahn17ebd682022-01-31 10:18:28 +00004264 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4265 (ST->hasFullFP16() &&
4266 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
Florian Hahnb3b993a2020-11-02 12:40:34 +00004267 return LT.first;
4268 }
4269
Craig Topper4b275762015-10-28 04:02:12 +00004270 static const TypeConversionCostTblEntry
Tim Northover3b0846e2014-05-24 12:50:23 +00004271 VectorSelectTbl[] = {
Zhongyundecb353dc2023-06-20 13:12:02 +08004272 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
4273 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
4274 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
4275 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
4276 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
Silviu Barangaa3e27ed2015-09-09 15:35:02 +00004277 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
4278 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
4279 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
Tim Northover3b0846e2014-05-24 12:50:23 +00004280 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
4281 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
4282 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
4283 };
4284
Mehdi Amini44ede332015-07-09 02:09:04 +00004285 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4286 EVT SelValTy = TLI->getValueType(DL, ValTy);
Tim Northover3b0846e2014-05-24 12:50:23 +00004287 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
Craig Topperee0c8592015-10-27 04:14:24 +00004288 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
4289 SelCondTy.getSimpleVT(),
4290 SelValTy.getSimpleVT()))
4291 return Entry->Cost;
Tim Northover3b0846e2014-05-24 12:50:23 +00004292 }
4293 }
Craig Topper9ad93802023-05-13 23:33:00 -07004294
David Green1ba9ec02023-05-14 23:28:11 +01004295 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
Paul Walkera095ebc2025-04-22 11:20:17 +01004296 Type *ValScalarTy = ValTy->getScalarType();
4297 if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) ||
4298 ValScalarTy->isBFloatTy()) {
4299 auto *ValVTy = cast<FixedVectorType>(ValTy);
4300
Paul Walkera095ebc2025-04-22 11:20:17 +01004301 // Without dedicated instructions we promote [b]f16 compares to f32.
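      // For example, without +fullfp16 an <8 x half> fcmp is costed as two
      // fpexts to <8 x float>, an f32 compare, and a truncate of the integer
      // result from <8 x i32> back to <8 x i16>.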
4302 auto *PromotedTy =
4303 VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy);
4304
4305 InstructionCost Cost = 0;
4306 // Promote operands to float vectors.
4307 Cost += 2 * getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
4308 TTI::CastContextHint::None, CostKind);
4309 // Compare float vectors.
4310 Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind,
4311 Op1Info, Op2Info);
4312 // During codegen we'll truncate the vector result from i32 to i16.
4313 Cost +=
4314 getCastInstrCost(Instruction::Trunc, VectorType::getInteger(ValVTy),
4315 VectorType::getInteger(PromotedTy),
4316 TTI::CastContextHint::None, CostKind);
4317 return Cost;
4318 }
Craig Topper9ad93802023-05-13 23:33:00 -07004319 }
4320
David Green5106b222023-07-01 21:59:54 +01004321 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
4322 // FIXME: This can apply to more conditions and add/sub if it can be shown to
4323 // be profitable.
4324 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
4325 ICmpInst::isEquality(VecPred) &&
4326 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4327 match(I->getOperand(1), m_Zero()) &&
4328 match(I->getOperand(0), m_And(m_Value(), m_Value())))
4329 return 0;
4330
David Sherwood2e080eb2021-01-19 15:38:03 +00004331 // The base case handles scalable vectors fine for now, since it treats the
4332 // cost as 1 * legalization cost.
Philip Reamesd2885742024-09-25 07:25:57 -07004333 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4334 Op1Info, Op2Info, I);
Tim Northover3b0846e2014-05-24 12:50:23 +00004335}
4336
Evandro Menezesa005c1a2019-08-05 18:09:14 +00004337AArch64TTIImpl::TTI::MemCmpExpansionOptions
4338AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4339 TTI::MemCmpExpansionOptions Options;
Eli Friedmane9ac7572020-04-06 15:17:02 -07004340 if (ST->requiresStrictAlign()) {
4341 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4342 // a bunch of instructions when strict align is enabled.
4343 return Options;
4344 }
4345 Options.AllowOverlappingLoads = true;
Evandro Menezesa005c1a2019-08-05 18:09:14 +00004346 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4347 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4348 // TODO: Though vector loads usually perform well on AArch64, in some targets
4349 // they may wake up the FP unit, which raises the power consumption. Perhaps
4350 // they could be used with no holds barred (-O3).
4351 Options.LoadSizes = {8, 4, 2, 1};
Igor Kirillov849f9632023-10-30 18:40:48 +00004352 Options.AllowedTailExpansions = {3, 5, 6};
Evandro Menezesa005c1a2019-08-05 18:09:14 +00004353 return Options;
4354}
4355
Tiehu Zhangb3291562022-06-17 18:24:23 +08004356bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4357 return ST->hasSVE();
4358}
4359
David Sherwooda458b782021-04-16 16:08:38 +01004360InstructionCost
4361AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4362 Align Alignment, unsigned AddressSpace,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004363 TTI::TargetCostKind CostKind) const {
Matthew Devereaue00f22c2021-08-19 11:42:20 +01004364 if (useNeonVector(Src))
David Sherwooda458b782021-04-16 16:08:38 +01004365 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4366 CostKind);
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004367 auto LT = getTypeLegalizationCost(Src);
Kerry McLaughlin5db52752021-06-08 10:49:22 +01004368 if (!LT.first.isValid())
4369 return InstructionCost::getInvalid();
Sander de Smaleneac16702021-07-14 09:43:30 +01004370
David Sherwood2dd41672024-06-25 15:04:24 +01004371 // Return an invalid cost for element types that we are unable to lower.
4372 auto *VT = cast<VectorType>(Src);
4373 if (VT->getElementType()->isIntegerTy(1))
4374 return InstructionCost::getInvalid();
4375
Sander de Smaleneac16702021-07-14 09:43:30 +01004376 // The code-generator is currently not able to handle scalable vectors
4377 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4378 // it. This change will be removed when code-generation for these types is
4379 // sufficiently reliable.
David Sherwood2dd41672024-06-25 15:04:24 +01004380 if (VT->getElementCount() == ElementCount::getScalable(1))
Sander de Smaleneac16702021-07-14 09:43:30 +01004381 return InstructionCost::getInvalid();
4382
liqinweng6efb45f2022-12-09 12:45:42 +08004383 return LT.first;
David Sherwooda458b782021-04-16 16:08:38 +01004384}
4385
Madhur Amilkanthwarb73771c2024-08-14 10:12:40 +05304386 // This function returns the gather/scatter overhead, either from a
4387 // user-provided override or from the specialized per-target value in \p ST.
4388static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4389 const AArch64Subtarget *ST) {
4390 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4391 "Should be called on only load or stores.");
4392 switch (Opcode) {
4393 case Instruction::Load:
4394 if (SVEGatherOverhead.getNumOccurrences() > 0)
4395 return SVEGatherOverhead;
4396 return ST->getGatherOverhead();
4397 break;
4398 case Instruction::Store:
4399 if (SVEScatterOverhead.getNumOccurrences() > 0)
4400 return SVEScatterOverhead;
4401 return ST->getScatterOverhead();
4402 break;
4403 default:
4404 llvm_unreachable("Shouldn't have reached here");
4405 }
David Sherwood8b0448c2021-12-06 11:02:29 +00004406}
4407
Sander de Smalenfd1f8a52021-01-22 21:25:50 +00004408InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
Caroline Concatto060cfd92020-12-17 16:15:28 +00004409 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004410 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
Antonio Frighetto138e6c12023-10-27 17:30:31 +02004411 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
Caroline Concatto01c190e2021-01-07 09:07:06 +00004412 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4413 Alignment, CostKind, I);
Caroline Concatto060cfd92020-12-17 16:15:28 +00004414 auto *VT = cast<VectorType>(DataTy);
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004415 auto LT = getTypeLegalizationCost(DataTy);
Kerry McLaughlin5db52752021-06-08 10:49:22 +01004416 if (!LT.first.isValid())
4417 return InstructionCost::getInvalid();
4418
David Sherwood2dd41672024-06-25 15:04:24 +01004419 // Return an invalid cost for element types that we are unable to lower.
Antonio Frighetto138e6c12023-10-27 17:30:31 +02004420 if (!LT.second.isVector() ||
David Sherwood2dd41672024-06-25 15:04:24 +01004421 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4422 VT->getElementType()->isIntegerTy(1))
Antonio Frighetto138e6c12023-10-27 17:30:31 +02004423 return InstructionCost::getInvalid();
4424
Sander de Smaleneac16702021-07-14 09:43:30 +01004425 // The code-generator is currently not able to handle scalable vectors
4426 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4427 // it. This change will be removed when code-generation for these types is
4428 // sufficiently reliable.
David Sherwood2dd41672024-06-25 15:04:24 +01004429 if (VT->getElementCount() == ElementCount::getScalable(1))
Sander de Smaleneac16702021-07-14 09:43:30 +01004430 return InstructionCost::getInvalid();
4431
Caroline Concatto060cfd92020-12-17 16:15:28 +00004432 ElementCount LegalVF = LT.second.getVectorElementCount();
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004433 InstructionCost MemOpCost =
Alexey Bataevd53e2452022-08-19 05:13:25 -07004434 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
Philip Reamesc9608d52022-08-22 15:26:36 -07004435 {TTI::OK_AnyValue, TTI::OP_None}, I);
David Sherwood8b0448c2021-12-06 11:02:29 +00004436 // Add on an overhead cost for using gathers/scatters.
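// Illustrative: the cost returned below is the scalar memory-op cost scaled
// by the per-subtarget overhead (typically 10) and by the maximum number of
// vector elements, so gathers/scatters are modelled as far more expensive
// than contiguous accesses.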
Madhur Amilkanthwarb73771c2024-08-14 10:12:40 +05304437 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
David Sherwood9448cdc2021-09-22 10:54:05 +01004438 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
Caroline Concatto060cfd92020-12-17 16:15:28 +00004439}
4440
Caroline Concatto37f4ccb2020-11-06 15:53:59 +00004441bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4442 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4443}
4444
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004445InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
Sergei Barannikov3334c352025-04-22 11:40:12 +03004446 Align Alignment,
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004447 unsigned AddressSpace,
4448 TTI::TargetCostKind CostKind,
Philip Reamesc9608d52022-08-22 15:26:36 -07004449 TTI::OperandValueInfo OpInfo,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03004450 const Instruction *I) const {
Sjoerd Meijeree752132021-07-01 14:45:54 +01004451 EVT VT = TLI->getValueType(DL, Ty, true);
Sam Parker5b5e78ad2020-06-08 15:25:03 +01004452 // Type legalization can't handle structs
Sjoerd Meijeree752132021-07-01 14:45:54 +01004453 if (VT == MVT::Other)
Sam Parker5b5e78ad2020-06-08 15:25:03 +01004454 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4455 CostKind);
4456
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03004457 auto LT = getTypeLegalizationCost(Ty);
Kerry McLaughlin5db52752021-06-08 10:49:22 +01004458 if (!LT.first.isValid())
4459 return InstructionCost::getInvalid();
Tim Northover3b0846e2014-05-24 12:50:23 +00004460
Sander de Smaleneac16702021-07-14 09:43:30 +01004461 // The code-generator is currently not able to handle scalable vectors
4462 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4463 // it. This change will be removed when code-generation for these types is
4464 // sufficiently reliable.
David Sherwood2dd41672024-06-25 15:04:24 +01004465 // We also only support full register predicate loads and stores.
Sander de Smaleneac16702021-07-14 09:43:30 +01004466 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
David Sherwood2dd41672024-06-25 15:04:24 +01004467 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4468 (VTy->getElementType()->isIntegerTy(1) &&
4469 !VTy->getElementCount().isKnownMultipleOf(
4470 ElementCount::getScalable(16))))
Sander de Smaleneac16702021-07-14 09:43:30 +01004471 return InstructionCost::getInvalid();
4472
Florian Hahnacd9cc72021-04-15 09:22:32 +01004473 // TODO: consider latency as well for TCK_SizeAndLatency.
4474 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4475 return LT.first;
4476
4477 if (CostKind != TTI::TCK_RecipThroughput)
4478 return 1;
4479
Matthew Simpson2c8de192016-12-15 18:36:59 +00004480 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
Sergei Barannikov3334c352025-04-22 11:40:12 +03004481 LT.second.is128BitVector() && Alignment < Align(16)) {
Evandro Menezes330e1b82017-01-10 23:42:21 +00004482 // Unaligned stores are extremely inefficient. We don't split all
4483 // unaligned 128-bit stores because of the negative impact that splitting has
4484 // shown in practice on inlined block copy code.
4485 // We make such stores expensive so that we will only vectorize if there
Tim Northover3b0846e2014-05-24 12:50:23 +00004486 // are 6 other instructions getting vectorized.
Evandro Menezes330e1b82017-01-10 23:42:21 +00004487 const int AmortizationCost = 6;
Tim Northover3b0846e2014-05-24 12:50:23 +00004488
4489 return LT.first * 2 * AmortizationCost;
4490 }
4491
Sjoerd Meijer5c94fab2022-12-16 09:20:37 +00004492 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4493 if (Ty->isPtrOrPtrVectorTy())
4494 return LT.first;
4495
Florian Hahne473daa2024-01-17 21:32:06 +00004496 if (useNeonVector(Ty)) {
4497 // Check truncating stores and extending loads.
4498 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4499 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4500 if (VT == MVT::v4i8)
4501 return 2;
4502 // Otherwise we need to scalarize.
4503 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4504 }
4505 EVT EltVT = VT.getVectorElementType();
4506 unsigned EltSize = EltVT.getScalarSizeInBits();
4507 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
Sergei Barannikov3334c352025-04-22 11:40:12 +03004508 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
Florian Hahne473daa2024-01-17 21:32:06 +00004509 return LT.first;
4510 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4511 // widening to v4i8, which produces suboptimal results.
4512 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4513 return LT.first;
4514
4515 // Check non-power-of-2 loads/stores for legal vector element types with
4516 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4517 // operations on smaller power-of-2 ops, including ld1/st1.
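// Illustrative: a v7i16 access is split as v4i16 + v3i16, and the v3i16
// piece again as v2i16 + v1i16, giving a total cost of 3 (one per
// power-of-2 piece).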
4518 LLVMContext &C = Ty->getContext();
4519 InstructionCost Cost(0);
4520 SmallVector<EVT> TypeWorklist;
4521 TypeWorklist.push_back(VT);
4522 while (!TypeWorklist.empty()) {
4523 EVT CurrVT = TypeWorklist.pop_back_val();
4524 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4525 if (isPowerOf2_32(CurrNumElements)) {
4526 Cost += 1;
4527 continue;
4528 }
4529
4530 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4531 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4532 TypeWorklist.push_back(
4533 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4534 }
4535 return Cost;
Tim Northover3b0846e2014-05-24 12:50:23 +00004536 }
4537
4538 return LT.first;
4539}
James Molloy2b8933c2014-08-05 12:30:34 +00004540
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004541InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
Guillaume Chateletfdc7c7f2020-06-26 11:00:53 +00004542 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4543 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004544 bool UseMaskForCond, bool UseMaskForGaps) const {
Hao Liu7ec8ee32015-06-26 02:32:07 +00004545 assert(Factor >= 2 && "Invalid interleave factor");
Graham Hunter95bfb192023-03-21 11:48:49 +00004546 auto *VecVTy = cast<VectorType>(VecTy);
4547
Philip Reamesb3c687b2024-10-15 07:37:46 -07004548 if (VecTy->isScalableTy() && !ST->hasSVE())
Graham Hunter95bfb192023-03-21 11:48:49 +00004549 return InstructionCost::getInvalid();
Hao Liu7ec8ee32015-06-26 02:32:07 +00004550
Igor Kirillov17bde322023-06-12 10:18:16 +00004551 // Vectorization for masked interleaved accesses is only enabled for scalable
4552 // VF.
4553 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4554 return InstructionCost::getInvalid();
4555
4556 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
Graham Hunter95bfb192023-03-21 11:48:49 +00004557 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
Christopher Tetreault616d8d92020-06-16 13:42:58 -07004558 auto *SubVecTy =
Graham Hunter95bfb192023-03-21 11:48:49 +00004559 VectorType::get(VecVTy->getElementType(),
4560 VecVTy->getElementCount().divideCoefficientBy(Factor));
Hao Liu7ec8ee32015-06-26 02:32:07 +00004561
4562 // ldN/stN only support legal vector types of size 64 or 128 in bits.
Matthew Simpsonaee97712017-03-02 15:15:35 +00004563 // Accesses having vector types that are a multiple of 128 bits can be
4564 // matched to more than one ldN/stN instruction.
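// Illustrative: an interleaved group of <16 x i32> with Factor == 4 maps to
// a single ld4/st4 (cost 4), while <32 x i32> with Factor == 4 needs two
// ld4/st4 sequences (cost 8).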
Bradley Smith13faa5f2021-10-18 12:29:26 +00004565 bool UseScalable;
Graham Hunter95bfb192023-03-21 11:48:49 +00004566 if (MinElts % Factor == 0 &&
Bradley Smith13faa5f2021-10-18 12:29:26 +00004567 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4568 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
Hao Liu7ec8ee32015-06-26 02:32:07 +00004569 }
4570
4571 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Sam Parker40574fe2020-04-28 14:11:27 +01004572 Alignment, AddressSpace, CostKind,
Dorit Nuzman34da6dd2018-10-31 09:57:56 +00004573 UseMaskForCond, UseMaskForGaps);
Hao Liu7ec8ee32015-06-26 02:32:07 +00004574}
4575
Daniil Fukalove1cb98b2021-05-20 12:09:16 +03004576InstructionCost
Sergei Barannikov0014b492025-04-22 06:27:29 +03004577AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
Sander de Smalen03f47bd2021-01-23 12:14:21 +00004578 InstructionCost Cost = 0;
Sam Parker40574fe2020-04-28 14:11:27 +01004579 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
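// Illustrative: each 128-bit vector live across a call is modelled as a
// spill/reload pair, since AAPCS64 only preserves the low 64 bits of the
// callee-saved SIMD registers (v8-v15).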
James Molloy2b8933c2014-08-05 12:30:34 +00004580 for (auto *I : Tys) {
4581 if (!I->isVectorTy())
4582 continue;
Christopher Tetreaultab35ba52020-06-30 11:07:24 -07004583 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4584 128)
Sam Parker40574fe2020-04-28 14:11:27 +01004585 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4586 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
James Molloy2b8933c2014-08-05 12:30:34 +00004587 }
Daniil Fukalove1cb98b2021-05-20 12:09:16 +03004588 return Cost;
James Molloy2b8933c2014-08-05 12:30:34 +00004589}
James Molloya88896b2014-08-21 00:02:51 +00004590
Sergei Barannikov0014b492025-04-22 06:27:29 +03004591unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
Matthias Braun651cff42016-06-02 18:03:53 +00004592 return ST->getMaxInterleaveFactor();
James Molloya88896b2014-08-21 00:02:51 +00004593}
Kevin Qin72a799a2014-10-09 10:13:27 +00004594
Geoff Berry378374d2017-06-28 18:53:09 +00004595// For Falkor, we want to avoid having too many strided loads in a loop since
4596// that can exhaust the HW prefetcher resources. We adjust the unroller
4597// MaxCount preference below to attempt to ensure unrolling doesn't create too
4598// many strided loads.
4599static void
4600getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4601 TargetTransformInfo::UnrollingPreferences &UP) {
Geoff Berry0abd9802017-06-28 19:36:10 +00004602 enum { MaxStridedLoads = 7 };
Geoff Berry378374d2017-06-28 18:53:09 +00004603 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4604 int StridedLoads = 0;
4605 // FIXME? We could make this more precise by looking at the CFG and
4606 // e.g. not counting loads in each side of an if-then-else diamond.
4607 for (const auto BB : L->blocks()) {
4608 for (auto &I : *BB) {
4609 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4610 if (!LMemI)
4611 continue;
4612
4613 Value *PtrValue = LMemI->getPointerOperand();
4614 if (L->isLoopInvariant(PtrValue))
4615 continue;
4616
4617 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4618 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4619 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4620 continue;
4621
4622 // FIXME? We could take pairing of unrolled load copies into account
4623 // by looking at the AddRec, but we would probably have to limit this
4624 // to loops with no stores or other memory optimization barriers.
4625 ++StridedLoads;
4626 // We've seen enough strided loads that seeing more won't make a
4627 // difference.
4628 if (StridedLoads > MaxStridedLoads / 2)
4629 return StridedLoads;
4630 }
4631 }
4632 return StridedLoads;
4633 };
4634
4635 int StridedLoads = countStridedLoads(L, SE);
Nicola Zaghend34e60c2018-05-14 12:53:11 +00004636 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4637 << " strided loads\n");
Geoff Berry378374d2017-06-28 18:53:09 +00004638 // Pick the largest power of 2 unroll count that won't result in too many
4639 // strided loads.
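// Illustrative: with MaxStridedLoads == 7, one detected strided load caps
// MaxCount at 4, two or three cap it at 2, and four cap it at 1.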
4640 if (StridedLoads) {
4641 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
Nicola Zaghend34e60c2018-05-14 12:53:11 +00004642 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4643 << UP.MaxCount << '\n');
Geoff Berry378374d2017-06-28 18:53:09 +00004644 }
4645}
4646
David Sherwood712c2132025-04-09 10:34:27 +01004647// This function returns true if the loop:
4648// 1. Has a valid cost, and
4649// 2. Has a cost within the supplied budget.
4650// Otherwise it returns false.
Sergei Barannikov0014b492025-04-22 06:27:29 +03004651static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
David Sherwood712c2132025-04-09 10:34:27 +01004652 InstructionCost Budget,
4653 unsigned *FinalSize) {
4654 // Estimate the size of the loop.
4655 InstructionCost LoopCost = 0;
4656
4657 for (auto *BB : L->getBlocks()) {
4658 for (auto &I : *BB) {
4659 SmallVector<const Value *, 4> Operands(I.operand_values());
4660 InstructionCost Cost =
4661 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4662 // This can happen with intrinsics that don't currently have a cost model
4663 // or for some operations that require SVE.
4664 if (!Cost.isValid())
4665 return false;
4666
4667 LoopCost += Cost;
4668 if (LoopCost > Budget)
4669 return false;
4670 }
4671 }
4672
4673 if (FinalSize)
David Green98b6f8d2025-04-23 07:46:27 +01004674 *FinalSize = LoopCost.getValue();
David Sherwood712c2132025-04-09 10:34:27 +01004675 return true;
4676}
4677
4678static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004679 const AArch64TTIImpl &TTI) {
David Sherwood712c2132025-04-09 10:34:27 +01004680 // Only consider loops with unknown trip counts for which we can determine
4681 // a symbolic expression. Multi-exit loops with small known trip counts will
4682 // likely be unrolled anyway.
4683 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4684 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
4685 return false;
4686
4687 // It might not be worth unrolling loops with low max trip counts. Restrict
4688 // this to max trip counts > 32 for now.
4689 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4690 if (MaxTC > 0 && MaxTC <= 32)
4691 return false;
4692
4693 // Make sure the loop size is <= 5.
4694 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
4695 return false;
4696
4697 // Small search loops with multiple exits can be highly beneficial to unroll.
4698 // We only care about loops with exactly two exiting blocks, although each
4699 // block could jump to the same exit block.
4700 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4701 if (Blocks.size() != 2)
4702 return false;
4703
4704 if (any_of(Blocks, [](BasicBlock *BB) {
4705 return !isa<BranchInst>(BB->getTerminator());
4706 }))
4707 return false;
4708
4709 return true;
4710}
4711
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004712/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4713/// OOO engine's wide instruction window and various predictors.
4714static void
4715getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4716 TargetTransformInfo::UnrollingPreferences &UP,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004717 const AArch64TTIImpl &TTI) {
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004718 // Limit this to loops with structure that is highly likely to benefit from
Florian Hahn46a13a52025-02-27 14:42:45 +00004719 // runtime unrolling; that is, we exclude outer loops and loops with many
4720 // blocks (i.e. likely with complex control flow). Note that the heuristics
4721 // here may be overly conservative and we err on the side of avoiding runtime
4722 // unrolling rather than unrolling excessively. They are all subject to further refinement.
4723 if (!L->isInnermost() || L->getNumBlocks() > 8)
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004724 return;
4725
David Sherwood712c2132025-04-09 10:34:27 +01004726 // Loops with multiple exits are handled by common code.
4727 if (!L->getExitBlock())
4728 return;
4729
Florian Hahn46a13a52025-02-27 14:42:45 +00004730 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004731 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4732 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4733 SE.getSmallConstantMaxTripCount(L) <= 32))
4734 return;
David Sherwood712c2132025-04-09 10:34:27 +01004735
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004736 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4737 return;
4738
Florian Hahn46a13a52025-02-27 14:42:45 +00004739 if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
4740 return;
4741
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004742 // Limit to loops with trip counts that are cheap to expand.
4743 UP.SCEVExpansionBudget = 1;
4744
4745 // Try to unroll small, single block loops, if they have load/store
4746 // dependencies, to expose more parallel memory access streams.
Florian Hahnd486b762024-12-22 13:10:54 +00004747 BasicBlock *Header = L->getHeader();
4748 if (Header == L->getLoopLatch()) {
David Sherwood712c2132025-04-09 10:34:27 +01004749 // Estimate the size of the loop.
4750 unsigned Size;
4751 if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
Florian Hahnd486b762024-12-22 13:10:54 +00004752 return;
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004753
Florian Hahnd486b762024-12-22 13:10:54 +00004754 SmallPtrSet<Value *, 8> LoadedValues;
4755 SmallVector<StoreInst *> Stores;
4756 for (auto *BB : L->blocks()) {
4757 for (auto &I : *BB) {
4758 Value *Ptr = getLoadStorePointerOperand(&I);
4759 if (!Ptr)
4760 continue;
4761 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4762 if (SE.isLoopInvariant(PtrSCEV, L))
4763 continue;
4764 if (isa<LoadInst>(&I))
4765 LoadedValues.insert(&I);
4766 else
4767 Stores.push_back(cast<StoreInst>(&I));
4768 }
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004769 }
Florian Hahnd486b762024-12-22 13:10:54 +00004770
4771 // Try to find an unroll count that maximizes the use of the instruction
4772 // window, i.e. trying to fetch as many instructions per cycle as possible.
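// Illustrative: for a 6-instruction loop body the search below settles on an
// unroll count of 8 (48 instructions, a multiple of the assumed
// 16-instruction fetch budget); larger bodies get smaller counts so the
// unrolled body stays within 48 instructions.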
4773 unsigned MaxInstsPerLine = 16;
4774 unsigned UC = 1;
4775 unsigned BestUC = 1;
4776 unsigned SizeWithBestUC = BestUC * Size;
4777 while (UC <= 8) {
4778 unsigned SizeWithUC = UC * Size;
4779 if (SizeWithUC > 48)
4780 break;
4781 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4782 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4783 BestUC = UC;
4784 SizeWithBestUC = BestUC * Size;
4785 }
4786 UC++;
4787 }
4788
4789 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4790 return LoadedValues.contains(SI->getOperand(0));
4791 }))
4792 return;
4793
4794 UP.Runtime = true;
4795 UP.DefaultUnrollRuntimeCount = BestUC;
4796 return;
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004797 }
4798
Florian Hahnd486b762024-12-22 13:10:54 +00004799 // Try to runtime-unroll loops with early-continues depending on loop-varying
4800 // loads; this helps with branch-prediction for the early-continues.
4801 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4802 auto *Latch = L->getLoopLatch();
4803 SmallVector<BasicBlock *> Preds(predecessors(Latch));
4804 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4805 none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4806 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004807 return;
4808
Florian Hahnd486b762024-12-22 13:10:54 +00004809 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4810 [&](Instruction *I, unsigned Depth) -> bool {
4811 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4812 return false;
4813
4814 if (isa<LoadInst>(I))
4815 return true;
4816
4817 return any_of(I->operands(), [&](Value *V) {
4818 auto *I = dyn_cast<Instruction>(V);
4819 return I && DependsOnLoopLoad(I, Depth + 1);
4820 });
4821 };
4822 CmpPredicate Pred;
4823 Instruction *I;
4824 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4825 m_Value())) &&
4826 DependsOnLoopLoad(I, 0)) {
4827 UP.Runtime = true;
4828 }
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004829}
4830
Sergei Barannikov0014b492025-04-22 06:27:29 +03004831void AArch64TTIImpl::getUnrollingPreferences(
4832 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
4833 OptimizationRemarkEmitter *ORE) const {
Kevin Qinaef68412015-03-09 06:14:28 +00004834 // Enable partial unrolling and runtime unrolling.
Roman Lebedev6f6e9a82021-08-03 00:57:26 +03004835 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
Kevin Qinaef68412015-03-09 06:14:28 +00004836
Jingu Kang94c495292021-07-14 11:43:29 +01004837 UP.UpperBound = true;
4838
Kevin Qinaef68412015-03-09 06:14:28 +00004839 // For inner loop, it is more likely to be a hot one, and the runtime check
4840 // can be promoted out from LICM pass, so the overhead is less, let's try
4841 // a larger threshold to unroll more loops.
4842 if (L->getLoopDepth() > 1)
4843 UP.PartialThreshold *= 2;
4844
Kevin Qin72a799a2014-10-09 10:13:27 +00004845 // Disable partial & runtime unrolling on -Os.
4846 UP.PartialOptSizeThreshold = 0;
Geoff Berry378374d2017-06-28 18:53:09 +00004847
David Sherwood712c2132025-04-09 10:34:27 +01004848 // Scan the loop: don't unroll loops with calls as this could prevent
4849 // inlining. Don't unroll vector loops either, as they don't benefit much from
4850 // unrolling.
4851 for (auto *BB : L->getBlocks()) {
4852 for (auto &I : *BB) {
4853 // Don't unroll vectorised loops.
4854 if (I.getType()->isVectorTy())
4855 return;
4856
4857 if (isa<CallBase>(I)) {
4858 if (isa<CallInst>(I) || isa<InvokeInst>(I))
4859 if (const Function *F = cast<CallBase>(I).getCalledFunction())
4860 if (!isLoweredToCall(F))
4861 continue;
4862 return;
4863 }
4864 }
4865 }
4866
Florian Hahn0bb7bd42024-12-09 14:28:31 +00004867 // Apply subtarget-specific unrolling preferences.
4868 switch (ST->getProcFamily()) {
4869 case AArch64Subtarget::AppleA14:
4870 case AArch64Subtarget::AppleA15:
4871 case AArch64Subtarget::AppleA16:
4872 case AArch64Subtarget::AppleM4:
4873 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4874 break;
4875 case AArch64Subtarget::Falkor:
4876 if (EnableFalkorHWPFUnrollFix)
4877 getFalkorUnrollingPreferences(L, SE, UP);
4878 break;
4879 default:
4880 break;
4881 }
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004882
David Sherwood712c2132025-04-09 10:34:27 +01004883 // If this is a small, multi-exit loop similar to std::find, then unrolling
4884 // typically improves performance.
4885 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
4886 UP.RuntimeUnrollMultiExit = true;
4887 UP.Runtime = true;
4888 // Limit unroll count.
4889 UP.DefaultUnrollRuntimeCount = 4;
4890 // Allow slightly more costly trip-count expansion to catch search loops
4891 // with pointer inductions.
4892 UP.SCEVExpansionBudget = 5;
4893 return;
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004894 }
4895
4896 // Enable runtime unrolling for in-order models.
4897 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so by
4898 // checking for that case, we can ensure that the default behaviour is
4899 // unchanged.
David Green6424abc2025-02-07 10:16:57 +00004900 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004901 !ST->getSchedModel().isOutOfOrder()) {
4902 UP.Runtime = true;
4903 UP.Partial = true;
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004904 UP.UnrollRemainder = true;
4905 UP.DefaultUnrollRuntimeCount = 4;
Nicholas Guy3043cbc2021-05-26 14:49:58 +01004906
4907 UP.UnrollAndJam = true;
4908 UP.UnrollAndJamInnerLoopThreshold = 60;
Nicholas Guy2b6e0c92021-03-04 14:36:13 +00004909 }
Kevin Qin72a799a2014-10-09 10:13:27 +00004910}
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004911
Sidharth Bavejae541e1b2020-07-10 18:38:08 +00004912void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004913 TTI::PeelingPreferences &PP) const {
Sidharth Bavejae541e1b2020-07-10 18:38:08 +00004914 BaseT::getPeelingPreferences(L, SE, PP);
4915}
4916
Sergei Barannikov0014b492025-04-22 06:27:29 +03004917Value *
4918AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
4919 Type *ExpectedType) const {
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004920 switch (Inst->getIntrinsicID()) {
4921 default:
4922 return nullptr;
4923 case Intrinsic::aarch64_neon_st2:
4924 case Intrinsic::aarch64_neon_st3:
4925 case Intrinsic::aarch64_neon_st4: {
4926 // Create a struct type
4927 StructType *ST = dyn_cast<StructType>(ExpectedType);
4928 if (!ST)
4929 return nullptr;
Kazu Hiratac1e32b32021-10-02 12:06:29 -07004930 unsigned NumElts = Inst->arg_size() - 1;
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004931 if (ST->getNumElements() != NumElts)
4932 return nullptr;
4933 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4934 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4935 return nullptr;
4936 }
Manuel Brito1e55d5b2022-11-21 18:41:01 +00004937 Value *Res = PoisonValue::get(ExpectedType);
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004938 IRBuilder<> Builder(Inst);
4939 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4940 Value *L = Inst->getArgOperand(i);
4941 Res = Builder.CreateInsertValue(Res, L, i);
4942 }
4943 return Res;
4944 }
4945 case Intrinsic::aarch64_neon_ld2:
4946 case Intrinsic::aarch64_neon_ld3:
4947 case Intrinsic::aarch64_neon_ld4:
4948 if (Inst->getType() == ExpectedType)
4949 return Inst;
4950 return nullptr;
4951 }
4952}
4953
Chandler Carruth705b1852015-01-31 03:43:40 +00004954bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
Sergei Barannikov0014b492025-04-22 06:27:29 +03004955 MemIntrinsicInfo &Info) const {
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004956 switch (Inst->getIntrinsicID()) {
4957 default:
4958 break;
4959 case Intrinsic::aarch64_neon_ld2:
4960 case Intrinsic::aarch64_neon_ld3:
4961 case Intrinsic::aarch64_neon_ld4:
4962 Info.ReadMem = true;
4963 Info.WriteMem = false;
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004964 Info.PtrVal = Inst->getArgOperand(0);
4965 break;
4966 case Intrinsic::aarch64_neon_st2:
4967 case Intrinsic::aarch64_neon_st3:
4968 case Intrinsic::aarch64_neon_st4:
4969 Info.ReadMem = false;
4970 Info.WriteMem = true;
Kazu Hiratac1e32b32021-10-02 12:06:29 -07004971 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
Chad Rosierf9327d6f2015-01-26 22:51:15 +00004972 break;
4973 }
4974
4975 switch (Inst->getIntrinsicID()) {
4976 default:
4977 return false;
4978 case Intrinsic::aarch64_neon_ld2:
4979 case Intrinsic::aarch64_neon_st2:
4980 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4981 break;
4982 case Intrinsic::aarch64_neon_ld3:
4983 case Intrinsic::aarch64_neon_st3:
4984 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4985 break;
4986 case Intrinsic::aarch64_neon_ld4:
4987 case Intrinsic::aarch64_neon_st4:
4988 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4989 break;
4990 }
4991 return true;
4992}
Adam Nemet53e758f2016-03-18 00:27:29 +00004993
Jun Bum Limdee55652017-04-03 19:20:07 +00004994/// See if \p I should be considered for address type promotion. We check if \p
4995 /// I is a sext with the right type and used in memory accesses. If it is used
4996 /// in a "complex" getelementptr, we allow it to be promoted without finding
4997 /// other sext instructions that sign extended the same initial value. A
4998 /// getelementptr is considered "complex" if it has more than 2 operands.
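/// For example (illustrative IR), the sext feeding the three-operand GEP in
///   %idx = sext i32 %i to i64
///   %gep = getelementptr inbounds [64 x i32], ptr %base, i64 0, i64 %idx
/// may be promoted without a common header, since the extension is expected
/// to fold into the 64-bit address computation.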
4999bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
Sergei Barannikov0014b492025-04-22 06:27:29 +03005000 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
Jun Bum Limdee55652017-04-03 19:20:07 +00005001 bool Considerable = false;
5002 AllowPromotionWithoutCommonHeader = false;
5003 if (!isa<SExtInst>(&I))
5004 return false;
5005 Type *ConsideredSExtType =
5006 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5007 if (I.getType() != ConsideredSExtType)
5008 return false;
5009 // See if the sext is the one with the right type and used in at least one
5010 // GetElementPtrInst.
5011 for (const User *U : I.users()) {
5012 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5013 Considerable = true;
5014 // A getelementptr is considered as "complex" if it has more than 2
5015 // operands. We will promote a SExt used in such complex GEP as we
5016 // expect some computation to be merged if they are done on 64 bits.
5017 if (GEPInst->getNumOperands() > 2) {
5018 AllowPromotionWithoutCommonHeader = true;
5019 break;
5020 }
5021 }
5022 }
5023 return Considerable;
5024}
5025
Simon Pilgrim5e6bfb62021-06-11 10:19:37 +01005026bool AArch64TTIImpl::isLegalToVectorizeReduction(
5027 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
Kerry McLaughlinba1e1502021-02-16 10:43:42 +00005028 if (!VF.isScalable())
5029 return true;
5030
5031 Type *Ty = RdxDesc.getRecurrenceType();
Kerry McLaughlina7512402021-07-06 10:49:43 +01005032 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
Kerry McLaughlinba1e1502021-02-16 10:43:42 +00005033 return false;
5034
5035 switch (RdxDesc.getRecurrenceKind()) {
5036 case RecurKind::Add:
5037 case RecurKind::FAdd:
5038 case RecurKind::And:
5039 case RecurKind::Or:
5040 case RecurKind::Xor:
5041 case RecurKind::SMin:
5042 case RecurKind::SMax:
5043 case RecurKind::UMin:
5044 case RecurKind::UMax:
5045 case RecurKind::FMin:
5046 case RecurKind::FMax:
Rosie Sumpterc2441b62021-10-11 15:50:44 +01005047 case RecurKind::FMulAdd:
Mel Chen425e9e82023-07-19 02:51:15 -07005048 case RecurKind::IAnyOf:
5049 case RecurKind::FAnyOf:
Kerry McLaughlinba1e1502021-02-16 10:43:42 +00005050 return true;
5051 default:
5052 return false;
5053 }
5054}
5055
Sander de Smalen2285dfb2021-01-22 22:07:09 +00005056InstructionCost
David Green12025ce2023-07-04 15:02:30 +01005057AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5058 FastMathFlags FMF,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005059 TTI::TargetCostKind CostKind) const {
David Green0b745a12024-08-09 14:25:07 +01005060 // The code-generator is currently not able to handle scalable vectors
5061 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5062 // it. This change will be removed when code-generation for these types is
5063 // sufficiently reliable.
5064 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5065 if (VTy->getElementCount() == ElementCount::getScalable(1))
5066 return InstructionCost::getInvalid();
5067
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005068 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
David Green649cf452021-08-05 23:23:24 +01005069
5070 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
David Green12025ce2023-07-04 15:02:30 +01005071 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
David Green649cf452021-08-05 23:23:24 +01005072
Sander de Smalendb134e22021-01-22 21:44:23 +00005073 InstructionCost LegalizationCost = 0;
Caroline Concatto172f1f82020-12-21 15:04:29 +00005074 if (LT.first > 1) {
5075 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
David Green12025ce2023-07-04 15:02:30 +01005076 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
David Green649cf452021-08-05 23:23:24 +01005077 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
Caroline Concatto172f1f82020-12-21 15:04:29 +00005078 }
5079
5080 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5081}
5082
Sander de Smalenbd868242021-01-22 21:33:51 +00005083InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005084 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005085 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
Sander de Smalen4f42d872021-04-14 16:53:01 +01005086 InstructionCost LegalizationCost = 0;
Caroline Concatto172f1f82020-12-21 15:04:29 +00005087 if (LT.first > 1) {
5088 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5089 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5090 LegalizationCost *= LT.first - 1;
5091 }
5092
5093 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5094 assert(ISD && "Invalid opcode");
5095 // Add the final reduction cost for the legal horizontal reduction
5096 switch (ISD) {
5097 case ISD::ADD:
5098 case ISD::AND:
5099 case ISD::OR:
5100 case ISD::XOR:
5101 case ISD::FADD:
5102 return LegalizationCost + 2;
5103 default:
Sander de Smalenbd868242021-01-22 21:33:51 +00005104 return InstructionCost::getInvalid();
Caroline Concatto172f1f82020-12-21 15:04:29 +00005105 }
5106}
5107
Sander de Smalenbd868242021-01-22 21:33:51 +00005108InstructionCost
5109AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
Krzysztof Parzyszek86fe4df2022-12-02 09:35:05 -08005110 std::optional<FastMathFlags> FMF,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005111 TTI::TargetCostKind CostKind) const {
David Green0b745a12024-08-09 14:25:07 +01005112 // The code-generator is currently not able to handle scalable vectors
5113 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5114 // it. This change will be removed when code-generation for these types is
5115 // sufficiently reliable.
5116 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5117 if (VTy->getElementCount() == ElementCount::getScalable(1))
5118 return InstructionCost::getInvalid();
5119
David Sherwood0aff1792021-07-07 13:18:20 +01005120 if (TTI::requiresOrderedReduction(FMF)) {
David Sherwood219d4512021-08-18 09:40:21 +01005121 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5122 InstructionCost BaseCost =
5123 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5124 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5125 // end up vectorizing for more computationally intensive loops.
5126 return BaseCost + FixedVTy->getNumElements();
5127 }
David Sherwood0aff1792021-07-07 13:18:20 +01005128
5129 if (Opcode != Instruction::FAdd)
5130 return InstructionCost::getInvalid();
5131
5132 auto *VTy = cast<ScalableVectorType>(ValTy);
5133 InstructionCost Cost =
5134 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5135 Cost *= getMaxNumElements(VTy->getElementCount());
5136 return Cost;
5137 }
5138
Caroline Concatto172f1f82020-12-21 15:04:29 +00005139 if (isa<ScalableVectorType>(ValTy))
David Green38c9a402021-07-09 11:51:16 +01005140 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005141
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005142 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005143 MVT MTy = LT.second;
5144 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5145 assert(ISD && "Invalid opcode");
5146
5147 // Horizontal adds can use the 'addv' instruction. We model the cost of these
David Greenc9cebda2021-07-22 18:19:54 +01005148 // instructions as twice a normal vector add, plus 1 for each legalization
5149 // step (LT.first). This is the only arithmetic vector reduction operation for
5150 // which we have an instruction.
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005151 // OR, XOR and AND costs should match the codegen from:
5152 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5153 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5154 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005155 static const CostTblEntry CostTblNoPairwise[]{
David Greenc9cebda2021-07-22 18:19:54 +01005156 {ISD::ADD, MVT::v8i8, 2},
5157 {ISD::ADD, MVT::v16i8, 2},
5158 {ISD::ADD, MVT::v4i16, 2},
5159 {ISD::ADD, MVT::v8i16, 2},
5160 {ISD::ADD, MVT::v4i32, 2},
Vasileios Porpodasf6690302022-07-28 17:01:15 -07005161 {ISD::ADD, MVT::v2i64, 2},
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005162 {ISD::OR, MVT::v8i8, 15},
5163 {ISD::OR, MVT::v16i8, 17},
5164 {ISD::OR, MVT::v4i16, 7},
5165 {ISD::OR, MVT::v8i16, 9},
5166 {ISD::OR, MVT::v2i32, 3},
5167 {ISD::OR, MVT::v4i32, 5},
5168 {ISD::OR, MVT::v2i64, 3},
5169 {ISD::XOR, MVT::v8i8, 15},
5170 {ISD::XOR, MVT::v16i8, 17},
5171 {ISD::XOR, MVT::v4i16, 7},
5172 {ISD::XOR, MVT::v8i16, 9},
5173 {ISD::XOR, MVT::v2i32, 3},
5174 {ISD::XOR, MVT::v4i32, 5},
5175 {ISD::XOR, MVT::v2i64, 3},
5176 {ISD::AND, MVT::v8i8, 15},
5177 {ISD::AND, MVT::v16i8, 17},
5178 {ISD::AND, MVT::v4i16, 7},
5179 {ISD::AND, MVT::v8i16, 9},
5180 {ISD::AND, MVT::v2i32, 3},
5181 {ISD::AND, MVT::v4i32, 5},
5182 {ISD::AND, MVT::v2i64, 3},
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005183 };
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005184 switch (ISD) {
5185 default:
5186 break;
Sushant Gokhalec5672e22024-09-24 14:35:01 +05305187 case ISD::FADD:
5188 if (Type *EltTy = ValTy->getScalarType();
5189 // FIXME: For half types without fullfp16 support, this could extend and
5190 // use a fp32 faddp reduction but current codegen unrolls.
5191 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5192 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5193 const unsigned NElts = MTy.getVectorNumElements();
5194 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5195 isPowerOf2_32(NElts))
5196 // Reduction corresponding to series of fadd instructions is lowered to
5197 // series of faddp instructions. faddp has latency/throughput that
5198 // matches fadd instruction and hence, every faddp instruction can be
5199 // considered to have a relative cost = 1 with
5200 // CostKind = TCK_RecipThroughput.
5201 // An faddp will pairwise add vector elements, so the size of input
5202 // vector reduces by half every time, requiring
5203 // #(faddp instructions) = log2_32(NElts).
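// Illustrative: a <4 x float> fadd reduction costs 2 faddp instructions and
// an <8 x half> reduction (with FullFP16) costs 3.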
5204 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5205 }
5206 break;
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005207 case ISD::ADD:
5208 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
David Greenc9cebda2021-07-22 18:19:54 +01005209 return (LT.first - 1) + Entry->Cost;
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005210 break;
5211 case ISD::XOR:
5212 case ISD::AND:
5213 case ISD::OR:
5214 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5215 if (!Entry)
5216 break;
5217 auto *ValVTy = cast<FixedVectorType>(ValTy);
David Greene79fac22023-06-01 09:28:48 +01005218 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005219 isPowerOf2_32(ValVTy->getNumElements())) {
5220 InstructionCost ExtraCost = 0;
5221 if (LT.first != 1) {
5222 // Type needs to be split, so there is an extra cost of LT.first - 1
5223 // arithmetic ops.
5224 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5225 MTy.getVectorNumElements());
5226 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5227 ExtraCost *= LT.first - 1;
5228 }
David Greene79fac22023-06-01 09:28:48 +01005229 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5230 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5231 return Cost + ExtraCost;
Rosie Sumpter0c4651f2021-06-15 10:29:27 +01005232 }
5233 break;
5234 }
David Sherwood0aff1792021-07-07 13:18:20 +01005235 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
Matthew Simpsoneacfefd2018-03-16 11:34:15 +00005236}
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005237
David Greene5f40192025-02-15 20:33:03 +00005238InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5239 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
Sergei Barannikov0014b492025-04-22 06:27:29 +03005240 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
David Greene5f40192025-02-15 20:33:03 +00005241 EVT VecVT = TLI->getValueType(DL, VecTy);
5242 EVT ResVT = TLI->getValueType(DL, ResTy);
5243
5244 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5245 VecVT.getSizeInBits() >= 64) {
5246 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5247
5248 // The legal cases are:
5249 // UADDLV 8/16/32->32
5250 // UADDLP 32->64
5251 unsigned RevVTSize = ResVT.getSizeInBits();
5252 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5253 RevVTSize <= 32) ||
5254 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5255 RevVTSize <= 32) ||
5256 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5257 RevVTSize <= 64))
5258 return (LT.first - 1) * 2 + 2;
5259 }
5260
5261 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5262 CostKind);
5263}
5264
5265InstructionCost
5266AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
5267 VectorType *VecTy,
Sergei Barannikov0014b492025-04-22 06:27:29 +03005268 TTI::TargetCostKind CostKind) const {
David Greene5f40192025-02-15 20:33:03 +00005269 EVT VecVT = TLI->getValueType(DL, VecTy);
5270 EVT ResVT = TLI->getValueType(DL, ResTy);
5271
5272 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
5273 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5274
5275 // The legal cases with dotprod are
5276 // UDOT 8->32
5277 // Which requires an additional uaddv to sum the i32 values.
5278 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5279 ResVT == MVT::i32)
5280 return LT.first + 2;
5281 }
5282
5283 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
5284}
5285
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005286InstructionCost
5287AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5288 TTI::TargetCostKind CostKind) const {
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005289 static const CostTblEntry ShuffleTbl[] = {
5290 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5291 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5292 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5293 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5294 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5295 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5296 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5297 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5298 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5299 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5300 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5301 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5302 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5303 };
5304
Paul Walker3bb22872022-08-26 14:32:46 +01005305 // The code-generator is currently not able to handle scalable vectors
5306 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5307 // it. This change will be removed when code-generation for these types is
5308 // sufficiently reliable.
5309 if (Tp->getElementCount() == ElementCount::getScalable(1))
5310 return InstructionCost::getInvalid();
5311
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005312 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005313 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005314 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5315 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5316 : LT.second;
5317 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5318 InstructionCost LegalizationCost = 0;
5319 if (Index < 0) {
5320 LegalizationCost =
5321 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5322 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5323 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5324 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5325 }
5326
5327 // Predicated splices are promoted during lowering. See AArch64ISelLowering.cpp.
5328 // Cost performed on a promoted type.
5329 if (LT.second.getScalarType() == MVT::i1) {
5330 LegalizationCost +=
5331 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5332 TTI::CastContextHint::None, CostKind) +
5333 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5334 TTI::CastContextHint::None, CostKind);
5335 }
5336 const auto *Entry =
5337 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5338 assert(Entry && "Illegal Type for Splice");
5339 LegalizationCost += Entry->Cost;
5340 return LegalizationCost * LT.first;
5341}
5342
David Sherwooda733c1f2025-01-20 14:07:03 +00005343InstructionCost AArch64TTIImpl::getPartialReductionCost(
5344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5345 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5346 TTI::PartialReductionExtendKind OpBExtend,
5347 std::optional<unsigned> BinOp) const {
5348 InstructionCost Invalid = InstructionCost::getInvalid();
5349 InstructionCost Cost(TTI::TCC_Basic);
5350
Nicholas Guy9c89faa62025-02-13 10:35:45 +00005351 // Sub opcodes currently only occur in chained cases.
5352 // Independent partial reduction subtractions are still costed as an add.
5353 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
David Sherwooda733c1f2025-01-20 14:07:03 +00005354 return Invalid;
5355
5356 if (InputTypeA != InputTypeB)
5357 return Invalid;
5358
5359 EVT InputEVT = EVT::getEVT(InputTypeA);
5360 EVT AccumEVT = EVT::getEVT(AccumType);
5361
Sam Tebbsc7995a62025-02-05 13:34:43 +00005362 unsigned VFMinValue = VF.getKnownMinValue();
5363
5364 if (VF.isScalable()) {
5365 if (!ST->isSVEorStreamingSVEAvailable())
5366 return Invalid;
5367
5368 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
5369 // since we can't lower that type.
5370 unsigned Scale =
5371 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
5372 if (VFMinValue == Scale)
5373 return Invalid;
5374 }
David Sherwoodefc72342025-02-11 15:10:39 +00005375 if (VF.isFixed() &&
5376 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
David Sherwooda733c1f2025-01-20 14:07:03 +00005377 return Invalid;
5378
5379 if (InputEVT == MVT::i8) {
Sam Tebbsc7995a62025-02-05 13:34:43 +00005380 switch (VFMinValue) {
David Sherwooda733c1f2025-01-20 14:07:03 +00005381 default:
5382 return Invalid;
5383 case 8:
5384 if (AccumEVT == MVT::i32)
5385 Cost *= 2;
5386 else if (AccumEVT != MVT::i64)
5387 return Invalid;
5388 break;
5389 case 16:
5390 if (AccumEVT == MVT::i64)
5391 Cost *= 2;
5392 else if (AccumEVT != MVT::i32)
5393 return Invalid;
5394 break;
5395 }
5396 } else if (InputEVT == MVT::i16) {
5397 // FIXME: Allow i32 accumulator but increase cost, as we would extend
5398 // it to i64.
Sam Tebbsc7995a62025-02-05 13:34:43 +00005399 if (VFMinValue != 8 || AccumEVT != MVT::i64)
David Sherwooda733c1f2025-01-20 14:07:03 +00005400 return Invalid;
5401 } else
5402 return Invalid;
5403
Sam Tebbs2876dbc2025-05-01 16:06:37 +01005404 // AArch64 supports lowering mixed fixed-width extensions to a usdot but only
5405 // if the i8mm feature is available.
David Sherwooda733c1f2025-01-20 14:07:03 +00005406 if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None ||
Sam Tebbs2876dbc2025-05-01 16:06:37 +01005407 (OpAExtend != OpBExtend && !ST->hasMatMulInt8()))
David Sherwooda733c1f2025-01-20 14:07:03 +00005408 return Invalid;
5409
5410 if (!BinOp || *BinOp != Instruction::Mul)
5411 return Invalid;
5412
5413 return Cost;
5414}
5415
David Green4ac27212024-04-09 16:36:08 +01005416InstructionCost AArch64TTIImpl::getShuffleCost(
5417 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
5418 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
Sergei Barannikove0c1e232025-04-21 21:42:40 +03005419 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005420 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
David Green4ac27212024-04-09 16:36:08 +01005421
David Green8e2a0e62022-04-27 13:51:50 +01005422 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5423 // into smaller vectors and sum the cost of each shuffle.
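// Illustrative: an <8 x i64> shuffle is legalized into four v2i64 pieces;
// each sub-mask that draws from at most two legalized source vectors is
// re-costed via getShuffleCost, and sub-masks needing more sources are
// costed pessimistically at one element move per result lane.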
David Green46cef9a2022-04-27 15:36:15 +01005424 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
David Green8e2a0e62022-04-27 13:51:50 +01005425 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
Alexey Bataev263a00f2023-10-02 06:44:01 -07005426 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
David Green4ac27212024-04-09 16:36:08 +01005427
David Green18bb1752024-04-21 13:53:22 +01005428 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5429 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5430 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5431 // cost than just the load.
5432 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5433 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
5434 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
5435 return std::max<InstructionCost>(1, LT.first / 4);
5436
David Green4ac27212024-04-09 16:36:08 +01005437 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5438 // store(interleaving-shuffle). The shuffle cost could potentially be free,
David Green18bb1752024-04-21 13:53:22 +01005439 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
David Green4ac27212024-04-09 16:36:08 +01005440 // cost than just the store.
5441 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5442 (ShuffleVectorInst::isInterleaveMask(
5443 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
5444 ShuffleVectorInst::isInterleaveMask(
5445 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
5446 return LT.first;
5447
Alexey Bataev263a00f2023-10-02 06:44:01 -07005448 unsigned TpNumElts = Mask.size();
David Green8e2a0e62022-04-27 13:51:50 +01005449 unsigned LTNumElts = LT.second.getVectorNumElements();
5450 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5451 VectorType *NTp =
5452 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
5453 InstructionCost Cost;
5454 for (unsigned N = 0; N < NumVecs; N++) {
5455 SmallVector<int> NMask;
5456 // Split the existing mask into chunks of size LTNumElts. Track the source
5457 // sub-vectors to ensure the result has at most 2 inputs.
5458 unsigned Source1, Source2;
5459 unsigned NumSources = 0;
5460 for (unsigned E = 0; E < LTNumElts; E++) {
5461 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
ManuelJBritod22edb92023-04-27 16:22:57 +01005462 : PoisonMaskElem;
David Green8e2a0e62022-04-27 13:51:50 +01005463 if (MaskElt < 0) {
ManuelJBritod22edb92023-04-27 16:22:57 +01005464 NMask.push_back(PoisonMaskElem);
David Green8e2a0e62022-04-27 13:51:50 +01005465 continue;
5466 }
5467
5468 // Calculate which source from the input this comes from and whether it
5469 // is new to us.
5470 unsigned Source = MaskElt / LTNumElts;
5471 if (NumSources == 0) {
5472 Source1 = Source;
5473 NumSources = 1;
5474 } else if (NumSources == 1 && Source != Source1) {
5475 Source2 = Source;
5476 NumSources = 2;
5477 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5478 NumSources++;
5479 }
5480
5481 // Add to the new mask. For the NumSources>2 case these are not correct,
5482 // but are only used for the modular lane number.
5483 if (Source == Source1)
5484 NMask.push_back(MaskElt % LTNumElts);
5485 else if (Source == Source2)
5486 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5487 else
5488 NMask.push_back(MaskElt % LTNumElts);
5489 }
5490 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
David Spickettd1f3a922024-07-29 11:24:39 +00005491 // getShuffleCost. If not then cost it using the worst case as the number
5492 // of element moves into a new vector.
David Green8e2a0e62022-04-27 13:51:50 +01005493 if (NumSources <= 2)
5494 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5495 : TTI::SK_PermuteTwoSrc,
David Green4ac27212024-04-09 16:36:08 +01005496 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
David Green8e2a0e62022-04-27 13:51:50 +01005497 else
5498 Cost += LTNumElts;
5499 }
5500 return Cost;
5501 }
5502
Alexey Bataev9a207572023-08-08 09:57:50 -07005503 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
Alexey Bataev7bc079c2024-02-12 07:09:49 -05005504 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
David Greenc05fc9b2025-01-09 12:10:43 +00005505 // A subvector extract can be implemented with an ext (or trivial extract, if
David Greena8dab1a2025-01-08 08:13:07 +00005506 // from lane 0). This currently only handles low or high extracts to prevent
5507 // SLP vectorizer regressions.
5508 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5509 if (LT.second.is128BitVector() &&
5510 cast<FixedVectorType>(SubTp)->getNumElements() ==
5511 LT.second.getVectorNumElements() / 2) {
5512 if (Index == 0)
5513 return 0;
David Green32bc0292025-01-08 08:59:15 +00005514 if (Index == (int)LT.second.getVectorNumElements() / 2)
David Greena8dab1a2025-01-08 08:13:07 +00005515 return 1;
5516 }
Alexey Bataev7bc079c2024-02-12 07:09:49 -05005517 Kind = TTI::SK_PermuteSingleSrc;
David Greena8dab1a2025-01-08 08:13:07 +00005518 }
David Greend6327052022-04-27 12:09:01 +01005519
Sjoerd Meijer775451b2023-03-13 13:05:34 +00005520 // Check for broadcast loads, which are supported by the LD1R instruction.
5521 // In terms of code-size, the shuffle vector is free when a load + dup get
5522 // folded into a LD1R. That's what we check and return here. For performance
5523 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5524 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5525 // that we model the load + dup sequence slightly higher because LD1R is a
5526 // high latency instruction.
5527 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
David Greend6327052022-04-27 12:09:01 +01005528 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
5529 if (IsLoad && LT.second.isVector() &&
5530 isLegalBroadcastLoad(Tp->getElementType(),
David Green8e2a0e62022-04-27 13:51:50 +01005531 LT.second.getVectorElementCount()))
Sjoerd Meijer775451b2023-03-13 13:05:34 +00005532 return 0;
David Greend6327052022-04-27 12:09:01 +01005533 }
5534
5535 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5536 // from the perfect shuffle tables.
5537 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
5538 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
5539 all_of(Mask, [](int E) { return E < 8; }))
5540 return getPerfectShuffleCost(Mask);
5541
David Greenf0e79d92024-04-09 17:16:14 +01005542 // Check for identity masks, which we can treat as free.
5543 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5544 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5545 all_of(enumerate(Mask), [](const auto &M) {
5546 return M.value() < 0 || M.value() == (int)M.index();
5547 }))
5548 return 0;
5549
David Greena5367432024-04-11 08:45:28 +01005550 // Check for other shuffles that are not SK_ kinds but we have native
5551 // instructions for, for example ZIP and UZP.
5552 unsigned Unused;
5553 if (LT.second.isFixedLengthVector() &&
5554 LT.second.getVectorNumElements() == Mask.size() &&
5555 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
David Green363ec6f2024-05-06 18:37:04 +01005556 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5557 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
David Green48397fe2025-02-25 10:32:45 +00005558 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5559 LT.second.getVectorNumElements(), 16) ||
5560 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5561 LT.second.getVectorNumElements(), 32) ||
5562 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5563 LT.second.getVectorNumElements(), 64) ||
David Green6c2cc822024-04-14 12:09:14 +01005564 // Check for non-zero lane splats
5565 all_of(drop_begin(Mask),
5566 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
David Greena5367432024-04-11 08:45:28 +01005567 return 1;
5568
Simon Pilgrim071e8222018-10-25 10:52:36 +00005569 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
Caroline Concattob52e6c52021-01-27 15:59:27 +00005570 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
David Green0cf9e472022-08-22 12:44:57 +01005571 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
Simon Pilgrim9c8f9372018-06-22 09:45:31 +00005572 static const CostTblEntry ShuffleTbl[] = {
liqinweng723245b2022-09-08 18:33:29 +08005573 // Broadcast shuffle kinds can be performed with 'dup'.
5574 {TTI::SK_Broadcast, MVT::v8i8, 1},
5575 {TTI::SK_Broadcast, MVT::v16i8, 1},
5576 {TTI::SK_Broadcast, MVT::v4i16, 1},
5577 {TTI::SK_Broadcast, MVT::v8i16, 1},
5578 {TTI::SK_Broadcast, MVT::v2i32, 1},
5579 {TTI::SK_Broadcast, MVT::v4i32, 1},
5580 {TTI::SK_Broadcast, MVT::v2i64, 1},
David Green180865a2023-03-14 21:25:18 +00005581 {TTI::SK_Broadcast, MVT::v4f16, 1},
5582 {TTI::SK_Broadcast, MVT::v8f16, 1},
liqinweng723245b2022-09-08 18:33:29 +08005583 {TTI::SK_Broadcast, MVT::v2f32, 1},
5584 {TTI::SK_Broadcast, MVT::v4f32, 1},
5585 {TTI::SK_Broadcast, MVT::v2f64, 1},
5586 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5587 // 'zip1/zip2' instructions.
5588 {TTI::SK_Transpose, MVT::v8i8, 1},
5589 {TTI::SK_Transpose, MVT::v16i8, 1},
5590 {TTI::SK_Transpose, MVT::v4i16, 1},
5591 {TTI::SK_Transpose, MVT::v8i16, 1},
5592 {TTI::SK_Transpose, MVT::v2i32, 1},
5593 {TTI::SK_Transpose, MVT::v4i32, 1},
5594 {TTI::SK_Transpose, MVT::v2i64, 1},
David Green180865a2023-03-14 21:25:18 +00005595 {TTI::SK_Transpose, MVT::v4f16, 1},
5596 {TTI::SK_Transpose, MVT::v8f16, 1},
liqinweng723245b2022-09-08 18:33:29 +08005597 {TTI::SK_Transpose, MVT::v2f32, 1},
5598 {TTI::SK_Transpose, MVT::v4f32, 1},
5599 {TTI::SK_Transpose, MVT::v2f64, 1},
5600 // Select shuffle kinds.
5601 // TODO: handle vXi8/vXi16.
5602 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
5603 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
5604 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
5605 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
5606 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
5607 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
5608 // PermuteSingleSrc shuffle kinds.
5609 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
5610 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
5611 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
5612 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
5613 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
5614 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
5615 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
5616 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
David Green3875c382022-09-08 19:54:12 +01005617 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // perfectshuffle worst case
5618 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
5619 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
liqinweng723245b2022-09-08 18:33:29 +08005620 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
5621 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
5622 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
5623 // Reverse can be lowered with `rev`.
5624 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
5625 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
5626 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
5627 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
5628 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
5629 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
5630 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
David Green5c6453d2025-02-24 08:37:15 +00005631 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
liqinweng723245b2022-09-08 18:33:29 +08005632 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
5633 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
5634 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
David Green5c6453d2025-02-24 08:37:15 +00005635 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
liqinweng723245b2022-09-08 18:33:29 +08005636 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
5637 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
5638 // Splice can all be lowered as `ext`.
5639 {TTI::SK_Splice, MVT::v2i32, 1},
5640 {TTI::SK_Splice, MVT::v4i32, 1},
5641 {TTI::SK_Splice, MVT::v2i64, 1},
5642 {TTI::SK_Splice, MVT::v2f32, 1},
5643 {TTI::SK_Splice, MVT::v4f32, 1},
5644 {TTI::SK_Splice, MVT::v2f64, 1},
5645 {TTI::SK_Splice, MVT::v8f16, 1},
5646 {TTI::SK_Splice, MVT::v8bf16, 1},
5647 {TTI::SK_Splice, MVT::v8i16, 1},
5648 {TTI::SK_Splice, MVT::v16i8, 1},
liqinweng723245b2022-09-08 18:33:29 +08005649 {TTI::SK_Splice, MVT::v4f16, 1},
David Green5c6453d2025-02-24 08:37:15 +00005650 {TTI::SK_Splice, MVT::v4bf16, 1},
liqinweng723245b2022-09-08 18:33:29 +08005651 {TTI::SK_Splice, MVT::v4i16, 1},
5652 {TTI::SK_Splice, MVT::v8i8, 1},
5653 // Broadcast shuffle kinds for scalable vectors
5654 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
5655 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
5656 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
5657 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
5658 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
5659 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
5660 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
5661 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
5662 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
5663 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
5664 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
5665 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
5666 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
5667 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
5668 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
5669 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
5670 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
5671 // Handle the cases for vector.reverse with scalable vectors
5672 {TTI::SK_Reverse, MVT::nxv16i8, 1},
5673 {TTI::SK_Reverse, MVT::nxv8i16, 1},
5674 {TTI::SK_Reverse, MVT::nxv4i32, 1},
5675 {TTI::SK_Reverse, MVT::nxv2i64, 1},
5676 {TTI::SK_Reverse, MVT::nxv2f16, 1},
5677 {TTI::SK_Reverse, MVT::nxv4f16, 1},
5678 {TTI::SK_Reverse, MVT::nxv8f16, 1},
5679 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
5680 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
5681 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
5682 {TTI::SK_Reverse, MVT::nxv2f32, 1},
5683 {TTI::SK_Reverse, MVT::nxv4f32, 1},
5684 {TTI::SK_Reverse, MVT::nxv2f64, 1},
5685 {TTI::SK_Reverse, MVT::nxv16i1, 1},
5686 {TTI::SK_Reverse, MVT::nxv8i1, 1},
5687 {TTI::SK_Reverse, MVT::nxv4i1, 1},
5688 {TTI::SK_Reverse, MVT::nxv2i1, 1},
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005689 };
Simon Pilgrim9c8f9372018-06-22 09:45:31 +00005690 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005691 return LT.first * Entry->Cost;
5692 }
David Greenfa784f62022-04-07 19:27:41 +01005693
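  // Scalable-vector splices are costed separately via getSpliceCost.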
Caroline Concattoa2c5c562021-06-18 15:39:03 +01005694 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
David Green2ba455f2025-04-21 06:31:03 +01005695 return getSpliceCost(Tp, Index, CostKind);
David Greenfa784f62022-04-07 19:27:41 +01005696
5697 // Inserting a subvector can often be done with either a D, S or H register
5698 // move, so long as the inserted vector is "aligned".
5699 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5700 LT.second.getSizeInBits() <= 128 && SubTp) {
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005701 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
David Greenfa784f62022-04-07 19:27:41 +01005702 if (SubLT.second.isVector()) {
5703 int NumElts = LT.second.getVectorNumElements();
5704 int NumSubElts = SubLT.second.getVectorNumElements();
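      // The insert is "aligned" when it targets a multiple of the subvector
      // size and the subvector evenly divides the legalized vector.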
5705 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5706 return SubLT.first;
5707 }
5708 }
5709
Alexey Bataev7bc079c2024-02-12 07:09:49 -05005710 // Restore optimal kind.
5711 if (IsExtractSubvector)
5712 Kind = TTI::SK_ExtractSubvector;
David Green4ac27212024-04-09 16:36:08 +01005713 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
5714 CxtI);
Matthew Simpsonb4096eb2018-04-26 13:48:33 +00005715}
David Sherwoodf15b6b22022-07-12 12:03:39 +01005716
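/// Returns true if any load or store in \p TheLoop accesses memory through a
/// pointer whose stride is negative, i.e. the addresses decrease from one
/// iteration to the next.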
David Sherwood636efd22023-03-14 18:15:03 +00005717static bool containsDecreasingPointers(Loop *TheLoop,
5718 PredicatedScalarEvolution *PSE) {
Philip Reamese41dce42023-05-11 09:47:37 -07005719 const auto &Strides = DenseMap<Value *, const SCEV *>();
David Sherwood636efd22023-03-14 18:15:03 +00005720 for (BasicBlock *BB : TheLoop->blocks()) {
5721 // Scan the instructions in the block and look for addresses that are
5722 // consecutive and decreasing.
5723 for (Instruction &I : *BB) {
5724 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
5725 Value *Ptr = getLoadStorePointerOperand(&I);
5726 Type *AccessTy = getLoadStoreType(&I);
5727 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
5728 /*ShouldCheckWrap=*/false)
5729 .value_or(0) < 0)
5730 return true;
5731 }
5732 }
5733 }
5734 return false;
5735}
5736
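// An explicit -sve-prefer-fixed-over-scalable-if-equal setting overrides the
// subtarget's tuning.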
David Greenb35d3452024-12-31 11:07:42 +00005737bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
5738 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
5739 return SVEPreferFixedOverScalableIfEqualCost;
5740 return ST->useFixedOverScalableIfEqualCost();
5741}
5742
Sjoerd Meijer9bccf612024-11-20 09:33:39 +00005743unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
5744 return ST->getEpilogueVectorizationMinVF();
5745}
5746
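/// Returns true if tail-folding (predicated vectorisation) is preferred over
/// falling back to an unpredicated epilogue loop. This requires SVE, no
/// interleave groups, tail-folding options that cover any reductions,
/// recurrences and reversed accesses in the loop, and a loop body large
/// enough for tail-folding to beat interleaving.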
Sergei Barannikov0014b492025-04-22 06:27:29 +03005747bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
David Sherwood7beb2ca2023-04-20 12:34:55 +00005748 if (!ST->hasSVE())
David Sherwoodf15b6b22022-07-12 12:03:39 +01005749 return false;
5750
David Sherwood4ef9cb62022-07-18 10:36:11 +01005751 // We don't currently support vectorisation with interleaving for SVE - with
5752 // such loops we're better off not using tail-folding. This gives us a chance
5753 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
David Sherwoodb4089cf2023-04-04 13:58:58 +00005754 if (TFI->IAI->hasGroups())
David Sherwood4ef9cb62022-07-18 10:36:11 +01005755 return false;
5756
David Sherwood7beb2ca2023-04-20 12:34:55 +00005757 TailFoldingOpts Required = TailFoldingOpts::Disabled;
David Sherwoodb4089cf2023-04-04 13:58:58 +00005758 if (TFI->LVL->getReductionVars().size())
David Sherwood7beb2ca2023-04-20 12:34:55 +00005759 Required |= TailFoldingOpts::Reductions;
David Sherwoodb4089cf2023-04-04 13:58:58 +00005760 if (TFI->LVL->getFixedOrderRecurrences().size())
David Sherwood7beb2ca2023-04-20 12:34:55 +00005761 Required |= TailFoldingOpts::Recurrences;
David Sherwood636efd22023-03-14 18:15:03 +00005762
5763 // We call this to discover whether any load/store pointers in the loop have
5764 // negative strides. This will require extra work to reverse the loop
5765 // predicate, which may be expensive.
David Sherwoodb4089cf2023-04-04 13:58:58 +00005766 if (containsDecreasingPointers(TFI->LVL->getLoop(),
5767 TFI->LVL->getPredicatedScalarEvolution()))
David Sherwood7beb2ca2023-04-20 12:34:55 +00005768 Required |= TailFoldingOpts::Reverse;
5769 if (Required == TailFoldingOpts::Disabled)
5770 Required |= TailFoldingOpts::Simple;
David Sherwoodf15b6b22022-07-12 12:03:39 +01005771
David Sherwoodc7dbe322023-04-25 08:46:41 +00005772 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
5773 Required))
5774 return false;
5775
5776 // Don't tail-fold for tight loops where we would be better off interleaving
5777 // with an unpredicated loop.
5778 unsigned NumInsns = 0;
5779 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5780 NumInsns += BB->sizeWithoutDebug();
5781 }
5782
5783 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
5784 return NumInsns >= SVETailFoldInsnThreshold;
David Sherwoodf15b6b22022-07-12 12:03:39 +01005785}
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005786
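/// Return the extra cost of folding \p Scale and \p BaseOffset into the
/// addressing mode for an access of type \p Ty, or an invalid cost if the
/// resulting addressing mode is not legal.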
5787InstructionCost
5788AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
Graham Hunter2e8d8152024-05-10 11:22:11 +01005789 StackOffset BaseOffset, bool HasBaseReg,
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005790 int64_t Scale, unsigned AddrSpace) const {
5791 // Scaling factors are not free at all.
5792 // Operands                     | Rt Latency
5793 // -------------------------------------------
5794 // Rt, [Xn, Xm]                 | 4
5795 // -------------------------------------------
5796 // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
5797 // Rt, [Xn, Wm, <extend> #imm]  |
5798 TargetLoweringBase::AddrMode AM;
5799 AM.BaseGV = BaseGV;
Graham Hunter2e8d8152024-05-10 11:22:11 +01005800 AM.BaseOffs = BaseOffset.getFixed();
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005801 AM.HasBaseReg = HasBaseReg;
5802 AM.Scale = Scale;
Graham Hunter2e8d8152024-05-10 11:22:11 +01005803 AM.ScalableOffset = BaseOffset.getScalable();
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005804 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
5805 // Scale represents reg2 * scale, thus account for 1 if
5806 // it is not equal to 0 or 1.
5807 return AM.Scale != 0 && AM.Scale != 1;
Craig Topper39c454a2025-03-05 09:10:45 -08005808 return InstructionCost::getInvalid();
Daniil Fukalov7ed3d812022-08-18 00:38:34 +03005809}
David Greena2d68b42024-01-22 23:46:58 +00005810
Sergei Barannikov0014b492025-04-22 06:27:29 +03005811bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
5812 const Instruction *I) const {
Florian Hahn9a0f2512024-11-30 21:05:41 +00005813 if (EnableOrLikeSelectOpt) {
5814 // For the binary operators (e.g. or) we need to be more careful than
5815 // selects; here we only transform them if they are already at a natural
5816 // break point in the code - the end of a block with an unconditional
5817 // terminator.
5818 if (I->getOpcode() == Instruction::Or &&
5819 isa<BranchInst>(I->getNextNode()) &&
5820 cast<BranchInst>(I->getNextNode())->isUnconditional())
5821 return true;
5822
5823 if (I->getOpcode() == Instruction::Add ||
5824 I->getOpcode() == Instruction::Sub)
5825 return true;
5826 }
David Greena2d68b42024-01-22 23:46:58 +00005827 return BaseT::shouldTreatInstructionLikeSelect(I);
Sander de Smalen3abf55a2024-01-31 11:38:29 +00005828}
Graham Huntere16f2f52024-06-06 14:45:36 +01005829
Sergei Barannikov0014b492025-04-22 06:27:29 +03005830bool AArch64TTIImpl::isLSRCostLess(
5831 const TargetTransformInfo::LSRCost &C1,
5832 const TargetTransformInfo::LSRCost &C2) const {
Graham Huntere16f2f52024-06-06 14:45:36 +01005833 // What is AArch64-specific here is adding the number of instructions to the
5834 // comparison (though not as the first consideration, as some targets do)
5835 // along with changing the priority of the base additions.
5836 // TODO: Maybe a more nuanced tradeoff between instruction count
5837 // and number of registers? To be investigated at a later date.
5838 if (EnableLSRCostOpt)
5839 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
5840 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5841 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
5842 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5843
5844 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
Sander de Smalen738533c2024-06-24 11:06:16 +01005845}
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07005846
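/// Returns true if \p V is a shufflevector whose mask elements are all equal,
/// i.e. a splat of a single input lane.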
5847static bool isSplatShuffle(Value *V) {
5848 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5849 return all_equal(Shuf->getShuffleMask());
5850 return false;
5851}
5852
5853/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5854/// or upper half of the vector elements.
5855static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5856 bool AllowSplat = false) {
5857 // Scalable types can't be extract shuffle vectors.
5858 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5859 return false;
5860
5861 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5862 auto *FullTy = FullV->getType();
5863 auto *HalfTy = HalfV->getType();
5864 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5865 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5866 };
5867
5868 auto extractHalf = [](Value *FullV, Value *HalfV) {
5869 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5870 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5871 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5872 };
5873
5874 ArrayRef<int> M1, M2;
5875 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5876 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5877 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5878 return false;
5879
5880 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5881 // it is not checked as an extract below.
5882 if (AllowSplat && isSplatShuffle(Op1))
5883 S1Op1 = nullptr;
5884 if (AllowSplat && isSplatShuffle(Op2))
5885 S2Op1 = nullptr;
5886
5887 // Check that the operands are half as wide as the result and we extract
5888 // half of the elements of the input vectors.
5889 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5890 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5891 return false;
5892
5893 // Check that the mask extracts either the lower or upper half of the vector
5894 // elements.
5895 int M1Start = 0;
5896 int M2Start = 0;
5897 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5898 if ((S1Op1 &&
5899 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5900 (S2Op1 &&
5901 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5902 return false;
5903
5904 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5905 (M2Start != 0 && M2Start != (NumElements / 2)))
5906 return false;
5907 if (S1Op1 && S2Op1 && M1Start != M2Start)
5908 return false;
5909
5910 return true;
5911}
5912
5913/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5914/// of the vector elements.
5915static bool areExtractExts(Value *Ext1, Value *Ext2) {
5916 auto areExtDoubled = [](Instruction *Ext) {
5917 return Ext->getType()->getScalarSizeInBits() ==
5918 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5919 };
5920
5921 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5922 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5923 !areExtDoubled(cast<Instruction>(Ext1)) ||
5924 !areExtDoubled(cast<Instruction>(Ext2)))
5925 return false;
5926
5927 return true;
5928}
5929
5930/// Check if Op could be used with vmull_high_p64 intrinsic.
5931static bool isOperandOfVmullHighP64(Value *Op) {
5932 Value *VectorOperand = nullptr;
5933 ConstantInt *ElementIndex = nullptr;
5934 return match(Op, m_ExtractElt(m_Value(VectorOperand),
5935 m_ConstantInt(ElementIndex))) &&
5936 ElementIndex->getValue() == 1 &&
5937 isa<FixedVectorType>(VectorOperand->getType()) &&
5938 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5939}
5940
5941/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
5942static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5943 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
5944}
5945
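/// Returns true if the gather/scatter pointer \p Ptrs is a GEP of the form
/// scalar_base + vector_of_offsets (the shape CodeGenPrepare produces), and
/// queues in \p Ops any extends of the offsets that are worth sinking with it.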
5946static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
5947 // Restrict ourselves to the form CodeGenPrepare typically constructs.
5948 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5949 if (!GEP || GEP->getNumOperands() != 2)
5950 return false;
5951
5952 Value *Base = GEP->getOperand(0);
5953 Value *Offsets = GEP->getOperand(1);
5954
5955 // We only care about scalar_base+vector_offsets.
5956 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5957 return false;
5958
5959 // Sink extends that would allow us to use 32-bit offset vectors.
5960 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5961 auto *OffsetsInst = cast<Instruction>(Offsets);
5962 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5963 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5964 Ops.push_back(&GEP->getOperandUse(1));
5965 }
5966
5967 // Sink the GEP.
5968 return true;
5969}
5970
5971/// We want to sink the following cases:
5972/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5973/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
5974static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
5975 if (match(Op, m_VScale()))
5976 return true;
5977 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5978 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
5979 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5980 return true;
5981 }
5982 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5983 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
5984 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5985 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5986 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5987 return true;
5988 }
5989 return false;
5990}
5991
5992/// Check if sinking \p I's operands to I's basic block is profitable, because
5993/// the operands can be folded into a target instruction, e.g.
5994/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
5995bool AArch64TTIImpl::isProfitableToSinkOperands(
5996 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5997 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
5998 switch (II->getIntrinsicID()) {
5999 case Intrinsic::aarch64_neon_smull:
6000 case Intrinsic::aarch64_neon_umull:
6001 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6002 /*AllowSplat=*/true)) {
6003 Ops.push_back(&II->getOperandUse(0));
6004 Ops.push_back(&II->getOperandUse(1));
6005 return true;
6006 }
6007 [[fallthrough]];
6008
6009 case Intrinsic::fma:
6010 case Intrinsic::fmuladd:
6011 if (isa<VectorType>(I->getType()) &&
6012 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6013 !ST->hasFullFP16())
6014 return false;
6015 [[fallthrough]];
6016 case Intrinsic::aarch64_neon_sqdmull:
6017 case Intrinsic::aarch64_neon_sqdmulh:
6018 case Intrinsic::aarch64_neon_sqrdmulh:
6019 // Sink splats for index lane variants
6020 if (isSplatShuffle(II->getOperand(0)))
6021 Ops.push_back(&II->getOperandUse(0));
6022 if (isSplatShuffle(II->getOperand(1)))
6023 Ops.push_back(&II->getOperandUse(1));
6024 return !Ops.empty();
6025 case Intrinsic::aarch64_neon_fmlal:
6026 case Intrinsic::aarch64_neon_fmlal2:
6027 case Intrinsic::aarch64_neon_fmlsl:
6028 case Intrinsic::aarch64_neon_fmlsl2:
6029 // Sink splats for index lane variants
6030 if (isSplatShuffle(II->getOperand(1)))
6031 Ops.push_back(&II->getOperandUse(1));
6032 if (isSplatShuffle(II->getOperand(2)))
6033 Ops.push_back(&II->getOperandUse(2));
6034 return !Ops.empty();
6035 case Intrinsic::aarch64_sve_ptest_first:
6036 case Intrinsic::aarch64_sve_ptest_last:
6037 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6038 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6039 Ops.push_back(&II->getOperandUse(0));
6040 return !Ops.empty();
6041 case Intrinsic::aarch64_sme_write_horiz:
6042 case Intrinsic::aarch64_sme_write_vert:
6043 case Intrinsic::aarch64_sme_writeq_horiz:
6044 case Intrinsic::aarch64_sme_writeq_vert: {
6045 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6046 if (!Idx || Idx->getOpcode() != Instruction::Add)
6047 return false;
6048 Ops.push_back(&II->getOperandUse(1));
6049 return true;
6050 }
6051 case Intrinsic::aarch64_sme_read_horiz:
6052 case Intrinsic::aarch64_sme_read_vert:
6053 case Intrinsic::aarch64_sme_readq_horiz:
6054 case Intrinsic::aarch64_sme_readq_vert:
6055 case Intrinsic::aarch64_sme_ld1b_vert:
6056 case Intrinsic::aarch64_sme_ld1h_vert:
6057 case Intrinsic::aarch64_sme_ld1w_vert:
6058 case Intrinsic::aarch64_sme_ld1d_vert:
6059 case Intrinsic::aarch64_sme_ld1q_vert:
6060 case Intrinsic::aarch64_sme_st1b_vert:
6061 case Intrinsic::aarch64_sme_st1h_vert:
6062 case Intrinsic::aarch64_sme_st1w_vert:
6063 case Intrinsic::aarch64_sme_st1d_vert:
6064 case Intrinsic::aarch64_sme_st1q_vert:
6065 case Intrinsic::aarch64_sme_ld1b_horiz:
6066 case Intrinsic::aarch64_sme_ld1h_horiz:
6067 case Intrinsic::aarch64_sme_ld1w_horiz:
6068 case Intrinsic::aarch64_sme_ld1d_horiz:
6069 case Intrinsic::aarch64_sme_ld1q_horiz:
6070 case Intrinsic::aarch64_sme_st1b_horiz:
6071 case Intrinsic::aarch64_sme_st1h_horiz:
6072 case Intrinsic::aarch64_sme_st1w_horiz:
6073 case Intrinsic::aarch64_sme_st1d_horiz:
6074 case Intrinsic::aarch64_sme_st1q_horiz: {
6075 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6076 if (!Idx || Idx->getOpcode() != Instruction::Add)
6077 return false;
6078 Ops.push_back(&II->getOperandUse(3));
6079 return true;
6080 }
6081 case Intrinsic::aarch64_neon_pmull:
6082 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6083 return false;
6084 Ops.push_back(&II->getOperandUse(0));
6085 Ops.push_back(&II->getOperandUse(1));
6086 return true;
6087 case Intrinsic::aarch64_neon_pmull64:
6088 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6089 II->getArgOperand(1)))
6090 return false;
6091 Ops.push_back(&II->getArgOperandUse(0));
6092 Ops.push_back(&II->getArgOperandUse(1));
6093 return true;
6094 case Intrinsic::masked_gather:
6095 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6096 return false;
6097 Ops.push_back(&II->getArgOperandUse(0));
6098 return true;
6099 case Intrinsic::masked_scatter:
6100 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6101 return false;
6102 Ops.push_back(&II->getArgOperandUse(1));
6103 return true;
6104 default:
6105 return false;
6106 }
6107 }
6108
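  // A condition is only worth sinking if it is an or-reduction of a scalable
  // vector; this is what the select and branch cases below check for.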
David Sherwood346185c2025-01-06 13:17:14 +00006109 auto ShouldSinkCondition = [](Value *Cond) -> bool {
6110 auto *II = dyn_cast<IntrinsicInst>(Cond);
6111 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
6112 isa<ScalableVectorType>(II->getOperand(0)->getType());
6113 };
6114
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006115 switch (I->getOpcode()) {
6116 case Instruction::GetElementPtr:
6117 case Instruction::Add:
6118 case Instruction::Sub:
David Sherwood346185c2025-01-06 13:17:14 +00006119 // Sink vscales closer to uses for better isel
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006120 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6121 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6122 Ops.push_back(&I->getOperandUse(Op));
6123 return true;
6124 }
6125 }
6126 break;
David Sherwood346185c2025-01-06 13:17:14 +00006127 case Instruction::Select: {
6128 if (!ShouldSinkCondition(I->getOperand(0)))
6129 return false;
6130
6131 Ops.push_back(&I->getOperandUse(0));
6132 return true;
6133 }
6134 case Instruction::Br: {
6135 if (cast<BranchInst>(I)->isUnconditional())
6136 return false;
6137
6138 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
6139 return false;
6140
6141 Ops.push_back(&I->getOperandUse(0));
6142 return true;
6143 }
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006144 default:
6145 break;
6146 }
6147
6148 if (!I->getType()->isVectorTy())
6149 return false;
6150
6151 switch (I->getOpcode()) {
6152 case Instruction::Sub:
6153 case Instruction::Add: {
6154 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6155 return false;
6156
6157 // If the exts' operands extract either the lower or upper elements, we
6158 // can sink them too.
6159 auto Ext1 = cast<Instruction>(I->getOperand(0));
6160 auto Ext2 = cast<Instruction>(I->getOperand(1));
6161 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6162 Ops.push_back(&Ext1->getOperandUse(0));
6163 Ops.push_back(&Ext2->getOperandUse(0));
6164 }
6165
6166 Ops.push_back(&I->getOperandUse(0));
6167 Ops.push_back(&I->getOperandUse(1));
6168
6169 return true;
6170 }
6171 case Instruction::Or: {
6172 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6173 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6174 if (ST->hasNEON()) {
6175 Instruction *OtherAnd, *IA, *IB;
6176 Value *MaskValue;
6177 // MainAnd refers to the And instruction that has 'Not' as one of its operands
6178 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6179 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6180 m_Instruction(IA)))))) {
6181 if (match(OtherAnd,
6182 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6183 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6184 ? cast<Instruction>(I->getOperand(1))
6185 : cast<Instruction>(I->getOperand(0));
6186
6187 // Both Ands should be in the same basic block as the Or
6188 if (I->getParent() != MainAnd->getParent() ||
6189 I->getParent() != OtherAnd->getParent())
6190 return false;
6191
6192 // Non-mask operands of both Ands should also be in the same basic block
6193 if (I->getParent() != IA->getParent() ||
6194 I->getParent() != IB->getParent())
6195 return false;
6196
6197 Ops.push_back(
6198 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6199 Ops.push_back(&I->getOperandUse(0));
6200 Ops.push_back(&I->getOperandUse(1));
6201
6202 return true;
6203 }
6204 }
6205 }
6206
6207 return false;
6208 }
6209 case Instruction::Mul: {
Hari Limaye8bc95512024-12-06 12:45:18 +00006210 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6211 auto *Ty = cast<VectorType>(V->getType());
6212 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6213 if (Ty->isScalableTy())
6214 return false;
6215
6216 // Indexed variants of Mul exist for i16 and i32 element types only.
6217 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6218 };
6219
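    // Count sign- and zero-extended operands; sinking is profitable when both
    // operands of the mul are extended the same way, as this allows an
    // smull/umull to be formed.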
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006220 int NumZExts = 0, NumSExts = 0;
6221 for (auto &Op : I->operands()) {
6222 // Make sure we are not already sinking this operand
6223 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6224 continue;
6225
Hari Limaye8bc95512024-12-06 12:45:18 +00006226 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6227 auto *Ext = cast<Instruction>(Op);
6228 auto *ExtOp = Ext->getOperand(0);
6229 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6230 Ops.push_back(&Ext->getOperandUse(0));
6231 Ops.push_back(&Op);
6232
6233 if (isa<SExtInst>(Ext))
6234 NumSExts++;
6235 else
6236 NumZExts++;
6237
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006238 continue;
6239 }
6240
6241 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
Hari Limaye8bc95512024-12-06 12:45:18 +00006242 if (!Shuffle)
6243 continue;
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006244
6245 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6246 // operand and the s/zext can help create indexed s/umull. This is
6247 // especially useful to prevent an i64 mul from being scalarized.
Hari Limaye8bc95512024-12-06 12:45:18 +00006248 if (isSplatShuffle(Shuffle) &&
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006249 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6250 Ops.push_back(&Shuffle->getOperandUse(0));
6251 Ops.push_back(&Op);
6252 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6253 NumSExts++;
6254 else
6255 NumZExts++;
6256 continue;
6257 }
6258
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006259 Value *ShuffleOperand = Shuffle->getOperand(0);
6260 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6261 if (!Insert)
6262 continue;
6263
6264 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6265 if (!OperandInstr)
6266 continue;
6267
6268 ConstantInt *ElementConstant =
6269 dyn_cast<ConstantInt>(Insert->getOperand(2));
6270 // Check that the insertelement is inserting into element 0
6271 if (!ElementConstant || !ElementConstant->isZero())
6272 continue;
6273
6274 unsigned Opcode = OperandInstr->getOpcode();
6275 if (Opcode == Instruction::SExt)
6276 NumSExts++;
6277 else if (Opcode == Instruction::ZExt)
6278 NumZExts++;
6279 else {
6280 // If we find that the top bits are known 0, then we can sink and allow
6281 // the backend to generate a umull.
6282 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6283 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6284 const DataLayout &DL = I->getDataLayout();
6285 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6286 continue;
6287 NumZExts++;
6288 }
6289
David Green5a069ea2025-01-10 11:54:46 +00006290 // And(Load) is excluded to prevent CGP from getting stuck in a loop of
6291 // sinking the And, just to hoist it back to the load again.
6292 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6293 Ops.push_back(&Insert->getOperandUse(1));
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006294 Ops.push_back(&Shuffle->getOperandUse(0));
6295 Ops.push_back(&Op);
6296 }
6297
Hari Limaye8bc95512024-12-06 12:45:18 +00006298 // It is profitable to sink if we found two of the same type of extends.
6299 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6300 return true;
6301
6302 // Otherwise, see if we should sink splats for indexed variants.
6303 if (!ShouldSinkSplatForIndexedVariant(I))
6304 return false;
6305
6306 Ops.clear();
6307 if (isSplatShuffle(I->getOperand(0)))
6308 Ops.push_back(&I->getOperandUse(0));
6309 if (isSplatShuffle(I->getOperand(1)))
6310 Ops.push_back(&I->getOperandUse(1));
6311
6312 return !Ops.empty();
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006313 }
Hari Limaye4f0403f2024-11-19 12:59:22 +00006314 case Instruction::FMul: {
6315 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6316 if (I->getType()->isScalableTy())
6317 return false;
6318
6319 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6320 !ST->hasFullFP16())
6321 return false;
6322
6323 // Sink splats for index lane variants
6324 if (isSplatShuffle(I->getOperand(0)))
6325 Ops.push_back(&I->getOperandUse(0));
6326 if (isSplatShuffle(I->getOperand(1)))
6327 Ops.push_back(&I->getOperandUse(1));
6328 return !Ops.empty();
6329 }
Jeffrey Byrnes853c43d2024-10-09 14:30:09 -07006330 default:
6331 return false;
6332 }
6333 return false;
6334}