[LoopVectorize] Support vectorization of compressing patterns in VPlan
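
This patch teaches LoopVectorize to recognize compressing loop idioms:
loops that conditionally write through (or read from) a pointer or index
that only advances on the iterations where the access executes. Roughly,
the pattern looks like the following C++ sketch (mirroring the
test_store_with_pointer and test_store_with_index cases added below; the
function and variable names are illustrative only):

  // Store form: dst advances only for elements that are kept.
  void compress_ptr(int *dst, const int *src, int c, int n) {
    for (int i = 0; i < n; ++i)
      if (src[i] < c)
        *dst++ = src[i];
  }

  // Index form: the same idiom with an integer position.
  void compress_idx(int *dst, const int *src, int c, int n) {
    int k = 0;
    for (int i = 0; i < n; ++i)
      if (src[i] < c)
        dst[k++] = src[i];
  }

Legality detects such conditionally incremented variables via
MonotonicDescriptor and records them as monotonic phis. In VPlan they are
modeled by a new VPMonotonicPHIRecipe together with a
ComputeMonotonicResult VPInstruction, which advances the scalar value
each vector iteration by the number of active mask lanes times the step.
The corresponding memory accesses get a new CM_Compressed widening
decision and are emitted as llvm.masked.expandload /
llvm.masked.compressstore, costed via getExpandCompressMemoryOpCost and
the new TTI::CastContextHint::Compressed. Interleaving of loops with
monotonic phis and scalarization of monotonic phis are not supported yet
and are rejected explicitly.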
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1aed98e..bd30825 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1403,6 +1403,7 @@
Normal, ///< The cast is used with a normal load/store.
Masked, ///< The cast is used with a masked load/store.
GatherScatter, ///< The cast is used with a gather/scatter.
+ Compressed, ///< The cast is used with an expand load/compress store.
Interleave, ///< The cast is used with an interleaved load/store.
Reversed, ///< The cast is used with a reversed load/store.
};
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index d654ac3..757bff2 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -269,6 +269,10 @@
/// induction descriptor.
using InductionList = MapVector<PHINode *, InductionDescriptor>;
+ /// MonotonicPHIList saves monotonic phi variables and maps them to the
+ /// monotonic descriptor.
+ using MonotonicPHIList = MapVector<PHINode *, MonotonicDescriptor>;
+
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
@@ -304,6 +308,11 @@
/// Returns the induction variables found in the loop.
const InductionList &getInductionVars() const { return Inductions; }
+ /// Returns the monotonic phi variables found in the loop.
+ const MonotonicPHIList &getMonotonicPHIs() const { return MonotonicPHIs; }
+
+ bool hasMonotonicPHIs() const { return !MonotonicPHIs.empty(); }
+
/// Return the fixed-order recurrences found in the loop.
RecurrenceSet &getFixedOrderRecurrences() { return FixedOrderRecurrences; }
@@ -361,6 +370,12 @@
/// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
+ /// Returns true if Phi is monotonic variable.
+ bool isMonotonicPHI(PHINode *Phi) const;
+
+ /// Returns true if the access of type \p AccessTy through \p Ptr in block
+ /// \p BB is compressed when vectorizing, i.e. it can be vectorized as an
+ /// expand load or compress store.
+ bool isCompressedPtr(Type *AccessTy, Value *Ptr, BasicBlock *BB) const;
+
/// Returns true if \p V is invariant across all loop iterations according to
/// SCEV.
bool isInvariant(Value *V) const;
@@ -597,6 +612,9 @@
/// variables can be pointers.
InductionList Inductions;
+ /// Holds all of the monotonic phi variables that we found in the loop.
+ MonotonicPHIList MonotonicPHIs;
+
/// Holds all the casts that participate in the update chain of the induction
/// variables, and that have been proven to be redundant (possibly under a
/// runtime guard). These casts can be ignored when creating the vectorized
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 8e09e6f..cdfd556 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,10 @@
cl::desc("Enable recognition of non-constant strided "
"pointer induction variables."));
+static cl::opt<bool> EnableMonotonicPatterns(
+ "lv-monotonic-patterns", cl::init(true), cl::Hidden,
+ cl::desc("Enable recognition of monotonic patterns."));
+
static cl::opt<bool>
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
cl::desc("Allow enabling loop hints to reorder "
@@ -468,6 +472,30 @@
return 0;
}
+bool LoopVectorizationLegality::isMonotonicPHI(PHINode *Phi) const {
+ return MonotonicPHIs.count(Phi);
+}
+
+bool LoopVectorizationLegality::isCompressedPtr(Type *AccessTy, Value *Ptr,
+ BasicBlock *BB) const {
+ MonotonicDescriptor Desc;
+ if (!MonotonicDescriptor::isMonotonicVal(Ptr, TheLoop, Desc, *PSE.getSE()))
+ return false;
+
+ // Check that the memory operation uses the same mask as the monotonic phi.
+ // TODO: relax the restrictions of the current implementation.
+ if (Desc.getPredicateEdge() !=
+ MonotonicDescriptor::Edge(BB, BB->getUniqueSuccessor()))
+ return false;
+
+ // Check that the pointer step equals the access size.
+ auto *Step =
+ dyn_cast<SCEVConstant>(Desc.getExpr()->getStepRecurrence(*PSE.getSE()));
+ if (!Step)
+ return false;
+ return Step->getAPInt() == BB->getDataLayout().getTypeAllocSize(AccessTy);
+}
+
bool LoopVectorizationLegality::isInvariant(Value *V) const {
return LAI->isInvariant(V);
}
@@ -874,6 +902,13 @@
continue;
}
+ MonotonicDescriptor MD;
+ if (EnableMonotonicPatterns && MonotonicDescriptor::isMonotonicPHI(
+ Phi, TheLoop, MD, *PSE.getSE())) {
+ MonotonicPHIs[Phi] = MD;
+ continue;
+ }
+
if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
AllowedExit.insert(Phi);
FixedOrderRecurrences.insert(Phi);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 490d0af..32f0d8b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1095,6 +1095,7 @@
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Compressed,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -1308,9 +1309,9 @@
getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const;
- /// Returns widening decision (CM_Widen or CM_Widen_Reverse) if \p I is a
- /// memory instruction with consecutive access that can be widened, or
- /// CM_Unknown otherwise.
+ /// Returns widening decision (CM_Widen, CM_Widen_Reverse or CM_Compressed) if
+ /// \p I is a memory instruction with consecutive access that can be widened,
+ /// or CM_Unknown otherwise.
InstWidening memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
/// Returns true if \p I is a memory instruction in an interleaved-group
@@ -3263,6 +3264,9 @@
auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);
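+ // Compressed accesses use a monotonic pointer that advances only on active
+ // lanes; they are lowered to expand loads / compress stores.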
+ if (Legal->isCompressedPtr(ScalarTy, Ptr, I->getParent()))
+ return CM_Compressed;
+
// In order to be widened, the pointer should be consecutive, first of all.
auto Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
if (!Stride)
@@ -3372,9 +3376,9 @@
if (IsUniformMemOpUse(I))
return true;
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
+ return (
+ WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave || WideningDecision == CM_Compressed);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -3514,6 +3518,39 @@
AddToWorklistIfAllowed(IndUpdate);
}
+ // Handle monotonic phis (similarly to induction vars).
+ for (const auto &MonotonicPHI : Legal->getMonotonicPHIs()) {
+ auto *Phi = MonotonicPHI.first;
+ auto *PhiUpdate = cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
+ const auto &Desc = MonotonicPHI.second;
+
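+ // The phi remains uniform if every in-loop user is either part of its own
+ // update chain (the step instruction or an intermediate phi), already known
+ // to be uniform, or a memory access that is widened using the phi as its
+ // pointer.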
+ auto UniformPhi = llvm::all_of(Phi->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ if (I == Desc.getStepInst())
+ return true;
+ if (auto *PN = dyn_cast<PHINode>(I); PN && Desc.getChain().contains(PN))
+ return true;
+ return !TheLoop->contains(I) || Worklist.count(I) ||
+ IsVectorizedMemAccessUse(I, Phi);
+ });
+ if (!UniformPhi)
+ continue;
+
+ auto UniformPhiUpdate =
+ llvm::all_of(PhiUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ if (I == Phi)
+ return true;
+ return !TheLoop->contains(I) || Worklist.count(I) ||
+ IsVectorizedMemAccessUse(I, Phi);
+ });
+ if (!UniformPhiUpdate)
+ continue;
+
+ AddToWorklistIfAllowed(Phi);
+ AddToWorklistIfAllowed(PhiUpdate);
+ }
+
Uniforms[VF].insert_range(Worklist);
}
@@ -4272,6 +4309,7 @@
case VPDef::VPEVLBasedIVPHISC:
case VPDef::VPPredInstPHISC:
case VPDef::VPBranchOnMaskSC:
+ case VPDef::VPMonotonicPHISC:
continue;
case VPDef::VPReductionSC:
case VPDef::VPActiveLaneMaskPHISC:
@@ -4992,6 +5030,10 @@
if (Legal->hasUncountableEarlyExit())
return 1;
+ // Loops with monotonic phis do not support interleaving.
+ if (Legal->hasMonotonicPHIs())
+ return 1;
+
const bool HasReductions = !Legal->getReductionVars().empty();
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -5577,12 +5619,17 @@
Instruction *I, ElementCount VF, InstWidening Decision) {
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ if (Decision == CM_Compressed)
+ return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+ /*VariableMask*/ true, Alignment,
+ CostKind, I);
+
assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
"Expected widen decision.");
- const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
if (Legal->isMaskRequired(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6292,6 +6339,11 @@
// the scalar version.
if (isUniformAfterVectorization(I, VF))
VF = ElementCount::getFixed(1);
+ else if (auto *Phi = dyn_cast<PHINode>(I)) {
+ // Prohibit scalarization of monotonic phis.
+ if (Legal->isMonotonicPHI(Phi))
+ return InstructionCost::getInvalid();
+ }
if (VF.isVector() && isProfitableToScalarize(I, VF))
return InstsToScalarize[VF][I];
@@ -6647,6 +6699,8 @@
switch (getWideningDecision(I, VF)) {
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
+ case LoopVectorizationCostModel::CM_Compressed:
+ return TTI::CastContextHint::Compressed;
case LoopVectorizationCostModel::CM_Interleave:
return TTI::CastContextHint::Interleave;
case LoopVectorizationCostModel::CM_Scalarize:
@@ -7238,6 +7292,16 @@
}
}
+ for (const auto &[MonotonicPhi, MonotonicDesc] : Legal->getMonotonicPHIs()) {
+ // TODO: currently, we restrict vectorization of non-uniform monotonic phis
+ // by reporting an Invalid cost for them. This can be relaxed in the future.
+ if (VF.isVector() && !CM.isUniformAfterVectorization(MonotonicPhi, VF))
+ Cost = InstructionCost::getInvalid();
+ else
+ Cost += TTI.getCFInstrCost(Instruction::PHI, CostCtx.CostKind);
+ CostCtx.SkipCostComputation.insert(MonotonicPhi);
+ }
+
// Pre-compute the costs for branches except for the backedge, as the number
// of replicate regions in a VPlan may not directly match the number of
// branches, which would lead to different decisions.
@@ -8229,8 +8293,9 @@
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+ bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
bool Consecutive =
- Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ Reverse || Compressed || Decision == LoopVectorizationCostModel::CM_Widen;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -8258,11 +8323,12 @@
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ Compressed, VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, VPIRMetadata(*Store, LVer),
+ Reverse, Compressed, VPIRMetadata(*Store, LVer),
I->getDebugLoc());
}
@@ -8771,11 +8837,19 @@
return Recipe;
VPHeaderPHIRecipe *PhiRecipe = nullptr;
- assert((Legal->isReductionVariable(Phi) ||
+ assert((Legal->isMonotonicPHI(Phi) || Legal->isReductionVariable(Phi) ||
Legal->isFixedOrderRecurrence(Phi)) &&
- "can only widen reductions and fixed-order recurrences here");
+ "can only widen monotonic phis, reductions and fixed-order "
+ "recurrences here");
VPValue *StartV = Operands[0];
- if (Legal->isReductionVariable(Phi)) {
+ Value *IncomingVal =
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader());
+ if (Legal->isMonotonicPHI(Phi)) {
+ const MonotonicDescriptor &Desc =
+ Legal->getMonotonicPHIs().find(Phi)->second;
+ assert(Desc.getExpr()->getStart() == PSE.getSCEV(IncomingVal));
+ PhiRecipe = new VPMonotonicPHIRecipe(Phi, Desc, StartV);
+ } else if (Legal->isReductionVariable(Phi)) {
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(Phi)->second;
assert(RdxDesc.getRecurrenceStartValue() ==
@@ -9397,6 +9471,27 @@
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------
+ // Adjust the recipes for any monotonic phis.
+ for (VPRecipeBase &R : HeaderVPBB->phis()) {
+ auto *MonotonicPhi = dyn_cast<VPMonotonicPHIRecipe>(&R);
+ if (!MonotonicPhi)
+ continue;
+
+ auto &Desc = MonotonicPhi->getDescriptor();
+ auto [EdgeSrc, EdgeDst] = Desc.getPredicateEdge();
+ auto &SE = *PSE.getSE();
+ auto *Step = vputils::getOrCreateVPValueForSCEVExpr(
+ *Plan, Desc.getExpr()->getStepRecurrence(SE), SE);
+
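+ // Replace the scalar update on the backedge with a ComputeMonotonicResult
+ // that advances the phi by the number of active mask lanes times the step.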
+ auto *MonotonicI = new VPInstruction(
+ VPInstruction::ComputeMonotonicResult,
+ {MonotonicPhi, RecipeBuilder.getEdgeMask(EdgeSrc, EdgeDst), Step},
+ *Desc.getStepInst());
+ auto *InsertBlock = MonotonicPhi->getBackedgeRecipe().getParent();
+ InsertBlock->insert(MonotonicI, InsertBlock->getFirstNonPhi());
+ MonotonicPhi->getBackedgeValue()->replaceAllUsesWith(MonotonicI);
+ }
+
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
@@ -10587,6 +10682,15 @@
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
unsigned SelectedIC = std::max(IC, UserIC);
+
+ if (LVL.hasMonotonicPHIs() && SelectedIC > 1) {
+ reportVectorizationFailure(
+ "Interleaving of loop with monotonic vars",
+ "Interleaving of loops with monotonic vars is not supported",
+ "CantInterleaveWithMonotonicVars", ORE, L);
+ return false;
+ }
+
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
if (VF.Width.isVector() || SelectedIC > 1)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 06b738a..f5b2667 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -308,10 +308,11 @@
VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, LastLane)) {
- // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
- // VPExpandSCEVRecipes can also be a single scalar.
+ // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes,
+ // VPMonotonicPHIRecipes and VPExpandSCEVRecipes can also be a single scalar.
assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe,
- VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
+ VPMonotonicPHIRecipe, VPExpandSCEVRecipe>(
+ Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsSingleScalar = true;
LastLane = 0;
@@ -1005,6 +1006,7 @@
auto *PhiR = cast<VPSingleDefRecipe>(&R);
// VPInstructions currently model scalar Phis only.
bool NeedsScalar = isa<VPInstruction>(PhiR) ||
+ isa<VPMonotonicPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
Value *Phi = State->get(PhiR, NeedsScalar);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e634de1..9ce743d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -539,6 +539,7 @@
case VPRecipeBase::VPWidenIntOrFpInductionSC:
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
+ case VPRecipeBase::VPMonotonicPHISC:
case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
@@ -900,6 +901,7 @@
Broadcast,
ComputeFindLastIVResult,
ComputeReductionResult,
+ ComputeMonotonicResult,
// Extracts the last lane from its operand if it is a vector, or the last
// part if scalar. In the latter case, the recipe will be removed during
// unrolling.
@@ -965,6 +967,11 @@
#endif
public:
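+ /// Create a VPInstruction with the given \p Opcode and \p Operands, taking
+ /// the IR flags, underlying value and debug location from \p I.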
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, Instruction &I,
+ const Twine &Name = "")
+ : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, I),
+ Opcode(Opcode), Name(Name.str()) {}
+
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
const Twine &Name = "")
: VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
@@ -2249,6 +2256,50 @@
}
};
+/// A recipe for handling monotonic phis. The start value is the first operand
+/// of the recipe and the incoming value from the backedge is the second
+/// operand.
+class VPMonotonicPHIRecipe : public VPHeaderPHIRecipe {
+ MonotonicDescriptor Desc;
+
+public:
+ VPMonotonicPHIRecipe(PHINode *Phi, const MonotonicDescriptor &Desc,
+ VPValue *Start)
+ : VPHeaderPHIRecipe(VPDef::VPMonotonicPHISC, Phi, Start), Desc(Desc) {}
+
+ ~VPMonotonicPHIRecipe() override = default;
+
+ VPMonotonicPHIRecipe *clone() override {
+ auto *R = new VPMonotonicPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+ Desc, getStartValue());
+ R->addOperand(getBackedgeValue());
+ return R;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPMonotonicPHISC)
+
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPDef::VPMonotonicPHISC;
+ }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ const MonotonicDescriptor &getDescriptor() const { return Desc; }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
@@ -2974,6 +3025,9 @@
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
+ /// Whether the consecutive accessed addresses are compressed according to
+ /// the mask.
+ bool Compressed;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -2987,11 +3041,12 @@
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, bool Compressed,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
- Consecutive(Consecutive), Reverse(Reverse) {
+ Consecutive(Consecutive), Reverse(Reverse), Compressed(Compressed) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+ assert((Consecutive || !Compressed) && "Compressed implies consecutive");
}
public:
@@ -3018,6 +3073,9 @@
/// order.
bool isReverse() const { return Reverse; }
+ /// Return whether the consecutive loaded/stored addresses are compressed.
+ bool isCompressed() const { return Compressed; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -3047,18 +3105,18 @@
/// optional mask.
struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, bool Compressed,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Metadata, DL),
+ Reverse, Compressed, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, *this,
- getDebugLoc());
+ getMask(), Consecutive, Reverse, Compressed,
+ *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3089,7 +3147,8 @@
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
{L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L, L.getDebugLoc()),
+ L.isReverse(), L.isCompressed(), L,
+ L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3127,16 +3186,16 @@
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
- const VPIRMetadata &Metadata, DebugLoc DL)
+ bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Metadata, DL) {
+ Consecutive, Reverse, Compressed, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, *this, getDebugLoc());
+ Reverse, Compressed, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3170,8 +3229,8 @@
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S,
- S.getDebugLoc()) {
+ S.isConsecutive(), S.isReverse(), S.isCompressed(),
+ S, S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index ac0f30c..a562214 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -93,6 +93,11 @@
auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
return OrigPhi->getType();
}
+ case VPInstruction::ComputeMonotonicResult: {
+ auto *PhiR = cast<VPMonotonicPHIRecipe>(R->getOperand(0));
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ return OrigPhi->getType();
+ }
case VPInstruction::ExplicitVectorLength:
return Type::getIntNTy(Ctx, 32);
case Instruction::PHI:
@@ -266,14 +271,14 @@
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
- VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
- [this](const auto *R) {
- // Handle header phi recipes, except VPWidenIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ VPMonotonicPHIRecipe, VPWidenPointerInductionRecipe,
+ VPEVLBasedIVPHIRecipe>([this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling due it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 14ed40f..cc6c839 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -722,6 +722,34 @@
return ReducedPartRdx;
}
+ case VPInstruction::ComputeMonotonicResult: {
+ assert(getParent()->getPlan()->getUF() == 1 &&
+ "Expected unroll factor of 1.");
+
+ auto *Phi = State.get(getOperand(0), /*IsScalar*/ true);
+ auto *PhiTy = Phi->getType();
+ Value *Mask = State.get(getOperand(1), /*IsScalar*/ false);
+ auto *MaskTy = Mask->getType();
+ assert(isa<VectorType>(MaskTy) &&
+ cast<VectorType>(MaskTy)->getElementType()->isIntegerTy(1) &&
+ "Mask type should be <N x i1>");
+
+ const auto &DL = State.CFG.PrevBB->getDataLayout();
+ auto *IntTy = PhiTy->isIntegerTy() ? PhiTy : DL.getIndexType(PhiTy);
+
+ auto *Step = State.get(getOperand(2), /*IsScalar*/ true);
+
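+ // Count the active mask lanes with an add reduction and advance the
+ // monotonic value by that count times the step.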
+ auto &Builder = State.Builder;
+ auto *NumElems = Builder.CreateAddReduce(
+ Builder.CreateZExt(Mask, MaskTy->getWithNewType(IntTy)));
+ auto *Offset = Builder.CreateMul(NumElems, Step);
+
+ return PhiTy->isPointerTy()
+ ? Builder.CreatePtrAdd(Phi, Offset, "monotonic.add",
+ getGEPNoWrapFlags())
+ : Builder.CreateAdd(Phi, Offset, "monotonic.add",
+ hasNoUnsignedWrap(), hasNoSignedWrap());
+ }
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
@@ -840,6 +868,12 @@
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
+ case VPInstruction::ComputeMonotonicResult: {
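+ // Approximate the cost with the add reduction used to count the active
+ // mask lanes.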
+ Type *ElementTy = Ctx.Types.inferScalarType(getOperand(0));
+ auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
+ return Ctx.TTI.getArithmeticReductionCost(Instruction::Add, VectorTy,
+ std::nullopt, Ctx.CostKind);
+ }
default:
// TODO: Compute cost other VPInstructions once the legacy cost model has
// been retired.
@@ -856,6 +890,7 @@
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeFindLastIVResult ||
getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::ComputeMonotonicResult ||
getOpcode() == VPInstruction::AnyOf;
}
@@ -1053,6 +1088,9 @@
case VPInstruction::ComputeReductionResult:
O << "compute-reduction-result";
break;
+ case VPInstruction::ComputeMonotonicResult:
+ O << "compute-monotonic-result";
+ break;
case VPInstruction::LogicalAnd:
O << "logical-and";
break;
@@ -2933,8 +2971,12 @@
InstructionCost Cost = 0;
if (IsMasked) {
- Cost +=
- Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
+ Cost += Compressed
+ ? Ctx.TTI.getExpandCompressMemoryOpCost(Opcode, Ty,
+ /*VariableMask*/ true,
+ Alignment, Ctx.CostKind)
+ : Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS,
+ Ctx.CostKind);
} else {
TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
@@ -2972,9 +3014,13 @@
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- NewLI =
- Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy), "wide.masked.load");
+ NewLI = Compressed
+ ? Builder.CreateMaskedExpandLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.expand.load")
+ : Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -3107,7 +3153,10 @@
if (CreateScatter)
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
else if (Mask)
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ NewSI = Compressed
+ ? Builder.CreateMaskedCompressStore(StoredVal, Addr, Alignment,
+ Mask)
+ : Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
else
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
applyMetadata(*NewSI);
@@ -3907,6 +3956,29 @@
}
#endif
+void VPMonotonicPHIRecipe::execute(VPTransformState &State) {
+ assert(getParent()->getPlan()->getUF() == 1 && "Expected unroll factor 1.");
+ Value *Start = getStartValue()->getLiveInIRValue();
+ BasicBlock *VectorPH =
+ State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
+ PHINode *MonotonicPHI =
+ State.Builder.CreatePHI(Start->getType(), 2, "monotonic.iv");
+ MonotonicPHI->addIncoming(Start, VectorPH);
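+ // Only the preheader incoming value is created here; the backedge value is
+ // added later when VPlan fixes up the header phis in the vector loop.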
+ MonotonicPHI->setDebugLoc(getDebugLoc());
+ State.set(this, MonotonicPHI, /*IsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPMonotonicPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "MONOTONIC-PHI ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPWidenPHIRecipe::execute(VPTransformState &State) {
assert(EnableVPlanNativePath &&
"Non-native vplans are not expected to have VPWidenPHIRecipes.");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8c8297b..d2a3eef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -80,13 +80,14 @@
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
- Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, false /*Compressed*/,
+ VPIRMetadata(*Load), Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- VPIRMetadata(*Store), Ingredient.getDebugLoc());
+ false /*Compressed*/, VPIRMetadata(*Store),
+ Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -3063,7 +3064,8 @@
auto *L = new VPWidenLoadRecipe(
*cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+ /*Reverse=*/false, /*Compressed=*/false, {},
+ LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
return L;
}
@@ -3095,7 +3097,7 @@
auto *S = new VPWidenStoreRecipe(
*cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, /*Compressed=*/false, {}, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 64065ed..29f2864 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -371,12 +371,13 @@
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
VPReductionPHISC,
+ VPMonotonicPHISC,
// END: SubclassID for recipes that inherit VPHeaderPHIRecipe
// END: Phi-like recipes
VPFirstPHISC = VPWidenPHISC,
VPFirstHeaderPHISC = VPCanonicalIVPHISC,
- VPLastHeaderPHISC = VPReductionPHISC,
- VPLastPHISC = VPReductionPHISC,
+ VPLastHeaderPHISC = VPMonotonicPHISC,
+ VPLastPHISC = VPMonotonicPHISC,
};
VPDef(const unsigned char SC) : SubclassID(SC) {}
diff --git a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll
index 1390092..3d2dd45 100644
--- a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll
+++ b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll
@@ -5,18 +5,54 @@
; CHECK-LABEL: define void @test_store_with_pointer(
; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP12]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi ptr [ [[DST]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[MONOTONIC_IV]], i32 0
+; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP4]], <4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT: [[TMP9]] = getelementptr inbounds i8, ptr [[MONOTONIC_IV]], i64 [[TMP8]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi ptr [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[FOR_BODY_PREHEADER]] ], [ [[DST]], %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT: [[DST_ADDR_09:%.*]] = phi ptr [ [[DST]], %[[FOR_BODY_PREHEADER]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[DST_ADDR_09:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]]
@@ -29,7 +65,7 @@
; CHECK-NEXT: [[DST_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[DST_ADDR_09]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
%cmp8 = icmp sgt i32 %n, 0
@@ -69,18 +105,56 @@
; CHECK-LABEL: define void @test_store_with_index(
; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[MONOTONIC_IV]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP13]], <4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 1
+; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP17]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP20]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]]
@@ -95,7 +169,7 @@
; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -141,14 +215,54 @@
; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP28]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP28]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[MONOTONIC_IV]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_EXPAND_LOAD:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 [[TMP4]], <4 x i1> [[TMP3]], <4 x i32> poison), !alias.scope [[META9]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP27]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_EXPAND_LOAD]], ptr [[TMP5]], i32 4, <4 x i1> [[TMP3]]), !alias.scope [[META6]], !noalias [[META9]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64>
+; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP21]])
+; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; CHECK-NEXT: [[TMP24]] = getelementptr inbounds i8, ptr [[MONOTONIC_IV]], i64 [[TMP23]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[FOR_BODY_PREHEADER]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT: [[SRC_ADDR_09:%.*]] = phi ptr [ [[SRC]], %[[FOR_BODY_PREHEADER]] ], [ [[SRC_ADDR_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[SRC_ADDR_09:%.*]] = phi ptr [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SRC_ADDR_1:%.*]], %[[FOR_INC]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]]
@@ -162,7 +276,7 @@
; CHECK-NEXT: [[SRC_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ [[SRC_ADDR_09]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
;
entry:
%cmp8 = icmp sgt i32 %n, 0
@@ -207,14 +321,56 @@
; CHECK-NEXT: br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META16:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[MONOTONIC_IV]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_EXPAND_LOAD:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 [[TMP13]], <4 x i1> [[TMP3]], <4 x i32> poison), !alias.scope [[META16]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP36]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_EXPAND_LOAD]], ptr [[TMP7]], i32 4, <4 x i1> [[TMP3]]), !alias.scope [[META13]], !noalias [[META16]]
+; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[TMP31]], 1
+; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP32]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[IDX_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1:%.*]], %[[FOR_INC]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]]
@@ -230,7 +386,7 @@
; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_012]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
;
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -339,20 +495,58 @@
; CHECK-LABEL: define i32 @test_multiple_uses(
; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP12]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP13]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[MONOTONIC_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[MONOTONIC_ADD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[MONOTONIC_IV]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP7]], <4 x i1> [[TMP4]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 1
+; CHECK-NEXT: [[MONOTONIC_ADD]] = add nsw i32 [[MONOTONIC_IV]], [[TMP11]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
-; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT: [[IDX_1_LCSSA:%.*]] = phi i32 [ [[IDX_1:%.*]], %[[FOR_INC:.*]] ], [ [[MONOTONIC_ADD]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IDX_1_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]]
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ]
-; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IDX_1]], %[[FOR_INC]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT: [[IDX_013:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[IDX_1]], %[[FOR_INC]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]]
@@ -367,7 +561,7 @@
; CHECK-NEXT: [[IDX_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[IDX_013]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
;
entry:
%cmp12 = icmp sgt i32 %n, 0
@@ -478,3 +672,27 @@
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK: [[META6]] = !{[[META7:![0-9]+]]}
+; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]}
+; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"}
+; CHECK: [[META9]] = !{[[META10:![0-9]+]]}
+; CHECK: [[META10]] = distinct !{[[META10]], [[META8]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]}
+; CHECK: [[META13]] = !{[[META14:![0-9]+]]}
+; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]}
+; CHECK: [[META15]] = distinct !{[[META15]], !"LVerDomain"}
+; CHECK: [[META16]] = !{[[META17:![0-9]+]]}
+; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]}
+; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]}
+; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]]}
+;.
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index f0d943f..79195f4 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1084,7 +1084,7 @@
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {}, {});
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1201,7 +1201,7 @@
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, false, {}, {});
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1215,8 +1215,8 @@
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
- VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {},
- {});
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, false,
+ {}, {});
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());