[VPlan] Add hasScalarTail, use instead of !CM.foldTailByMasking() (NFC). (#134674)
Now that VPlan is able to fold away redundant branches to the scalar
preheader, we can directly check in VPlan if the scalar tail may
execute. hasScalarTail returns true if the tail may execute.
We know that the scalar tail won't execute if the scalar preheader
doesn't have any predecessors, i.e. it is not reachable.
This removes some late uses of the legacy cost model.
PR: https://github.com/llvm/llvm-project/pull/134674
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index f80379b..8f6a73d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -535,13 +535,13 @@
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B.
bool isMoreProfitable(const VectorizationFactor &A,
- const VectorizationFactor &B) const;
+ const VectorizationFactor &B, bool HasTail) const;
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
- const unsigned MaxTripCount) const;
+ const unsigned MaxTripCount, bool HasTail) const;
/// Determines if we have the infrastructure to vectorize the loop and its
/// epilogue, assuming the main loop is vectorized by \p VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a4d546f..249e8fc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,9 +4256,10 @@
return EstimatedVF;
}
-bool LoopVectorizationPlanner::isMoreProfitable(
- const VectorizationFactor &A, const VectorizationFactor &B,
- const unsigned MaxTripCount) const {
+bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
+ const VectorizationFactor &B,
+ const unsigned MaxTripCount,
+ bool HasTail) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;
@@ -4296,9 +4297,9 @@
if (!MaxTripCount)
return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
- auto GetCostForTC = [MaxTripCount, this](unsigned VF,
- InstructionCost VectorCost,
- InstructionCost ScalarCost) {
+ auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
+ InstructionCost VectorCost,
+ InstructionCost ScalarCost) {
// If the trip count is a known (possibly small) constant, the trip count
// will be rounded up to an integer number of iterations under
// FoldTailByMasking. The total cost in that case will be
@@ -4307,9 +4308,10 @@
// some extra overheads, but for the purpose of comparing the costs of
// different VFs we can use this to compare the total loop-body cost
// expected after vectorization.
- if (CM.foldTailByMasking())
- return VectorCost * divideCeil(MaxTripCount, VF);
- return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
+ if (HasTail)
+ return VectorCost * (MaxTripCount / VF) +
+ ScalarCost * (MaxTripCount % VF);
+ return VectorCost * divideCeil(MaxTripCount, VF);
};
auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
@@ -4317,10 +4319,12 @@
return CmpFn(RTCostA, RTCostB);
}
-bool LoopVectorizationPlanner::isMoreProfitable(
- const VectorizationFactor &A, const VectorizationFactor &B) const {
+bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
+ const VectorizationFactor &B,
+ bool HasTail) const {
const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
- return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
+ return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
+ HasTail);
}
void LoopVectorizationPlanner::emitInvalidCostRemarks(
@@ -4609,7 +4613,7 @@
continue;
}
- if (isMoreProfitable(Candidate, ChosenFactor))
+ if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
ChosenFactor = Candidate;
}
}
@@ -4623,7 +4627,8 @@
}
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
- !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
+ !isMoreProfitable(ChosenFactor, ScalarCost,
+ !CM.foldTailByMasking())) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
return ChosenFactor;
@@ -4789,7 +4794,7 @@
}
if (Result.Width.isScalar() ||
- isMoreProfitable(NextVF, Result, MaxTripCount))
+ isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
Result = NextVF;
}
@@ -7768,11 +7773,11 @@
InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
- if (isMoreProfitable(CurrentFactor, BestFactor))
+ if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;
// If profitable add it to ProfitableVF list.
- if (isMoreProfitable(CurrentFactor, ScalarFactor))
+ if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
ProfitableVFs.push_back(CurrentFactor);
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 7cdcb24..60c9b1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3790,6 +3790,13 @@
bool hasEarlyExit() const {
return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1;
}
+
+ /// Returns true if the scalar tail may execute after the vector loop, i.e.
+ /// if the scalar preheader still has predecessors. Note that this relies on
+ /// unneeded branches to the scalar tail loop being removed.
+ bool hasScalarTail() const {
+ return getScalarPreheader()->getNumPredecessors() != 0;
+ }
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)