28#define DEBUG_TYPE "loop-vectorize"
34 cl::desc(
"Maximize bandwidth when selecting vectorization factor which "
35 "will be determined by the smallest type in loop."));
38 "vectorizer-maximize-bandwidth-for-vector-calls",
cl::init(
true),
40 cl::desc(
"Try wider VFs if they enable the use of vector variants"));
44 cl::desc(
"Discard VFs if their register pressure is too high."));
49 "Pretend that scalable vectors are supported, even if the target does "
50 "not support them. This flag should only be used for testing."));
54 cl::desc(
"Prefer in-loop vector reductions, "
55 "overriding the targets preference."));
61 cl::desc(
"Assume the target supports masked memory operations (used for "
66 cl::desc(
"Assume the target supports gather/scatter operations (used for "
78 : TTI.isLegalMaskedStore(Ty, Alignment, AS));
92 (LI && TTI.isLegalMaskedGather(Ty,
Align)) ||
93 (
SI && TTI.isLegalMaskedScatter(Ty,
Align));
100bool VFSelectionContext::useMaxBandwidth(
bool IsScalable)
const {
105 (
TTI.shouldMaximizeVectorBandwidth(RegKind) ||
107 Legal->hasVectorCallVariants())));
116 if (TTI.shouldConsiderVectorizationRegPressure())
123 VF, VF.
isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
124 : MaxPermissibleVFWithoutMaxBW.FixedVF);
128 ElementCount VF,
unsigned MaxTripCount,
unsigned UserIC,
129 bool FoldTailByMasking,
bool RequiresScalarEpilogue)
const {
131 if (VF.
isScalable() &&
F.hasFnAttribute(Attribute::VScaleRange)) {
132 auto Attr =
F.getFnAttribute(Attribute::VScaleRange);
133 auto Min = Attr.getVScaleRangeMin();
140 if (MaxTripCount > 0 && RequiresScalarEpilogue)
145 unsigned IC = UserIC > 0 ? UserIC : 1;
146 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
148 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
156 if (ClampedUpperTripCount == 0)
157 ClampedUpperTripCount = 1;
158 LLVM_DEBUG(
dbgs() <<
"LV: Clamping the MaxVF to maximum power of two not "
159 "exceeding the constant trip count"
160 << (UserIC > 0 ?
" divided by UserIC" :
"") <<
": "
161 << ClampedUpperTripCount <<
"\n");
168ElementCount VFSelectionContext::getMaximizedVFForTarget(
169 unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
170 ElementCount MaxSafeVF,
unsigned UserIC,
bool FoldTailByMasking,
171 bool RequiresScalarEpilogue) {
172 bool ComputeScalableMaxVF = MaxSafeVF.
isScalable();
173 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
178 auto MinVF = [](
const ElementCount &
LHS,
const ElementCount &
RHS) {
180 "Scalable flags must match");
188 ComputeScalableMaxVF);
189 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
191 << (MaxVectorElementCount * WidestType) <<
" bits.\n");
193 if (!MaxVectorElementCount) {
195 << (ComputeScalableMaxVF ?
"scalable" :
"fixed")
196 <<
" vector registers.\n");
201 clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC,
202 FoldTailByMasking, RequiresScalarEpilogue);
205 if (MaxVF != MaxVectorElementCount)
209 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
211 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
213 if (useMaxBandwidth(ComputeScalableMaxVF)) {
216 ComputeScalableMaxVF);
217 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
219 if (ElementCount MinVF =
220 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
223 <<
") with target's minimum: " << MinVF <<
'\n');
228 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC,
229 FoldTailByMasking, RequiresScalarEpilogue);
236 if (std::optional<unsigned> MaxVScale =
TTI.getMaxVScale())
239 if (
F.hasFnAttribute(Attribute::VScaleRange))
240 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
245bool VFSelectionContext::isScalableVectorizationAllowed() {
246 if (IsScalableVectorizationAllowed)
247 return *IsScalableVectorizationAllowed;
249 IsScalableVectorizationAllowed =
false;
255 "ScalableVectorizationDisabled", ORE, TheLoop);
259 LLVM_DEBUG(
dbgs() <<
"LV: Scalable vectorization is available\n");
262 std::numeric_limits<ElementCount::ScalarTy>::max());
271 if (!
all_of(Legal->getReductionVars(), [&](
const auto &
Reduction) ->
bool {
272 return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF);
275 "Scalable vectorization not supported for the reduction "
276 "operations found in this loop.",
277 "ScalableVFUnfeasible", ORE, TheLoop);
283 if (
any_of(ElementTypesInLoop, [&](
Type *Ty) {
284 return !Ty->
isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
287 "for all element types found in this loop.",
288 "ScalableVFUnfeasible", ORE, TheLoop);
292 if (!Legal->isSafeForAnyVectorWidth() && !
getMaxVScale(F, TTI)) {
294 "for safe distance analysis.",
295 "ScalableVFUnfeasible", ORE, TheLoop);
299 IsScalableVectorizationAllowed =
true;
304VFSelectionContext::getMaxLegalScalableVF(
unsigned MaxSafeElements) {
305 if (!isScalableVectorizationAllowed())
309 std::numeric_limits<ElementCount::ScalarTy>::max());
310 if (Legal->isSafeForAnyVectorWidth())
311 return MaxScalableVF;
313 std::optional<unsigned> MaxVScale =
getMaxVScale(F, TTI);
319 "Max legal vector width too small, scalable vectorization "
321 "ScalableVFUnfeasible", ORE, TheLoop);
323 return MaxScalableVF;
327 unsigned MaxTripCount,
ElementCount UserVF,
unsigned UserIC,
328 bool FoldTailByMasking,
bool RequiresScalarEpilogue) {
335 unsigned MaxSafeElementsPowerOf2 =
337 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
338 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
339 MaxSafeElementsPowerOf2 =
340 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
344 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
346 if (!Legal->isSafeForAnyVectorWidth())
347 MaxSafeElements = MaxSafeElementsPowerOf2;
349 LLVM_DEBUG(
dbgs() <<
"LV: The max safe fixed VF is: " << MaxSafeFixedVF
351 LLVM_DEBUG(
dbgs() <<
"LV: The max safe scalable VF is: " << MaxSafeScalableVF
357 UserVF.
isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
374 <<
" is unsafe, clamping to max safe VF="
375 << MaxSafeFixedVF <<
".\n");
378 TheLoop->getStartLoc(),
379 TheLoop->getHeader())
380 <<
"User-specified vectorization factor "
381 <<
ore::NV(
"UserVectorizationFactor", UserVF)
382 <<
" is unsafe, clamping to maximum safe vectorization factor "
383 <<
ore::NV(
"VectorizationFactor", MaxSafeFixedVF);
385 return MaxSafeFixedVF;
390 <<
" is ignored because scalable vectors are not "
394 TheLoop->getStartLoc(),
395 TheLoop->getHeader())
396 <<
"User-specified vectorization factor "
397 <<
ore::NV(
"UserVectorizationFactor", UserVF)
398 <<
" is ignored because the target does not support scalable "
399 "vectors. The compiler will pick a more suitable value.";
403 <<
" is unsafe. Ignoring scalable UserVF.\n");
406 TheLoop->getStartLoc(),
407 TheLoop->getHeader())
408 <<
"User-specified vectorization factor "
409 <<
ore::NV(
"UserVectorizationFactor", UserVF)
410 <<
" is unsafe. Ignoring the hint to let the compiler pick a "
411 "more suitable value.";
416 LLVM_DEBUG(
dbgs() <<
"LV: The Smallest and Widest types: " << SmallestType
417 <<
" / " << WidestType <<
" bits.\n");
421 if (
auto MaxVF = getMaximizedVFForTarget(
422 MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC,
423 FoldTailByMasking, RequiresScalarEpilogue))
424 Result.FixedVF = MaxVF;
426 if (
auto MaxVF = getMaximizedVFForTarget(
427 MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC,
428 FoldTailByMasking, RequiresScalarEpilogue))
430 Result.ScalableVF = MaxVF;
438std::pair<unsigned, unsigned>
440 unsigned MinWidth = -1U;
441 unsigned MaxWidth = 8;
446 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
447 for (
const auto &[
_, RdxDesc] : Legal->getReductionVars()) {
452 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
453 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
454 MaxWidth = std::max(MaxWidth,
455 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
458 for (
Type *
T : ElementTypesInLoop) {
459 MinWidth = std::min<unsigned>(
460 MinWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
461 MaxWidth = std::max<unsigned>(
462 MaxWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
465 return {MinWidth, MaxWidth};
470 ElementTypesInLoop.clear();
478 if (ValuesToIgnore && ValuesToIgnore->
contains(&
I))
488 if (!Legal->isReductionVariable(PN))
491 Legal->getRecurrenceDescriptor(PN);
501 T = ST->getValueOperand()->getType();
504 "Expected the load/store/recurrence type to be sized");
506 ElementTypesInLoop.insert(
T);
511void VFSelectionContext::initializeVScaleForTuning() {
515 if (
F.hasFnAttribute(Attribute::VScaleRange)) {
516 auto Attr =
F.getFnAttribute(Attribute::VScaleRange);
517 auto Min = Attr.getVScaleRangeMin();
518 auto Max = Attr.getVScaleRangeMax();
519 if (Max && Min == Max) {
520 VScaleForTuning = Max;
525 VScaleForTuning = TTI.getVScaleForTuning();
530 return !Hints->allowReordering() && RdxDesc.
isOrdered();
536 Loop *L =
const_cast<Loop *
>(TheLoop);
537 if (Legal->getRuntimePointerChecking()->Need) {
539 "Runtime ptr check is required with -Os/-Oz",
540 "runtime pointer checks needed. Enable vectorization of this "
541 "loop with '#pragma clang loop vectorize(enable)' when "
542 "compiling with -Os/-Oz",
543 "CantVersionLoopWithOptForSize", ORE, L);
547 if (!PSE.getPredicate().isAlwaysTrue()) {
549 "Runtime SCEV check is required with -Os/-Oz",
550 "runtime SCEV checks needed. Enable vectorization of this "
551 "loop with '#pragma clang loop vectorize(enable)' when "
552 "compiling with -Os/-Oz",
553 "CantVersionLoopWithOptForSize", ORE, L);
558 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
560 "Runtime stride check for small trip count",
561 "runtime stride == 1 checks needed. Enable vectorization of "
562 "this loop without such check by compiling with -Os/-Oz",
563 "CantVersionLoopWithOptForSize", ORE, L);
576 if (!InLoopReductions.empty())
579 for (
const auto &Reduction : Legal->getReductionVars()) {
580 PHINode *Phi = Reduction.first;
602 !TTI.preferInLoopReduction(Kind, Phi->getType()))
610 bool InLoop = !ReductionOperations.
empty();
613 InLoopReductions.insert(Phi);
616 for (
auto *
I : ReductionOperations) {
617 InLoopReductionImmediateChains[
I] = LastChain;
621 LLVM_DEBUG(
dbgs() <<
"LV: Using " << (InLoop ?
"inloop" :
"out of loop")
622 <<
" reduction for phi: " << *Phi <<
"\n");
635 "Scalable vectorization requested but not supported by the target",
636 "the scalable user-specified vectorization width for outer-loop "
637 "vectorization cannot be used because the target does not support "
639 "ScalableVFUnfeasible", ORE, TheLoop);
647 auto RegKind = TTI.enableScalableVectorization()
652 unsigned N =
RegSize.getKnownMinValue() / WidestType;
659 <<
"overriding computed VF.\n");
664 "VF needs to be a power of two");
668 <<
"VF " << VF <<
" to build VPlans.\n");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
static cl::opt< bool > ForceTargetSupportsGatherScatterOps("force-target-supports-gather-scatter-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports gather/scatter operations (used for " "testing)."))
cl::opt< bool > VPlanBuildOuterloopStressTest
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< bool > ForceTargetSupportsMaskedMemoryOps("force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports masked memory operations (used for " "testing)."))
Note: This currently only applies to llvm.masked.load and llvm.masked.store.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file provides a LoopVectorizationPlanner class.
LLVM Basic Block Representation.
A parsed version of the target data layout string, and methods for querying it.
constexpr bool isVector() const
One or more elements.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
constexpr bool isScalar() const
Exactly one element.
bool isScalableVectorizationDisabled() const
Represents a single loop in the control flow graph.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
bool hasUsesOutsideReductionChain() const
Returns true if the reduction PHI has any uses outside the reduction chain.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool contains(ConstPtrType Ptr) const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVoidTy() const
Return true if this is 'void'.
FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF)
Returns a scalable VF to use for outer-loop vectorization if the target supports it and a fixed VF ot...
std::pair< unsigned, unsigned > getSmallestAndWidestTypes() const
bool supportsScalableVectors() const
bool runtimeChecksRequired()
Check whether vectorization would require runtime checks.
bool isLegalGatherOrScatter(Value *V, ElementCount VF) const
Returns true if the target machine can represent V as a masked gather or scatter operation.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue)
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
bool shouldConsiderRegPressureForVF(ElementCount VF) const
void collectElementTypesForWidening(const SmallPtrSetImpl< const Value * > *ValuesToIgnore=nullptr)
Collect element types in the loop that need widening.
bool isLegalMaskedLoadOrStore(Instruction *I, ElementCount VF) const
Returns true if the target machine supports masked loads or stores for I's data type and alignment.
void computeMinimalBitwidths()
Compute smallest bitwidth each instruction can be represented with.
LLVM Value Representation.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
constexpr bool isZero() const
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
RecurKind
These are the kinds of recurrences that we support.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
cl::opt< bool > PreferInLoopReductions
This struct is a compact representation of a valid (non-zero power of two) alignment.
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()