SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
127 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
131static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
140static cl::opt<bool> ShouldStartVectorizeHorAtStore(
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
145static cl::opt<bool> SplitAlternateInstructions(
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
150 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
165static cl::opt<unsigned> MinVectorRegSizeOption(
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
169static cl::opt<unsigned> RecursionMaxDepth(
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
173static cl::opt<unsigned> MinTreeSize(
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
179static cl::opt<int> LookAheadMaxDepth(
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead. Unlike the
186// similar limit for operand reordering, this one is used less frequently, so
187// the impact of a higher value is less noticeable.
188static cl::opt<int> RootLookAheadMaxDepth(
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
192static cl::opt<unsigned> MinProfitableStridedLoads(
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads that should be considered strided, "
195 "if the stride is > 1 or is a runtime value"));
196
197static cl::opt<unsigned> MaxProfitableLoadStride(
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
215static cl::opt<bool> VectorizeNonPowerOf2(
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
220static cl::opt<bool> VectorizeCopyableElements(
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit on the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important things to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
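/// For example, i32, float and pointer types are valid element types, while
/// x86_fp80 and ppc_fp128 are not.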
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
254 if (SLPReVec && isa<FixedVectorType>(Ty))
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand; for Cmp - the type of the compare
262/// operands; and for insertelement - the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
264static Type *getValueType(Value *V) {
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
276 assert(!isa<ScalableVectorType>(Ty) &&
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
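/// e.g., getWidenedType(i32, 4) is <4 x i32>; with REVEC, where the scalar
/// type may itself be a vector, getWidenedType(<2 x i32>, 4) is <8 x i32>.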
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
288
289/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
290/// which forms a type that \p TTI splits into whole vector types during
291/// legalization.
292static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
302
303/// Returns the number of elements of the given type \p Ty, not greater than \p
304/// Sz, which forms a type that \p TTI splits into whole vector types during
305/// legalization.
306static unsigned
307getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
324 // But the element has a different meaning for SLP (scalar) and REVEC
325 // (vector). We need to expand Mask into masks which shufflevector can use
326 // directly.
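// e.g., with VecTyNumElements = 2, the scalar mask <1, 0> expands to
// <2, 3, 0, 1>.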
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
335
336/// \returns the number of groups of shufflevectors.
337/// A group has the following features:
338/// 1. All of the values in a group are shufflevectors.
339/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
340/// 3. Together, the shufflevector masks use all of the elements of the source.
341/// e.g., it is 1 group (%0)
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// it is 2 groups (%3 and %4)
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// it is 0 groups
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
361 if (VL.empty())
362 return 0;
363 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
412static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
432 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
438static bool isVectorLikeInstWithConstOps(Value *V) {
439 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
440 !isa<ExtractValueInst, UndefValue>(V))
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
447 if (isa<ExtractElementInst>(V))
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
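/// e.g., getPartNumElems(7, 2) returns 4.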
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns correct remaining number of elements, considering total amount \p
461/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
462/// and current register (part) \p Part.
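/// e.g., with Size = 7 and PartNumElems = 4, part 0 holds 4 elements and
/// part 1 holds the remaining 3.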
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
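/// e.g., "Idx: 2, n=4 [  %mul = mul i32 %a, %b, ..]".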
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
482static bool allSameBlock(ArrayRef<Value *> VL) {
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
486 auto *I0 = cast<Instruction>(*It);
487 if (all_of(VL, isVectorLikeInstWithConstOps))
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions).
506static bool allConstant(ArrayRef<Value *> VL) {
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
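/// e.g., {%a, undef, %a} is a splat of %a, while {%a, %b, %a} and
/// {undef, undef} are not.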
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
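/// e.g., 'sub %x, %y' is treated as commutative when its only users are
/// 'icmp eq/ne (sub %x, %y), 0' or '@llvm.abs(sub %x, %y)', since swapping the
/// operands only negates the subtraction result there.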
540static bool isCommutative(Instruction *I, Value *ValWithUses) {
541 if (auto *Cmp = dyn_cast<CmpInst>(I))
542 return Cmp->isCommutative();
543 if (auto *BO = dyn_cast<BinaryOperator>(I))
544 return BO->isCommutative() ||
545 (BO->getOpcode() == Instruction::Sub &&
546 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
547 all_of(
548 ValWithUses->uses(),
549 [](const Use &U) {
550 // Commutative, if icmp eq/ne sub, 0
551 CmpPredicate Pred;
552 if (match(U.getUser(),
553 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
554 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
555 return true;
556 // Commutative, if abs(sub nsw, true) or abs(sub, false).
557 ConstantInt *Flag;
558 return match(U.getUser(),
559 m_Intrinsic<Intrinsic::abs>(
560 m_Specific(U.get()), m_ConstantInt(Flag))) &&
561 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
562 Flag->isOne());
563 })) ||
564 (BO->getOpcode() == Instruction::FSub &&
565 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
566 all_of(ValWithUses->uses(), [](const Use &U) {
567 return match(U.getUser(),
568 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
569 }));
570 return I->isCommutative();
571}
572
573/// This is a helper function to check whether \p I is commutative.
574/// This is a convenience wrapper that calls the two-parameter version of
575/// isCommutative with the same instruction for both parameters. This is
576/// the common case where the instruction being checked for commutativity
577/// is the same as the instruction whose uses are analyzed for special
578/// patterns (see the two-parameter version above for details).
579/// \param I The instruction to check for commutativity
580/// \returns true if the instruction is commutative, false otherwise
581static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
582
583/// \returns number of operands of \p I, considering commutativity. Returns 2
584/// for commutative intrinsics.
585/// \param I The instruction to check for commutativity
588 // IntrinsicInst::isCommutative returns true if swapping the first "two"
589 // arguments to the intrinsic produces the same result.
590 constexpr unsigned IntrinsicNumOperands = 2;
591 return IntrinsicNumOperands;
592 }
593 return I->getNumOperands();
594}
595
596template <typename T>
597static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
598 unsigned Offset) {
599 static_assert(std::is_same_v<T, InsertElementInst> ||
600 std::is_same_v<T, ExtractElementInst>,
601 "unsupported T");
602 int Index = Offset;
603 if (const auto *IE = dyn_cast<T>(Inst)) {
604 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
605 if (!VT)
606 return std::nullopt;
607 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
608 if (!CI)
609 return std::nullopt;
610 if (CI->getValue().uge(VT->getNumElements()))
611 return std::nullopt;
612 Index *= VT->getNumElements();
613 Index += CI->getZExtValue();
614 return Index;
615 }
616 return std::nullopt;
617}
618
619/// \returns inserting or extracting index of InsertElement, ExtractElement or
620/// InsertValue instruction, using Offset as base offset for index.
621/// \returns std::nullopt if the index is not an immediate.
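/// e.g., for 'insertelement <4 x i32> %v, i32 %x, i32 2' the returned index is
/// 2 (with the default base Offset of 0).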
622static std::optional<unsigned> getElementIndex(const Value *Inst,
623 unsigned Offset = 0) {
624 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
625 return Index;
626 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
627 return Index;
628
629 int Index = Offset;
630
631 const auto *IV = dyn_cast<InsertValueInst>(Inst);
632 if (!IV)
633 return std::nullopt;
634
635 Type *CurrentType = IV->getType();
636 for (unsigned I : IV->indices()) {
637 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
638 Index *= ST->getNumElements();
639 CurrentType = ST->getElementType(I);
640 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
641 Index *= AT->getNumElements();
642 CurrentType = AT->getElementType();
643 } else {
644 return std::nullopt;
645 }
646 Index += I;
647 }
648 return Index;
649}
650
651/// \returns true if all of the values in \p VL use the same opcode.
652/// For comparison instructions, also checks if predicates match.
653/// PoisonValues are considered matching.
654/// Interchangeable instructions are not considered.
655static bool allSameOpcode(ArrayRef<Value *> VL) {
656 auto *It = find_if(VL, IsaPred<Instruction>);
657 if (It == VL.end())
658 return true;
659 Instruction *MainOp = cast<Instruction>(*It);
660 unsigned Opcode = MainOp->getOpcode();
661 bool IsCmpOp = isa<CmpInst>(MainOp);
662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
663 : CmpInst::BAD_ICMP_PREDICATE;
664 return std::all_of(It, VL.end(), [&](Value *V) {
665 if (auto *CI = dyn_cast<CmpInst>(V))
666 return BasePred == CI->getPredicate();
667 if (auto *I = dyn_cast<Instruction>(V))
668 return I->getOpcode() == Opcode;
669 return isa<PoisonValue>(V);
670 });
671}
672
673namespace {
674/// Specifies the way the mask should be analyzed for undefs/poisonous elements
675/// in the shuffle mask.
676enum class UseMask {
677 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
678 ///< check for the mask elements for the first argument (mask
679 ///< indices are in range [0:VF)).
680 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
681 ///< for the mask elements for the second argument (mask indices
682 ///< are in range [VF:2*VF))
683 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
684 ///< future shuffle elements and mark them as ones as being used
685 ///< in future. Non-undef elements are considered as unused since
686 ///< they're already marked as used in the mask.
687};
688} // namespace
689
690/// Prepares a use bitset for the given mask either for the first argument or
691/// for the second.
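/// e.g., for VF = 4 and mask <0, 5, 1, poison>, FirstArg clears bits 0 and 1
/// of the result, while SecondArg clears bit 1 (element 5 reads lane 1 of the
/// second vector).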
692static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
693 UseMask MaskArg) {
694 SmallBitVector UseMask(VF, true);
695 for (auto [Idx, Value] : enumerate(Mask)) {
696 if (Value == PoisonMaskElem) {
697 if (MaskArg == UseMask::UndefsAsMask)
698 UseMask.reset(Idx);
699 continue;
700 }
701 if (MaskArg == UseMask::FirstArg && Value < VF)
702 UseMask.reset(Value);
703 else if (MaskArg == UseMask::SecondArg && Value >= VF)
704 UseMask.reset(Value - VF);
705 }
706 return UseMask;
707}
708
709/// Checks if the given value is actually an undefined constant vector.
710/// Also, if the \p UseMask is not empty, tries to check if the non-masked
711/// elements actually mask the insertelement buildvector, if any.
712template <bool IsPoisonOnly = false>
713static SmallBitVector isUndefVector(const Value *V,
714 const SmallBitVector &UseMask = {}) {
715 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
716 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
717 if (isa<T>(V))
718 return Res;
719 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
720 if (!VecTy)
721 return Res.reset();
722 auto *C = dyn_cast<Constant>(V);
723 if (!C) {
724 if (!UseMask.empty()) {
725 const Value *Base = V;
726 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
727 Base = II->getOperand(0);
728 if (isa<T>(II->getOperand(1)))
729 continue;
730 std::optional<unsigned> Idx = getElementIndex(II);
731 if (!Idx) {
732 Res.reset();
733 return Res;
734 }
735 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
736 Res.reset(*Idx);
737 }
738 // TODO: Add analysis for shuffles here too.
739 if (V == Base) {
740 Res.reset();
741 } else {
742 SmallBitVector SubMask(UseMask.size(), false);
743 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
744 }
745 } else {
746 Res.reset();
747 }
748 return Res;
749 }
750 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
751 if (Constant *Elem = C->getAggregateElement(I))
752 if (!isa<T>(Elem) &&
753 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
754 Res.reset(I);
755 }
756 return Res;
757}
758
759/// Checks if the vector of instructions can be represented as a shuffle, like:
760/// %x0 = extractelement <4 x i8> %x, i32 0
761/// %x3 = extractelement <4 x i8> %x, i32 3
762/// %y1 = extractelement <4 x i8> %y, i32 1
763/// %y2 = extractelement <4 x i8> %y, i32 2
764/// %x0x0 = mul i8 %x0, %x0
765/// %x3x3 = mul i8 %x3, %x3
766/// %y1y1 = mul i8 %y1, %y1
767/// %y2y2 = mul i8 %y2, %y2
768/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
769/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
770/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
771/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
772/// ret <4 x i8> %ins4
773/// can be transformed into:
774/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
775/// i32 6>
776/// %2 = mul <4 x i8> %1, %1
777/// ret <4 x i8> %2
778/// Mask will return the Shuffle Mask equivalent to the extracted elements.
779/// TODO: Can we split off and reuse the shuffle mask detection from
780/// ShuffleVectorInst/getShuffleCost?
781static std::optional<TargetTransformInfo::ShuffleKind>
782isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
783 AssumptionCache *AC) {
784 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
785 if (It == VL.end())
786 return std::nullopt;
787 unsigned Size =
788 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
789 auto *EI = dyn_cast<ExtractElementInst>(V);
790 if (!EI)
791 return S;
792 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
793 if (!VTy)
794 return S;
795 return std::max(S, VTy->getNumElements());
796 });
797
798 Value *Vec1 = nullptr;
799 Value *Vec2 = nullptr;
800 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
801 auto *EE = dyn_cast<ExtractElementInst>(V);
802 if (!EE)
803 return false;
804 Value *Vec = EE->getVectorOperand();
805 if (isa<UndefValue>(Vec))
806 return false;
807 return isGuaranteedNotToBePoison(Vec, AC);
808 });
809 enum ShuffleMode { Unknown, Select, Permute };
810 ShuffleMode CommonShuffleMode = Unknown;
811 Mask.assign(VL.size(), PoisonMaskElem);
812 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
813 // Undef can be represented as an undef element in a vector.
814 if (isa<UndefValue>(VL[I]))
815 continue;
816 auto *EI = cast<ExtractElementInst>(VL[I]);
817 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
818 return std::nullopt;
819 auto *Vec = EI->getVectorOperand();
820 // We can extractelement from undef or poison vector.
821 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
822 continue;
823 // All vector operands must have the same number of vector elements.
824 if (isa<UndefValue>(Vec)) {
825 Mask[I] = I;
826 } else {
827 if (isa<UndefValue>(EI->getIndexOperand()))
828 continue;
829 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
830 if (!Idx)
831 return std::nullopt;
832 // Undefined behavior if Idx is negative or >= Size.
833 if (Idx->getValue().uge(Size))
834 continue;
835 unsigned IntIdx = Idx->getValue().getZExtValue();
836 Mask[I] = IntIdx;
837 }
838 if (isUndefVector(Vec).all() && HasNonUndefVec)
839 continue;
840 // For correct shuffling we have to have at most 2 different vector operands
841 // in all extractelement instructions.
842 if (!Vec1 || Vec1 == Vec) {
843 Vec1 = Vec;
844 } else if (!Vec2 || Vec2 == Vec) {
845 Vec2 = Vec;
846 Mask[I] += Size;
847 } else {
848 return std::nullopt;
849 }
850 if (CommonShuffleMode == Permute)
851 continue;
852 // If the extract index is not the same as the operation number, it is a
853 // permutation.
854 if (Mask[I] % Size != I) {
855 CommonShuffleMode = Permute;
856 continue;
857 }
858 CommonShuffleMode = Select;
859 }
860 // If we're not crossing lanes in different vectors, consider it as blending.
861 if (CommonShuffleMode == Select && Vec2)
862 return TargetTransformInfo::SK_Select;
863 // If Vec2 was never used, we have a permutation of a single vector, otherwise
864 // we have permutation of 2 vectors.
865 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
866 : TargetTransformInfo::SK_PermuteSingleSrc;
867}
868
869/// \returns True if Extract{Value,Element} instruction extracts element Idx.
870static std::optional<unsigned> getExtractIndex(const Instruction *E) {
871 unsigned Opcode = E->getOpcode();
872 assert((Opcode == Instruction::ExtractElement ||
873 Opcode == Instruction::ExtractValue) &&
874 "Expected extractelement or extractvalue instruction.");
875 if (Opcode == Instruction::ExtractElement) {
876 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
877 if (!CI)
878 return std::nullopt;
879 return CI->getZExtValue();
880 }
881 auto *EI = cast<ExtractValueInst>(E);
882 if (EI->getNumIndices() != 1)
883 return std::nullopt;
884 return *EI->idx_begin();
885}
886
887namespace llvm {
888/// Checks if the provided value does not require scheduling. It does not
889/// require scheduling if this is not an instruction or it is an instruction
890/// that does not read/write memory and all operands are either not instructions
891/// or phi nodes or instructions from different blocks.
892static bool areAllOperandsNonInsts(Value *V);
893/// Checks if the provided value does not require scheduling. It does not
894/// require scheduling if this is not an instruction or it is an instruction
895/// that does not read/write memory and all users are phi nodes or instructions
896/// from the different blocks.
897static bool isUsedOutsideBlock(Value *V);
898/// Checks if the specified value does not require scheduling. It does not
899/// require scheduling if all operands and all users do not need to be scheduled
900/// in the current basic block.
901static bool doesNotNeedToBeScheduled(Value *V);
902} // namespace llvm
903
904namespace {
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914/// Helper class that determines whether VL can use the same opcode.
915/// Alternate instructions are supported. In addition, it supports
916/// interchangeable instructions. An interchangeable instruction is one that can
917/// be converted to another instruction with the same semantics. For example, x << 1 is
918/// equal to x * 2. x * 1 is equal to x | 0.
919class BinOpSameOpcodeHelper {
920 using MaskType = std::uint_fast16_t;
921 /// Sort SupportedOp because it is used by binary_search.
922 constexpr static std::initializer_list<unsigned> SupportedOp = {
923 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
924 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
925 enum : MaskType {
926 ShlBIT = 0b1,
927 AShrBIT = 0b10,
928 MulBIT = 0b100,
929 AddBIT = 0b1000,
930 SubBIT = 0b10000,
931 AndBIT = 0b100000,
932 OrBIT = 0b1000000,
933 XorBIT = 0b10000000,
934 MainOpBIT = 0b100000000,
936 };
937 /// Return a non-nullptr if either operand of I is a ConstantInt.
938 /// The second return value represents the operand position. We check the
939 /// right-hand side first (1). If the right hand side is not a ConstantInt and
940 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
941 /// side (0).
942 static std::pair<ConstantInt *, unsigned>
943 isBinOpWithConstantInt(const Instruction *I) {
944 unsigned Opcode = I->getOpcode();
945 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
946 (void)SupportedOp;
947 auto *BinOp = cast<BinaryOperator>(I);
948 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
949 return {CI, 1};
950 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
951 Opcode == Instruction::AShr)
952 return {nullptr, 0};
953 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
954 return {CI, 0};
955 return {nullptr, 0};
956 }
957 struct InterchangeableInfo {
958 const Instruction *I = nullptr;
959 /// Each set bit represents an opcode that MainOp can be converted to.
960 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
961 MulBIT | AShrBIT | ShlBIT;
962 /// We cannot create an interchangeable instruction that does not exist in
963 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
964 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
965 /// 1]. SeenBefore is used to know what operations have been seen before.
966 MaskType SeenBefore = 0;
967 InterchangeableInfo(const Instruction *I) : I(I) {}
968 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
969 /// instruction. Directly setting the mask will destroy the mask state,
970 /// preventing us from determining which instruction it should convert to.
971 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
972 if (Mask & InterchangeableMask) {
973 SeenBefore |= OpcodeInMaskForm;
974 Mask &= InterchangeableMask;
975 return true;
976 }
977 return false;
978 }
979 bool equal(unsigned Opcode) {
980 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
981 }
982 unsigned getOpcode() const {
983 MaskType Candidate = Mask & SeenBefore;
984 if (Candidate & MainOpBIT)
985 return I->getOpcode();
986 if (Candidate & ShlBIT)
987 return Instruction::Shl;
988 if (Candidate & AShrBIT)
989 return Instruction::AShr;
990 if (Candidate & MulBIT)
991 return Instruction::Mul;
992 if (Candidate & AddBIT)
993 return Instruction::Add;
994 if (Candidate & SubBIT)
995 return Instruction::Sub;
996 if (Candidate & AndBIT)
997 return Instruction::And;
998 if (Candidate & OrBIT)
999 return Instruction::Or;
1000 if (Candidate & XorBIT)
1001 return Instruction::Xor;
1002 llvm_unreachable("Cannot find interchangeable instruction.");
1003 }
1004
1005 /// Return true if the instruction can be converted to \p Opcode.
1006 bool hasCandidateOpcode(unsigned Opcode) const {
1007 MaskType Candidate = Mask & SeenBefore;
1008 switch (Opcode) {
1009 case Instruction::Shl:
1010 return Candidate & ShlBIT;
1011 case Instruction::AShr:
1012 return Candidate & AShrBIT;
1013 case Instruction::Mul:
1014 return Candidate & MulBIT;
1015 case Instruction::Add:
1016 return Candidate & AddBIT;
1017 case Instruction::Sub:
1018 return Candidate & SubBIT;
1019 case Instruction::And:
1020 return Candidate & AndBIT;
1021 case Instruction::Or:
1022 return Candidate & OrBIT;
1023 case Instruction::Xor:
1024 return Candidate & XorBIT;
1025 case Instruction::LShr:
1026 case Instruction::FAdd:
1027 case Instruction::FSub:
1028 case Instruction::FMul:
1029 case Instruction::SDiv:
1030 case Instruction::UDiv:
1031 case Instruction::FDiv:
1032 case Instruction::SRem:
1033 case Instruction::URem:
1034 case Instruction::FRem:
1035 return false;
1036 default:
1037 break;
1038 }
1039 llvm_unreachable("Cannot find interchangeable instruction.");
1040 }
1041
1042 SmallVector<Value *> getOperand(const Instruction *To) const {
1043 unsigned ToOpcode = To->getOpcode();
1044 unsigned FromOpcode = I->getOpcode();
1045 if (FromOpcode == ToOpcode)
1046 return SmallVector<Value *>(I->operands());
1047 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1048 auto [CI, Pos] = isBinOpWithConstantInt(I);
1049 const APInt &FromCIValue = CI->getValue();
1050 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1051 APInt ToCIValue;
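// Compute the constant operand for the target opcode, e.g. 'x << 3' becomes
// 'x * 8', 'x * 1' becomes 'x & -1', and 'x + 5' becomes 'x - -5'.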
1052 switch (FromOpcode) {
1053 case Instruction::Shl:
1054 if (ToOpcode == Instruction::Mul) {
1055 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1056 FromCIValue.getZExtValue());
1057 } else {
1058 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1059 ToCIValue = ToOpcode == Instruction::And
1060 ? APInt::getAllOnes(FromCIValueBitWidth)
1061 : APInt::getZero(FromCIValueBitWidth);
1062 }
1063 break;
1064 case Instruction::Mul:
1065 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1066 if (ToOpcode == Instruction::Shl) {
1067 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1068 } else {
1069 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1070 ToCIValue = ToOpcode == Instruction::And
1071 ? APInt::getAllOnes(FromCIValueBitWidth)
1072 : APInt::getZero(FromCIValueBitWidth);
1073 }
1074 break;
1075 case Instruction::Add:
1076 case Instruction::Sub:
1077 if (FromCIValue.isZero()) {
1078 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1079 } else {
1080 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1081 "Cannot convert the instruction.");
1082 ToCIValue = FromCIValue;
1083 ToCIValue.negate();
1084 }
1085 break;
1086 case Instruction::And:
1087 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1088 ToCIValue = ToOpcode == Instruction::Mul
1089 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1090 : APInt::getZero(FromCIValueBitWidth);
1091 break;
1092 default:
1093 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1094 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1095 break;
1096 }
1097 Value *LHS = I->getOperand(1 - Pos);
1098 Constant *RHS =
1099 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1100 // constant + x cannot be -constant - x
1101 // instead, it should be x - -constant
1102 if (Pos == 1 ||
1103 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1104 FromOpcode == Instruction::Xor) &&
1105 ToOpcode == Instruction::Sub))
1106 return SmallVector<Value *>({LHS, RHS});
1107 return SmallVector<Value *>({RHS, LHS});
1108 }
1109 };
1110 InterchangeableInfo MainOp;
1111 InterchangeableInfo AltOp;
1112 bool isValidForAlternation(const Instruction *I) const {
1113 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1114 ::isValidForAlternation(I->getOpcode());
1115 }
1116 bool initializeAltOp(const Instruction *I) {
1117 if (AltOp.I)
1118 return true;
1119 if (!isValidForAlternation(I))
1120 return false;
1121 AltOp.I = I;
1122 return true;
1123 }
1124
1125public:
1126 BinOpSameOpcodeHelper(const Instruction *MainOp,
1127 const Instruction *AltOp = nullptr)
1128 : MainOp(MainOp), AltOp(AltOp) {
1129 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1130 }
1131 bool add(const Instruction *I) {
1132 assert(isa<BinaryOperator>(I) &&
1133 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1134 unsigned Opcode = I->getOpcode();
1135 MaskType OpcodeInMaskForm;
1136 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1137 switch (Opcode) {
1138 case Instruction::Shl:
1139 OpcodeInMaskForm = ShlBIT;
1140 break;
1141 case Instruction::AShr:
1142 OpcodeInMaskForm = AShrBIT;
1143 break;
1144 case Instruction::Mul:
1145 OpcodeInMaskForm = MulBIT;
1146 break;
1147 case Instruction::Add:
1148 OpcodeInMaskForm = AddBIT;
1149 break;
1150 case Instruction::Sub:
1151 OpcodeInMaskForm = SubBIT;
1152 break;
1153 case Instruction::And:
1154 OpcodeInMaskForm = AndBIT;
1155 break;
1156 case Instruction::Or:
1157 OpcodeInMaskForm = OrBIT;
1158 break;
1159 case Instruction::Xor:
1160 OpcodeInMaskForm = XorBIT;
1161 break;
1162 default:
1163 return MainOp.equal(Opcode) ||
1164 (initializeAltOp(I) && AltOp.equal(Opcode));
1165 }
1166 MaskType InterchangeableMask = OpcodeInMaskForm;
1167 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1168 if (CI) {
1169 constexpr MaskType CanBeAll =
1170 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1171 const APInt &CIValue = CI->getValue();
1172 switch (Opcode) {
1173 case Instruction::Shl:
1174 if (CIValue.ult(CIValue.getBitWidth()))
1175 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1176 break;
1177 case Instruction::Mul:
1178 if (CIValue.isOne()) {
1179 InterchangeableMask = CanBeAll;
1180 break;
1181 }
1182 if (CIValue.isPowerOf2())
1183 InterchangeableMask = MulBIT | ShlBIT;
1184 break;
1185 case Instruction::Add:
1186 case Instruction::Sub:
1187 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1188 break;
1189 case Instruction::And:
1190 if (CIValue.isAllOnes())
1191 InterchangeableMask = CanBeAll;
1192 break;
1193 case Instruction::Xor:
1194 if (CIValue.isZero())
1195 InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
1196 break;
1197 default:
1198 if (CIValue.isZero())
1199 InterchangeableMask = CanBeAll;
1200 break;
1201 }
1202 }
1203 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1204 (initializeAltOp(I) &&
1205 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1206 }
1207 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1208 /// Checks if the list of potential opcodes includes \p Opcode.
1209 bool hasCandidateOpcode(unsigned Opcode) const {
1210 return MainOp.hasCandidateOpcode(Opcode);
1211 }
1212 bool hasAltOp() const { return AltOp.I; }
1213 unsigned getAltOpcode() const {
1214 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1215 }
1216 SmallVector<Value *> getOperand(const Instruction *I) const {
1217 return MainOp.getOperand(I);
1218 }
1219};
1220
1221/// Main data required for vectorization of instructions.
1222class InstructionsState {
1223 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1224 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1225 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1226 /// isAltShuffle).
1227 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1228 /// from getMainAltOpsNoStateVL.
1229 /// For those InstructionsState that use alternate instructions, the resulting
1230 /// vectorized output ultimately comes from a shufflevector. For example,
1231 /// given a vector list (VL):
1232 /// VL[0] = add i32 a, e
1233 /// VL[1] = sub i32 b, f
1234 /// VL[2] = add i32 c, g
1235 /// VL[3] = sub i32 d, h
1236 /// The vectorized result would be:
1237 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1238 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1239 /// result = shufflevector <4 x i32> intermediated_0,
1240 /// <4 x i32> intermediated_1,
1241 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1242 /// Since shufflevector is used in the final result, when calculating the cost
1243 /// (getEntryCost), we must account for the usage of shufflevector in
1244 /// GetVectorCost.
1245 Instruction *MainOp = nullptr;
1246 Instruction *AltOp = nullptr;
1247 /// Whether the instruction state represents copyable instructions.
1248 bool HasCopyables = false;
1249
1250public:
1251 Instruction *getMainOp() const {
1252 assert(valid() && "InstructionsState is invalid.");
1253 return MainOp;
1254 }
1255
1256 Instruction *getAltOp() const {
1257 assert(valid() && "InstructionsState is invalid.");
1258 return AltOp;
1259 }
1260
1261 /// The main/alternate opcodes for the list of instructions.
1262 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1263
1264 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1265
1266 /// Some of the instructions in the list have alternate opcodes.
1267 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1268
1269 /// Checks if the instruction matches either the main or alternate opcode.
1270 /// \returns
1271 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1272 /// to it
1273 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1274 /// it
1275 /// - nullptr if \param I cannot be matched or converted to either opcode
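/// e.g., with MainOp 'add %a, %b' and AltOp 'sub %c, %d', an instruction
/// 'or %x, 0' is matched to MainOp because it can be rewritten as 'add %x, 0'.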
1276 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1277 assert(MainOp && "MainOp cannot be nullptr.");
1278 if (I->getOpcode() == MainOp->getOpcode())
1279 return MainOp;
1280 // Prefer AltOp instead of interchangeable instruction of MainOp.
1281 assert(AltOp && "AltOp cannot be nullptr.");
1282 if (I->getOpcode() == AltOp->getOpcode())
1283 return AltOp;
1284 if (!I->isBinaryOp())
1285 return nullptr;
1286 BinOpSameOpcodeHelper Converter(MainOp);
1287 if (!Converter.add(I) || !Converter.add(MainOp))
1288 return nullptr;
1289 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1290 BinOpSameOpcodeHelper AltConverter(AltOp);
1291 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1292 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1293 return AltOp;
1294 }
1295 if (Converter.hasAltOp() && !isAltShuffle())
1296 return nullptr;
1297 return Converter.hasAltOp() ? AltOp : MainOp;
1298 }
1299
1300 /// Checks if main/alt instructions are shift operations.
1301 bool isShiftOp() const {
1302 return getMainOp()->isShift() && getAltOp()->isShift();
1303 }
1304
1305 /// Checks if main/alt instructions are bitwise logic operations.
1306 bool isBitwiseLogicOp() const {
1307 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1308 }
1309
1310 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1311 bool isMulDivLikeOp() const {
1312 constexpr std::array<unsigned, 8> MulDiv = {
1313 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1314 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1315 Instruction::URem, Instruction::FRem};
1316 return is_contained(MulDiv, getOpcode()) &&
1317 is_contained(MulDiv, getAltOpcode());
1318 }
1319
1320 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1321 bool isAddSubLikeOp() const {
1322 constexpr std::array<unsigned, 4> AddSub = {
1323 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1324 Instruction::FSub};
1325 return is_contained(AddSub, getOpcode()) &&
1326 is_contained(AddSub, getAltOpcode());
1327 }
1328
1329 /// Checks if main/alt instructions are cmp operations.
1330 bool isCmpOp() const {
1331 return (getOpcode() == Instruction::ICmp ||
1332 getOpcode() == Instruction::FCmp) &&
1333 getAltOpcode() == getOpcode();
1334 }
1335
1336 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1337 bool valid() const { return MainOp && AltOp; }
1338
1339 explicit operator bool() const { return valid(); }
1340
1341 InstructionsState() = delete;
1342 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1343 bool HasCopyables = false)
1344 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1345 static InstructionsState invalid() { return {nullptr, nullptr}; }
1346
1347 /// Checks if the value is a copyable element.
1348 bool isCopyableElement(Value *V) const {
1349 assert(valid() && "InstructionsState is invalid.");
1350 if (!HasCopyables)
1351 return false;
1352 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1353 return false;
1354 auto *I = dyn_cast<Instruction>(V);
1355 if (!I)
1356 return !isa<PoisonValue>(V);
1357 if (I->getParent() != MainOp->getParent() &&
1360 return true;
1361 if (I->getOpcode() == MainOp->getOpcode())
1362 return false;
1363 if (!I->isBinaryOp())
1364 return true;
1365 BinOpSameOpcodeHelper Converter(MainOp);
1366 return !Converter.add(I) || !Converter.add(MainOp) ||
1367 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1368 }
1369
1370 /// Checks if the value is non-schedulable.
1371 bool isNonSchedulable(Value *V) const {
1372 assert(valid() && "InstructionsState is invalid.");
1373 auto *I = dyn_cast<Instruction>(V);
1374 if (!HasCopyables)
1375 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1376 doesNotNeedToBeScheduled(V);
1377 // MainOp for copyables is always schedulable, to correctly identify
1378 // non-schedulable copyables.
1379 if (getMainOp() == V)
1380 return false;
1381 if (isCopyableElement(V)) {
1382 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1383 auto *I = dyn_cast<Instruction>(V);
1384 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1386 // If the copyable instruction comes after MainOp
1387 // (non-schedulable, but used in the block) - cannot vectorize
1388 // it, will possibly generate use before def.
1389 !MainOp->comesBefore(I));
1390 };
1391
1392 return IsNonSchedulableCopyableElement(V);
1393 }
1394 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1395 doesNotNeedToBeScheduled(V);
1396 }
1397
1398 /// Checks if the state represents copyable instructions.
1399 bool areInstructionsWithCopyableElements() const {
1400 assert(valid() && "InstructionsState is invalid.");
1401 return HasCopyables;
1402 }
1403};
1404
1405std::pair<Instruction *, SmallVector<Value *>>
1406convertTo(Instruction *I, const InstructionsState &S) {
1407 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1408 assert(SelectedOp && "Cannot convert the instruction.");
1409 if (I->isBinaryOp()) {
1410 BinOpSameOpcodeHelper Converter(I);
1411 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1412 }
1413 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1414}
1415
1416} // end anonymous namespace
1417
1418static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1419 const TargetLibraryInfo &TLI);
1420
1421/// Find an instruction with a specific opcode in VL.
1422/// \param VL Array of values to search through. Must contain only Instructions
1423/// and PoisonValues.
1424/// \param Opcode The instruction opcode to search for
1425/// \returns
1426/// - The first instruction found with matching opcode
1427/// - nullptr if no matching instruction is found
1428static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1429 unsigned Opcode) {
1430 for (Value *V : VL) {
1431 if (isa<PoisonValue>(V))
1432 continue;
1433 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1434 auto *Inst = cast<Instruction>(V);
1435 if (Inst->getOpcode() == Opcode)
1436 return Inst;
1437 }
1438 return nullptr;
1439}
1440
1441/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1442/// compatible instructions or constants, or just some other regular values.
1443static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1444 Value *Op1, const TargetLibraryInfo &TLI) {
1445 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1446 (isConstant(BaseOp1) && isConstant(Op1)) ||
1447 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1448 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1449 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1450 getSameOpcode({BaseOp0, Op0}, TLI) ||
1451 getSameOpcode({BaseOp1, Op1}, TLI);
1452}
1453
1454/// \returns true if a compare instruction \p CI has similar "look" and
1455/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1456/// swapped, false otherwise.
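/// e.g., 'icmp sgt %a, %b' is matched by 'icmp sgt %a, %b' and by
/// 'icmp slt %b, %a' (operands and predicate swapped).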
1457static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1458 const TargetLibraryInfo &TLI) {
1459 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1460 "Assessing comparisons of different types?");
1461 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1462 CmpInst::Predicate Pred = CI->getPredicate();
1463 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1464
1465 Value *BaseOp0 = BaseCI->getOperand(0);
1466 Value *BaseOp1 = BaseCI->getOperand(1);
1467 Value *Op0 = CI->getOperand(0);
1468 Value *Op1 = CI->getOperand(1);
1469
1470 return (BasePred == Pred &&
1471 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1472 (BasePred == SwappedPred &&
1473 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1474}
1475
1476/// \returns analysis of the Instructions in \p VL described in
1477/// InstructionsState, i.e. the opcode with which we suppose the whole list
1478/// could be vectorized even if its structure is diverse.
1479static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1480 const TargetLibraryInfo &TLI) {
1481 // Make sure these are all Instructions.
1482 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1483 return InstructionsState::invalid();
1484
1485 auto *It = find_if(VL, IsaPred<Instruction>);
1486 if (It == VL.end())
1487 return InstructionsState::invalid();
1488
1489 Instruction *MainOp = cast<Instruction>(*It);
1490 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1491 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1492 (VL.size() == 2 && InstCnt < 2))
1493 return InstructionsState::invalid();
1494
1495 bool IsCastOp = isa<CastInst>(MainOp);
1496 bool IsBinOp = isa<BinaryOperator>(MainOp);
1497 bool IsCmpOp = isa<CmpInst>(MainOp);
1498 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1499 : CmpInst::BAD_ICMP_PREDICATE;
1500 Instruction *AltOp = MainOp;
1501 unsigned Opcode = MainOp->getOpcode();
1502 unsigned AltOpcode = Opcode;
1503
1504 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1505 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1506 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1507 UniquePreds.insert(BasePred);
1508 UniqueNonSwappedPreds.insert(BasePred);
1509 for (Value *V : VL) {
1510 auto *I = dyn_cast<CmpInst>(V);
1511 if (!I)
1512 return false;
1513 CmpInst::Predicate CurrentPred = I->getPredicate();
1514 CmpInst::Predicate SwappedCurrentPred =
1515 CmpInst::getSwappedPredicate(CurrentPred);
1516 UniqueNonSwappedPreds.insert(CurrentPred);
1517 if (!UniquePreds.contains(CurrentPred) &&
1518 !UniquePreds.contains(SwappedCurrentPred))
1519 UniquePreds.insert(CurrentPred);
1520 }
1521 // If the total number of predicates is > 2, but only 2 remain once swapped
1522 // predicates are treated as compatible, consider the swappable predicates
1523 // to be compatible opcodes, not alternates.
1524 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1525 }();
1526 // Check for one alternate opcode from another BinaryOperator.
1527 // TODO - generalize to support all operators (types, calls etc.).
1528 Intrinsic::ID BaseID = 0;
1529 SmallVector<VFInfo> BaseMappings;
1530 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1531 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1532 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1533 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1534 return InstructionsState::invalid();
1535 }
1536 bool AnyPoison = InstCnt != VL.size();
1537 // Check MainOp too to be sure that it matches the requirements for the
1538 // instructions.
1539 for (Value *V : iterator_range(It, VL.end())) {
1540 auto *I = dyn_cast<Instruction>(V);
1541 if (!I)
1542 continue;
1543
1544 // Cannot combine poison and divisions.
1545 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1546 // intrinsics/functions only.
1547 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1548 return InstructionsState::invalid();
1549 unsigned InstOpcode = I->getOpcode();
1550 if (IsBinOp && isa<BinaryOperator>(I)) {
1551 if (BinOpHelper.add(I))
1552 continue;
1553 } else if (IsCastOp && isa<CastInst>(I)) {
1554 Value *Op0 = MainOp->getOperand(0);
1555 Type *Ty0 = Op0->getType();
1556 Value *Op1 = I->getOperand(0);
1557 Type *Ty1 = Op1->getType();
1558 if (Ty0 == Ty1) {
1559 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1560 continue;
1561 if (Opcode == AltOpcode) {
1562 assert(isValidForAlternation(Opcode) &&
1563 isValidForAlternation(InstOpcode) &&
1564 "Cast isn't safe for alternation, logic needs to be updated!");
1565 AltOpcode = InstOpcode;
1566 AltOp = I;
1567 continue;
1568 }
1569 }
1570 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1571 auto *BaseInst = cast<CmpInst>(MainOp);
1572 Type *Ty0 = BaseInst->getOperand(0)->getType();
1573 Type *Ty1 = Inst->getOperand(0)->getType();
1574 if (Ty0 == Ty1) {
1575 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1576 assert(InstOpcode == AltOpcode &&
1577 "Alternate instructions are only supported by BinaryOperator "
1578 "and CastInst.");
1579 // Check for compatible operands. If the corresponding operands are not
1580 // compatible - need to perform alternate vectorization.
1581 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1582 CmpInst::Predicate SwappedCurrentPred =
1583 CmpInst::getSwappedPredicate(CurrentPred);
1584
1585 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1586 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1587 continue;
1588
1589 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1590 continue;
1591 auto *AltInst = cast<CmpInst>(AltOp);
1592 if (MainOp != AltOp) {
1593 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1594 continue;
1595 } else if (BasePred != CurrentPred) {
1596 assert(
1597 isValidForAlternation(InstOpcode) &&
1598 "CmpInst isn't safe for alternation, logic needs to be updated!");
1599 AltOp = I;
1600 continue;
1601 }
1602 CmpInst::Predicate AltPred = AltInst->getPredicate();
1603 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1604 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1605 continue;
1606 }
1607 } else if (InstOpcode == Opcode) {
1608 assert(InstOpcode == AltOpcode &&
1609 "Alternate instructions are only supported by BinaryOperator and "
1610 "CastInst.");
1611 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1612 if (Gep->getNumOperands() != 2 ||
1613 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1614 return InstructionsState::invalid();
1615 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1616 if (!isVectorLikeInstWithConstOps(EI))
1617 return InstructionsState::invalid();
1618 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1619 auto *BaseLI = cast<LoadInst>(MainOp);
1620 if (!LI->isSimple() || !BaseLI->isSimple())
1621 return InstructionsState::invalid();
1622 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1623 auto *CallBase = cast<CallInst>(MainOp);
1624 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1625 return InstructionsState::invalid();
1626 if (Call->hasOperandBundles() &&
1627 (!CallBase->hasOperandBundles() ||
1628 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1629 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1630 CallBase->op_begin() +
1631 CallBase->getBundleOperandsStartIndex())))
1632 return InstructionsState::invalid();
1633 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1634 if (ID != BaseID)
1635 return InstructionsState::invalid();
1636 if (!ID) {
1637 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1638 if (Mappings.size() != BaseMappings.size() ||
1639 Mappings.front().ISA != BaseMappings.front().ISA ||
1640 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1641 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1642 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1643 Mappings.front().Shape.Parameters !=
1644 BaseMappings.front().Shape.Parameters)
1645 return InstructionsState::invalid();
1646 }
1647 }
1648 continue;
1649 }
1650 return InstructionsState::invalid();
1651 }
1652
1653 if (IsBinOp) {
1654 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1655 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1656 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1657 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1658 }
1659 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1660 "Incorrect implementation of allSameOpcode.");
1661 InstructionsState S(MainOp, AltOp);
1662 assert(all_of(VL,
1663 [&](Value *V) {
1664 return isa<PoisonValue>(V) ||
1665 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1666 }) &&
1667 "Invalid InstructionsState.");
1668 return S;
1669}
1670
1671/// \returns true if all of the values in \p VL have the same type or false
1672/// otherwise.
1673 static bool allSameType(ArrayRef<Value *> VL) {
1674 Type *Ty = VL.consume_front()->getType();
1675 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1676}
1677
1678/// \returns True if in-tree use also needs extract. This refers to
1679/// possible scalar operand in vectorized instruction.
1680static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1681 TargetLibraryInfo *TLI,
1682 const TargetTransformInfo *TTI) {
1683 if (!UserInst)
1684 return false;
1685 unsigned Opcode = UserInst->getOpcode();
1686 switch (Opcode) {
1687 case Instruction::Load: {
1688 LoadInst *LI = cast<LoadInst>(UserInst);
1689 return (LI->getPointerOperand() == Scalar);
1690 }
1691 case Instruction::Store: {
1692 StoreInst *SI = cast<StoreInst>(UserInst);
1693 return (SI->getPointerOperand() == Scalar);
1694 }
1695 case Instruction::Call: {
1696 CallInst *CI = cast<CallInst>(UserInst);
1697 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1698 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1699 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1700 Arg.value().get() == Scalar;
1701 });
1702 }
1703 default:
1704 return false;
1705 }
1706}
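// Illustrative note (not part of the original source): for example, when the
// in-tree user is a StoreInst and Scalar is its pointer operand, the pointer
// remains a scalar operand of the vectorized store, so the scalar value still
// has to be extracted from the vector.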
1707
1708 /// \returns the AA location that is being accessed by the instruction.
1709 static MemoryLocation getLocation(Instruction *I) {
1710 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1711 return MemoryLocation::get(SI);
1712 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1713 return MemoryLocation::get(LI);
1714 return MemoryLocation();
1715}
1716
1717/// \returns True if the instruction is not a volatile or atomic load/store.
1718static bool isSimple(Instruction *I) {
1719 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1720 return LI->isSimple();
1721 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1722 return SI->isSimple();
1723 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1724 return !MI->isVolatile();
1725 return true;
1726}
1727
1728/// Shuffles \p Mask in accordance with the given \p SubMask.
1729/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1730/// one but two input vectors.
1731static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1732 bool ExtendingManyInputs = false) {
1733 if (SubMask.empty())
1734 return;
1735 assert(
1736 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1737 // Check if input scalars were extended to match the size of other node.
1738 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1739 "SubMask with many inputs support must be larger than the mask.");
1740 if (Mask.empty()) {
1741 Mask.append(SubMask.begin(), SubMask.end());
1742 return;
1743 }
1744 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1745 int TermValue = std::min(Mask.size(), SubMask.size());
1746 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1747 if (SubMask[I] == PoisonMaskElem ||
1748 (!ExtendingManyInputs &&
1749 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1750 continue;
1751 NewMask[I] = Mask[SubMask[I]];
1752 }
1753 Mask.swap(NewMask);
1754}
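// Illustrative example (not part of the original source), assuming
// ExtendingManyInputs is false: with Mask = {2, 0, 1} and
// SubMask = {1, 2, PoisonMaskElem}, the loop computes
// NewMask[I] = Mask[SubMask[I]] for the non-poison positions, so the
// resulting Mask becomes {0, 1, PoisonMaskElem}.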
1755
1756 /// Order may have elements assigned a special value (size) which is out of
1757 /// bounds. Such indices only appear in positions which correspond to undef
1758 /// values (see canReuseExtract for details) and are used to prevent undef
1759 /// values from affecting the ordering of operands.
1760 /// The first loop below simply finds all unused indices and then the next loop
1761 /// nest assigns these indices to the undef value positions.
1762 /// In the example below Order has two undef positions, which are assigned the
1763 /// values 3 and 7 respectively:
1764/// before: 6 9 5 4 9 2 1 0
1765/// after: 6 3 5 4 7 2 1 0
1766 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1767 const size_t Sz = Order.size();
1768 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1769 SmallBitVector MaskedIndices(Sz);
1770 for (unsigned I = 0; I < Sz; ++I) {
1771 if (Order[I] < Sz)
1772 UnusedIndices.reset(Order[I]);
1773 else
1774 MaskedIndices.set(I);
1775 }
1776 if (MaskedIndices.none())
1777 return;
1778 assert(UnusedIndices.count() == MaskedIndices.count() &&
1779 "Non-synced masked/available indices.");
1780 int Idx = UnusedIndices.find_first();
1781 int MIdx = MaskedIndices.find_first();
1782 while (MIdx >= 0) {
1783 assert(Idx >= 0 && "Indices must be synced.");
1784 Order[MIdx] = Idx;
1785 Idx = UnusedIndices.find_next(Idx);
1786 MIdx = MaskedIndices.find_next(MIdx);
1787 }
1788}
1789
1790/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1791/// Opcode1.
1792 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1793 unsigned Opcode0, unsigned Opcode1) {
1794 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1795 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1796 for (unsigned Lane : seq<unsigned>(VL.size())) {
1797 if (isa<PoisonValue>(VL[Lane]))
1798 continue;
1799 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1800 OpcodeMask.set(Lane * ScalarTyNumElements,
1801 Lane * ScalarTyNumElements + ScalarTyNumElements);
1802 }
1803 return OpcodeMask;
1804}
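// Illustrative example (not part of the original source): for a scalar
// ScalarTy and VL = {add, sub, add, sub} with Opcode1 == Instruction::Sub,
// the returned bitset is {0, 1, 0, 1}, i.e. the bits for lanes 1 and 3 are
// set because those lanes use the alternate opcode Opcode1.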
1805
1806/// Replicates the given \p Val \p VF times.
1808 unsigned VF) {
1809 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1810 "Expected scalar constants.");
1811 SmallVector<Constant *> NewVal(Val.size() * VF);
1812 for (auto [I, V] : enumerate(Val))
1813 std::fill_n(NewVal.begin() + I * VF, VF, V);
1814 return NewVal;
1815}
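// Illustrative example (not part of the original source): replicating
// Val = {C0, C1} with VF = 3 produces {C0, C0, C0, C1, C1, C1}, i.e. each
// scalar constant is repeated VF times in place.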
1816
1817namespace llvm {
1818
1819 void inversePermutation(ArrayRef<unsigned> Indices,
1820 SmallVectorImpl<int> &Mask) {
1821 Mask.clear();
1822 const unsigned E = Indices.size();
1823 Mask.resize(E, PoisonMaskElem);
1824 for (unsigned I = 0; I < E; ++I)
1825 Mask[Indices[I]] = I;
1826}
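// Illustrative example (not part of the original source): for
// Indices = {2, 0, 1} the function produces Mask = {1, 2, 0}, since
// Mask[Indices[I]] = I places each position's index at its inverse location.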
1827
1828/// Reorders the list of scalars in accordance with the given \p Mask.
1829 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1830 ArrayRef<int> Mask) {
1831 assert(!Mask.empty() && "Expected non-empty mask.");
1832 SmallVector<Value *> Prev(Scalars.size(),
1833 PoisonValue::get(Scalars.front()->getType()));
1834 Prev.swap(Scalars);
1835 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1836 if (Mask[I] != PoisonMaskElem)
1837 Scalars[Mask[I]] = Prev[I];
1838}
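// Illustrative example (not part of the original source): for
// Scalars = {a, b, c} and Mask = {2, 0, 1}, element Prev[I] is moved to
// position Mask[I], giving the reordered sequence {b, c, a}.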
1839
1840/// Checks if the provided value does not require scheduling. It does not
1841/// require scheduling if this is not an instruction or it is an instruction
1842/// that does not read/write memory and all operands are either not instructions
1843/// or phi nodes or instructions from different blocks.
1844 static bool areAllOperandsNonInsts(Value *V) {
1845 auto *I = dyn_cast<Instruction>(V);
1846 if (!I)
1847 return true;
1848 return !mayHaveNonDefUseDependency(*I) &&
1849 all_of(I->operands(), [I](Value *V) {
1850 auto *IO = dyn_cast<Instruction>(V);
1851 if (!IO)
1852 return true;
1853 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1854 });
1855}
1856
1857/// Checks if the provided value does not require scheduling. It does not
1858/// require scheduling if this is not an instruction or it is an instruction
1859/// that does not read/write memory and all users are phi nodes or instructions
1860 /// from different blocks.
1861static bool isUsedOutsideBlock(Value *V) {
1862 auto *I = dyn_cast<Instruction>(V);
1863 if (!I)
1864 return true;
1865 // Limits the number of uses to save compile time.
1866 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1867 all_of(I->users(), [I](User *U) {
1868 auto *IU = dyn_cast<Instruction>(U);
1869 if (!IU)
1870 return true;
1871 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1872 });
1873}
1874
1875/// Checks if the specified value does not require scheduling. It does not
1876/// require scheduling if all operands and all users do not need to be scheduled
1877/// in the current basic block.
1878 static bool doesNotNeedToBeScheduled(Value *V) {
1879 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1880 }
1881
1882/// Checks if the specified array of instructions does not require scheduling.
1883 /// It is so if all instructions either have operands that do not require
1884 /// scheduling or have users that do not require scheduling since they are phis
1885 /// or in other basic blocks.
1886 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1887 return !VL.empty() &&
1888 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1889 }
1890
1891 /// Returns true if the widened type of \p Ty elements with size \p Sz represents
1892 /// a full vector type, i.e. adding an extra element results in extra parts upon
1893 /// type legalization.
1894 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1895 unsigned Sz) {
1896 if (Sz <= 1)
1897 return false;
1898 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1899 return false;
1900 if (has_single_bit(Sz))
1901 return true;
1902 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1903 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1904 Sz % NumParts == 0;
1905}
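// Illustrative note (not part of the original source): a power-of-2 size such
// as Sz == 8 returns true right away, while a non-power-of-2 size such as
// Sz == 12 returns true only if the target legalizes the widened type into
// NumParts parts with 12 % NumParts == 0 and 12 / NumParts a power of two
// (e.g. 3 parts of 4 elements).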
1906
1907 /// Returns the number of parts the type \p VecTy will be split into at the
1908 /// codegen phase. If the type is going to be scalarized or does not use whole
1909 /// registers, returns 1.
1910 static unsigned
1911 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1912 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1913 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1914 if (NumParts == 0 || NumParts >= Limit)
1915 return 1;
1916 unsigned Sz = getNumElements(VecTy);
1917 if (NumParts >= Sz || Sz % NumParts != 0 ||
1918 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1919 return 1;
1920 return NumParts;
1921}
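// Illustrative note (not part of the original source): assuming a target
// whose TTI reports that <8 x i32> legalizes into 2 registers, the helper
// returns 2 because 8 % 2 == 0 and each 4-element half satisfies
// hasFullVectorsOrPowerOf2; if the type were scalarized it would return 1.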
1922
1923namespace slpvectorizer {
1924
1925/// Bottom Up SLP Vectorizer.
1926class BoUpSLP {
1927 class TreeEntry;
1928 class ScheduleEntity;
1929 class ScheduleData;
1930 class ScheduleCopyableData;
1931 class ScheduleBundle;
1934
1935 /// If we decide to generate strided load / store, this struct contains all
1936 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1937 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1938 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1939 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1940 /// the element size of the FixedVectorType.
1941 struct StridedPtrInfo {
1942 Value *StrideVal = nullptr;
1943 const SCEV *StrideSCEV = nullptr;
1944 FixedVectorType *Ty = nullptr;
1945 };
1946 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1947
1948public:
1949 /// Tracks the state we can represent the loads in the given sequence.
1957
1964
1965 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1966 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1967 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1968 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1969 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1970 AC(AC), DB(DB), DL(DL), ORE(ORE),
1971 Builder(Se->getContext(), TargetFolder(*DL)) {
1972 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1973 // Use the vector register size specified by the target unless overridden
1974 // by a command-line option.
1975 // TODO: It would be better to limit the vectorization factor based on
1976 // data type rather than just register size. For example, x86 AVX has
1977 // 256-bit registers, but it does not support integer operations
1978 // at that width (that requires AVX2).
1979 if (MaxVectorRegSizeOption.getNumOccurrences())
1980 MaxVecRegSize = MaxVectorRegSizeOption;
1981 else
1982 MaxVecRegSize =
1983 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1984 .getFixedValue();
1985
1986 if (MinVectorRegSizeOption.getNumOccurrences())
1987 MinVecRegSize = MinVectorRegSizeOption;
1988 else
1989 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1990 }
1991
1992 /// Vectorize the tree that starts with the elements in \p VL.
1993 /// Returns the vectorized root.
1994 Value *vectorizeTree();
1995
1996 /// Vectorize the tree but with the list of externally used values \p
1997 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1998 /// generated extractvalue instructions.
1999 Value *vectorizeTree(
2000 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2001 Instruction *ReductionRoot = nullptr,
2002 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2003
2004 /// \returns the cost incurred by unwanted spills and fills, caused by
2005 /// holding live values over call sites.
2007
2008 /// \returns the vectorization cost of the subtree that starts at \p VL.
2009 /// A negative number means that this is profitable.
2010 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2011 InstructionCost ReductionCost = TTI::TCC_Free);
2012
2013 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2014 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2015 void buildTree(ArrayRef<Value *> Roots,
2016 const SmallDenseSet<Value *> &UserIgnoreLst);
2017
2018 /// Construct a vectorizable tree that starts at \p Roots.
2019 void buildTree(ArrayRef<Value *> Roots);
2020
2021 /// Return the scalars of the root node.
2023 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2024 return VectorizableTree.front()->Scalars;
2025 }
2026
2027 /// Returns the type/is-signed info for the root node in the graph without
2028 /// casting.
2029 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2030 const TreeEntry &Root = *VectorizableTree.front();
2031 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2032 !Root.Scalars.front()->getType()->isIntegerTy())
2033 return std::nullopt;
2034 auto It = MinBWs.find(&Root);
2035 if (It != MinBWs.end())
2036 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2037 It->second.first),
2038 It->second.second);
2039 if (Root.getOpcode() == Instruction::ZExt ||
2040 Root.getOpcode() == Instruction::SExt)
2041 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2042 Root.getOpcode() == Instruction::SExt);
2043 return std::nullopt;
2044 }
2045
2046 /// Checks if the root graph node can be emitted with narrower bitwidth at
2047 /// codegen and returns its signedness, if so.
2049 return MinBWs.at(VectorizableTree.front().get()).second;
2050 }
2051
2052 /// Returns reduction type after minbitwidth analysis.
2054 if (ReductionBitWidth == 0 ||
2055 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2056 ReductionBitWidth >=
2057 DL->getTypeSizeInBits(
2058 VectorizableTree.front()->Scalars.front()->getType()))
2059 return getWidenedType(
2060 VectorizableTree.front()->Scalars.front()->getType(),
2061 VectorizableTree.front()->getVectorFactor());
2062 return getWidenedType(
2063 IntegerType::get(
2064 VectorizableTree.front()->Scalars.front()->getContext(),
2065 ReductionBitWidth),
2066 VectorizableTree.front()->getVectorFactor());
2067 }
2068
2069 /// Builds external uses of the vectorized scalars, i.e. the list of
2070 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2071 /// ExternallyUsedValues contains additional list of external uses to handle
2072 /// vectorization of reductions.
2073 void
2074 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2075
2076 /// Transforms graph nodes to target specific representations, if profitable.
2077 void transformNodes();
2078
2079 /// Clear the internal data structures that are created by 'buildTree'.
2080 void deleteTree() {
2081 VectorizableTree.clear();
2082 ScalarToTreeEntries.clear();
2083 OperandsToTreeEntry.clear();
2084 ScalarsInSplitNodes.clear();
2085 MustGather.clear();
2086 NonScheduledFirst.clear();
2087 EntryToLastInstruction.clear();
2088 LoadEntriesToVectorize.clear();
2089 IsGraphTransformMode = false;
2090 GatheredLoadsEntriesFirst.reset();
2091 CompressEntryToData.clear();
2092 ExternalUses.clear();
2093 ExternalUsesAsOriginalScalar.clear();
2094 ExternalUsesWithNonUsers.clear();
2095 for (auto &Iter : BlocksSchedules) {
2096 BlockScheduling *BS = Iter.second.get();
2097 BS->clear();
2098 }
2099 MinBWs.clear();
2100 ReductionBitWidth = 0;
2101 BaseGraphSize = 1;
2102 CastMaxMinBWSizes.reset();
2103 ExtraBitWidthNodes.clear();
2104 InstrElementSize.clear();
2105 UserIgnoreList = nullptr;
2106 PostponedGathers.clear();
2107 ValueToGatherNodes.clear();
2108 TreeEntryToStridedPtrInfoMap.clear();
2109 }
2110
2111 unsigned getTreeSize() const { return VectorizableTree.size(); }
2112
2113 /// Returns the base graph size, before any transformations.
2114 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2115
2116 /// Perform LICM and CSE on the newly generated gather sequences.
2117 void optimizeGatherSequence();
2118
2119 /// Does this non-empty order represent an identity order? Identity
2120 /// should be represented as an empty order, so this is used to
2121 /// decide if we can canonicalize a computed order. Undef elements
2122 /// (represented as size) are ignored.
2123 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2124 assert(!Order.empty() && "expected non-empty order");
2125 const unsigned Sz = Order.size();
2126 return all_of(enumerate(Order), [&](const auto &P) {
2127 return P.value() == P.index() || P.value() == Sz;
2128 });
2129 }
2130
2131 /// Checks if the specified gather tree entry \p TE can be represented as a
2132 /// shuffled vector entry + (possibly) permutation with other gathers. It
2133 /// implements the checks only for possibly ordered scalars (Loads,
2134 /// ExtractElement, ExtractValue), which can be part of the graph.
2135 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2136 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2137 /// node might be ignored.
2138 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2139 bool TopToBottom,
2140 bool IgnoreReorder);
2141
2142 /// Sort loads into increasing pointers offsets to allow greater clustering.
2143 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2144
2145 /// Gets reordering data for the given tree entry. If the entry is vectorized
2146 /// - just return ReorderIndices, otherwise check if the scalars can be
2147 /// reordered and return the most optimal order.
2148 /// \return std::nullopt if ordering is not important, empty order, if
2149 /// identity order is important, or the actual order.
2150 /// \param TopToBottom If true, include the order of vectorized stores and
2151 /// insertelement nodes, otherwise skip them.
2152 /// \param IgnoreReorder true, if the root node order can be ignored.
2153 std::optional<OrdersType>
2154 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2155
2156 /// Checks if it is profitable to reorder the current tree.
2157 /// If the tree does not contain many profitable reorderable nodes, it is
2158 /// better to skip it to save compile time.
2159 bool isProfitableToReorder() const;
2160
2161 /// Reorders the current graph to the most profitable order starting from the
2162 /// root node to the leaf nodes. The best order is chosen only from the nodes
2163 /// of the same size (vectorization factor). Smaller nodes are considered
2164 /// parts of subgraph with smaller VF and they are reordered independently. We
2165 /// can make it because we still need to extend smaller nodes to the wider VF
2166 /// and we can merge reordering shuffles with the widening shuffles.
2167 void reorderTopToBottom();
2168
2169 /// Reorders the current graph to the most profitable order starting from the
2170 /// leaves to the root. It allows rotating small subgraphs and reduces the
2171 /// number of reshuffles if the leaf nodes use the same order. In this case we
2172 /// can merge the orders and just shuffle the user node instead of shuffling its
2173 /// operands. Plus, even if the leaf nodes have different orders, it allows
2174 /// sinking the reordering in the graph closer to the root node and merging it
2175 /// later during analysis.
2176 void reorderBottomToTop(bool IgnoreReorder = false);
2177
2178 /// \return The vector element size in bits to use when vectorizing the
2179 /// expression tree ending at \p V. If V is a store, the size is the width of
2180 /// the stored value. Otherwise, the size is the width of the largest loaded
2181 /// value reaching V. This method is used by the vectorizer to calculate
2182 /// vectorization factors.
2183 unsigned getVectorElementSize(Value *V);
2184
2185 /// Compute the minimum type sizes required to represent the entries in a
2186 /// vectorizable tree.
2187 void computeMinimumValueSizes();
2188
2189 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2190 unsigned getMaxVecRegSize() const {
2191 return MaxVecRegSize;
2192 }
2193
2194 // \returns minimum vector register size as set by cl::opt.
2195 unsigned getMinVecRegSize() const {
2196 return MinVecRegSize;
2197 }
2198
2199 unsigned getMinVF(unsigned Sz) const {
2200 return std::max(2U, getMinVecRegSize() / Sz);
2201 }
2202
2203 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2204 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2205 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2206 return MaxVF ? MaxVF : UINT_MAX;
2207 }
2208
2209 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2210 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2211 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2212 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2213 ///
2214 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2215 unsigned canMapToVector(Type *T) const;
2216
2217 /// \returns True if the VectorizableTree is both tiny and not fully
2218 /// vectorizable. We do not vectorize such trees.
2219 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2220
2221 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2222 /// It may happen, if all gather nodes are loads and they cannot be
2223 /// "clusterized". In this case even subgraphs cannot be vectorized more
2224 /// effectively than the base graph.
2225 bool isTreeNotExtendable() const;
2226
2227 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2228 /// can be load combined in the backend. Load combining may not be allowed in
2229 /// the IR optimizer, so we do not want to alter the pattern. For example,
2230 /// partially transforming a scalar bswap() pattern into vector code is
2231 /// effectively impossible for the backend to undo.
2232 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2233 /// may not be necessary.
2234 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2235
2236 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2237 /// can be load combined in the backend. Load combining may not be allowed in
2238 /// the IR optimizer, so we do not want to alter the pattern. For example,
2239 /// partially transforming a scalar bswap() pattern into vector code is
2240 /// effectively impossible for the backend to undo.
2241 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2242 /// may not be necessary.
2243 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2244 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2245 Align Alignment, const int64_t Diff,
2246 const size_t Sz) const;
2247
2248 /// Return true if an array of scalar loads can be replaced with a strided
2249 /// load (with constant stride).
2250 ///
2251 /// TODO:
2252 /// It is possible that the load gets "widened". Suppose that originally each
2253 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2254 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2255 /// ...
2256 /// %b + 0 * %s + (w - 1)
2257 ///
2258 /// %b + 1 * %s + 0
2259 /// %b + 1 * %s + 1
2260 /// %b + 1 * %s + 2
2261 /// ...
2262 /// %b + 1 * %s + (w - 1)
2263 /// ...
2264 ///
2265 /// %b + (n - 1) * %s + 0
2266 /// %b + (n - 1) * %s + 1
2267 /// %b + (n - 1) * %s + 2
2268 /// ...
2269 /// %b + (n - 1) * %s + (w - 1)
2270 ///
2271 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2272 ///
2273 /// \param PointerOps list of pointer arguments of loads.
2274 /// \param ElemTy original scalar type of loads.
2275 /// \param Alignment alignment of the first load.
2276 /// \param SortedIndices is the order of PointerOps as returned by
2277 /// `sortPtrAccesses`
2278 /// \param Diff Pointer difference between the lowest and the highest pointer
2279 /// in `PointerOps` as returned by `getPointersDiff`.
2280 /// \param Ptr0 first pointer in `PointersOps`.
2281 /// \param PtrN last pointer in `PointersOps`.
2282 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2283 /// of `SPtrInfo` necessary to generate the strided load later.
2284 bool analyzeConstantStrideCandidate(
2285 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2286 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2287 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2288
2289 /// Return true if an array of scalar loads can be replaced with a strided
2290 /// load (with run-time stride).
2291 /// \param PointerOps list of pointer arguments of loads.
2292 /// \param ScalarTy type of loads.
2293 /// \param CommonAlignment common alignment of loads as computed by
2294 /// `computeCommonAlignment<LoadInst>`.
2295 /// \param SortedIndices is a list of indices computed by this function such
2296 /// that the sequence `PointerOps[SortedIndices[0]],
2297 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2298 /// ordered by the coefficient of the stride. For example, if PointerOps is
2299 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2300 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2301 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2302 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2303 /// of `SPtrInfo` necessary to generate the strided load later.
2304 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2305 Align CommonAlignment,
2306 SmallVectorImpl<unsigned> &SortedIndices,
2307 StridedPtrInfo &SPtrInfo) const;
2308
2309 /// Checks if the given array of loads can be represented as a vectorized
2310 /// load, scatter or just a simple gather.
2311 /// \param VL list of loads.
2312 /// \param VL0 main load value.
2313 /// \param Order returned order of load instructions.
2314 /// \param PointerOps returned list of pointer operands.
2315 /// \param BestVF return best vector factor, if recursive check found better
2316 /// vectorization sequences rather than masked gather.
2317 /// \param TryRecursiveCheck used to check if long masked gather can be
2318 /// represented as a series of loads/insert subvector, if profitable.
2319 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2320 SmallVectorImpl<unsigned> &Order,
2321 SmallVectorImpl<Value *> &PointerOps,
2322 StridedPtrInfo &SPtrInfo,
2323 unsigned *BestVF = nullptr,
2324 bool TryRecursiveCheck = true) const;
2325
2326 /// Registers non-vectorizable sequence of loads
2327 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2328 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2329 }
2330
2331 /// Checks if the given loads sequence is known as not vectorizable
2332 template <typename T>
2333 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2334 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2335 }
2336
2338
2339 /// This structure holds any data we need about the edges being traversed
2340 /// during buildTreeRec(). We keep track of:
2341 /// (i) the user TreeEntry index, and
2342 /// (ii) the index of the edge.
2343 struct EdgeInfo {
2344 EdgeInfo() = default;
2345 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2346 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2347 /// The user TreeEntry.
2348 TreeEntry *UserTE = nullptr;
2349 /// The operand index of the use.
2350 unsigned EdgeIdx = UINT_MAX;
2351#ifndef NDEBUG
2352 friend inline raw_ostream &operator<<(raw_ostream &OS,
2353 const BoUpSLP::EdgeInfo &EI) {
2354 EI.dump(OS);
2355 return OS;
2356 }
2357 /// Debug print.
2358 void dump(raw_ostream &OS) const {
2359 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2360 << " EdgeIdx:" << EdgeIdx << "}";
2361 }
2362 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2363#endif
2364 bool operator == (const EdgeInfo &Other) const {
2365 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2366 }
2367
2368 operator bool() const { return UserTE != nullptr; }
2369 };
2370 friend struct DenseMapInfo<EdgeInfo>;
2371
2372 /// A helper class used for scoring candidates for two consecutive lanes.
2373 class LookAheadHeuristics {
2374 const TargetLibraryInfo &TLI;
2375 const DataLayout &DL;
2376 ScalarEvolution &SE;
2377 const BoUpSLP &R;
2378 int NumLanes; // Total number of lanes (aka vectorization factor).
2379 int MaxLevel; // The maximum recursion depth for accumulating score.
2380
2381 public:
2382 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2383 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2384 int MaxLevel)
2385 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2386 MaxLevel(MaxLevel) {}
2387
2388 // The hard-coded scores listed here are not very important, though it shall
2389 // be higher for better matches to improve the resulting cost. When
2390 // computing the scores of matching one sub-tree with another, we are
2391 // basically counting the number of values that are matching. So even if all
2392 // scores are set to 1, we would still get a decent matching result.
2393 // However, sometimes we have to break ties. For example we may have to
2394 // choose between matching loads vs matching opcodes. This is what these
2395 // scores are helping us with: they provide the order of preference. Also,
2396 // this is important if the scalar is externally used or used in another
2397 // tree entry node in the different lane.
2398
2399 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2400 static const int ScoreConsecutiveLoads = 4;
2401 /// The same load multiple times. This should have a better score than
2402 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2403 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2404 /// a vector load and 1.0 for a broadcast.
2405 static const int ScoreSplatLoads = 3;
2406 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2407 static const int ScoreReversedLoads = 3;
2408 /// A load candidate for masked gather.
2409 static const int ScoreMaskedGatherCandidate = 1;
2410 /// ExtractElementInst from same vector and consecutive indexes.
2411 static const int ScoreConsecutiveExtracts = 4;
2412 /// ExtractElementInst from same vector and reversed indices.
2413 static const int ScoreReversedExtracts = 3;
2414 /// Constants.
2415 static const int ScoreConstants = 2;
2416 /// Instructions with the same opcode.
2417 static const int ScoreSameOpcode = 2;
2418 /// Instructions with alt opcodes (e.g, add + sub).
2419 static const int ScoreAltOpcodes = 1;
2420 /// Identical instructions (a.k.a. splat or broadcast).
2421 static const int ScoreSplat = 1;
2422 /// Matching with an undef is preferable to failing.
2423 static const int ScoreUndef = 1;
2424 /// Score for failing to find a decent match.
2425 static const int ScoreFail = 0;
2426 /// Score if all users are vectorized.
2427 static const int ScoreAllUserVectorized = 1;
2428
2429 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2430 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2431 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2432 /// MainAltOps.
2433 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2434 ArrayRef<Value *> MainAltOps) const {
2435 if (!isValidElementType(V1->getType()) ||
2436 !isValidElementType(V2->getType()))
2437 return LookAheadHeuristics::ScoreFail;
2438
2439 if (V1 == V2) {
2440 if (isa<LoadInst>(V1)) {
2441 // Returns true if the users of V1 and V2 won't need to be extracted.
2442 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2443 // Bail out if we have too many uses to save compilation time.
2444 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2445 return false;
2446
2447 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2448 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2449 return U == U1 || U == U2 || R.isVectorized(U);
2450 });
2451 };
2452 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2453 };
2454 // A broadcast of a load can be cheaper on some targets.
2455 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2456 ElementCount::getFixed(NumLanes)) &&
2457 ((int)V1->getNumUses() == NumLanes ||
2458 AllUsersAreInternal(V1, V2)))
2459 return LookAheadHeuristics::ScoreSplatLoads;
2460 }
2461 return LookAheadHeuristics::ScoreSplat;
2462 }
2463
2464 auto CheckSameEntryOrFail = [&]() {
2465 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2466 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2467 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2468 !TEs2.empty() &&
2469 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2470 return LookAheadHeuristics::ScoreSplatLoads;
2471 }
2472 return LookAheadHeuristics::ScoreFail;
2473 };
2474
2475 auto *LI1 = dyn_cast<LoadInst>(V1);
2476 auto *LI2 = dyn_cast<LoadInst>(V2);
2477 if (LI1 && LI2) {
2478 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2479 !LI2->isSimple())
2480 return CheckSameEntryOrFail();
2481
2482 std::optional<int64_t> Dist = getPointersDiff(
2483 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2484 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2485 if (!Dist || *Dist == 0) {
2486 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2487 getUnderlyingObject(LI2->getPointerOperand()) &&
2488 R.TTI->isLegalMaskedGather(
2489 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2490 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2491 return CheckSameEntryOrFail();
2492 }
2493 // The distance is too large - still may be profitable to use masked
2494 // loads/gathers.
2495 if (std::abs(*Dist) > NumLanes / 2)
2496 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2497 // This still will detect consecutive loads, but we might have "holes"
2498 // in some cases. It is ok for non-power-2 vectorization and may produce
2499 // better results. It should not affect current vectorization.
2500 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2501 : LookAheadHeuristics::ScoreReversedLoads;
2502 }
2503
2504 auto *C1 = dyn_cast<Constant>(V1);
2505 auto *C2 = dyn_cast<Constant>(V2);
2506 if (C1 && C2)
2507 return LookAheadHeuristics::ScoreConstants;
2508
2509 // Consider constants and buildvector compatible.
2510 if ((C1 && isa<InsertElementInst>(V2)) ||
2511 (C2 && isa<InsertElementInst>(V1)))
2512 return LookAheadHeuristics::ScoreConstants;
2513
2514 // Extracts from consecutive indexes of the same vector better score as
2515 // the extracts could be optimized away.
2516 Value *EV1;
2517 ConstantInt *Ex1Idx;
2518 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2519 // Undefs are always profitable for extractelements.
2520 // Compiler can easily combine poison and extractelement <non-poison> or
2521 // undef and extractelement <poison>. But combining undef +
2522 // extractelement <non-poison-but-may-produce-poison> requires some
2523 // extra operations.
2524 if (isa<UndefValue>(V2))
2525 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2526 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2527 : LookAheadHeuristics::ScoreSameOpcode;
2528 Value *EV2 = nullptr;
2529 ConstantInt *Ex2Idx = nullptr;
2530 if (match(V2,
2531 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2532 m_Undef())))) {
2533 // Undefs are always profitable for extractelements.
2534 if (!Ex2Idx)
2535 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2536 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2537 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2538 if (EV2 == EV1) {
2539 int Idx1 = Ex1Idx->getZExtValue();
2540 int Idx2 = Ex2Idx->getZExtValue();
2541 int Dist = Idx2 - Idx1;
2542 // The distance is too large - still may be profitable to use
2543 // shuffles.
2544 if (std::abs(Dist) == 0)
2545 return LookAheadHeuristics::ScoreSplat;
2546 if (std::abs(Dist) > NumLanes / 2)
2547 return LookAheadHeuristics::ScoreSameOpcode;
2548 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2549 : LookAheadHeuristics::ScoreReversedExtracts;
2550 }
2552 }
2553 return CheckSameEntryOrFail();
2554 }
2555
2556 auto *I1 = dyn_cast<Instruction>(V1);
2557 auto *I2 = dyn_cast<Instruction>(V2);
2558 if (I1 && I2) {
2559 if (I1->getParent() != I2->getParent())
2560 return CheckSameEntryOrFail();
2561 SmallVector<Value *, 4> Ops(MainAltOps);
2562 Ops.push_back(I1);
2563 Ops.push_back(I2);
2564 InstructionsState S = getSameOpcode(Ops, TLI);
2565 // Note: Only consider instructions with <= 2 operands to avoid
2566 // complexity explosion.
2567 if (S &&
2568 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2569 !S.isAltShuffle()) &&
2570 all_of(Ops, [&S](Value *V) {
2571 return isa<PoisonValue>(V) ||
2572 cast<Instruction>(V)->getNumOperands() ==
2573 S.getMainOp()->getNumOperands();
2574 }))
2575 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2576 : LookAheadHeuristics::ScoreSameOpcode;
2577 }
2578
2579 if (I1 && isa<PoisonValue>(V2))
2580 return LookAheadHeuristics::ScoreSameOpcode;
2581
2582 if (isa<UndefValue>(V2))
2583 return LookAheadHeuristics::ScoreUndef;
2584
2585 return CheckSameEntryOrFail();
2586 }
2587
2588 /// Go through the operands of \p LHS and \p RHS recursively until
2589 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2590 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2591 /// of \p U1 and \p U2), except at the beginning of the recursion where
2592 /// these are set to nullptr.
2593 ///
2594 /// For example:
2595 /// \verbatim
2596 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2597 /// \ / \ / \ / \ /
2598 /// + + + +
2599 /// G1 G2 G3 G4
2600 /// \endverbatim
2601 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2602 /// each level recursively, accumulating the score. It starts from matching
2603 /// the additions at level 0, then moves on to the loads (level 1). The
2604 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2605 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2606 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2607 /// Please note that the order of the operands does not matter, as we
2608 /// evaluate the score of all profitable combinations of operands. In
2609 /// other words the score of G1 and G4 is the same as G1 and G2. This
2610 /// heuristic is based on ideas described in:
2611 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2612 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2613 /// Luís F. W. Góes
2614 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2615 Instruction *U2, int CurrLevel,
2616 ArrayRef<Value *> MainAltOps) const {
2617
2618 // Get the shallow score of V1 and V2.
2619 int ShallowScoreAtThisLevel =
2620 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2621
2622 // If reached MaxLevel,
2623 // or if V1 and V2 are not instructions,
2624 // or if they are SPLAT,
2625 // or if they are not consecutive,
2626 // or if profitable to vectorize loads or extractelements, early return
2627 // the current cost.
2628 auto *I1 = dyn_cast<Instruction>(LHS);
2629 auto *I2 = dyn_cast<Instruction>(RHS);
2630 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2631 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2632 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2633 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2634 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2635 ShallowScoreAtThisLevel))
2636 return ShallowScoreAtThisLevel;
2637 assert(I1 && I2 && "Should have early exited.");
2638
2639 // Contains the I2 operand indexes that got matched with I1 operands.
2640 SmallSet<unsigned, 4> Op2Used;
2641
2642 // Recursion towards the operands of I1 and I2. We are trying all possible
2643 // operand pairs, and keeping track of the best score.
2644 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2645 OpIdx1 != NumOperands1; ++OpIdx1) {
2646 // Try to pair op1I with the best operand of I2.
2647 int MaxTmpScore = 0;
2648 unsigned MaxOpIdx2 = 0;
2649 bool FoundBest = false;
2650 // If I2 is commutative try all combinations.
2651 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2652 unsigned ToIdx = isCommutative(I2)
2653 ? I2->getNumOperands()
2654 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2655 assert(FromIdx <= ToIdx && "Bad index");
2656 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2657 // Skip operands already paired with OpIdx1.
2658 if (Op2Used.count(OpIdx2))
2659 continue;
2660 // Recursively calculate the cost at each level
2661 int TmpScore =
2662 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2663 I1, I2, CurrLevel + 1, {});
2664 // Look for the best score.
2665 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2666 TmpScore > MaxTmpScore) {
2667 MaxTmpScore = TmpScore;
2668 MaxOpIdx2 = OpIdx2;
2669 FoundBest = true;
2670 }
2671 }
2672 if (FoundBest) {
2673 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2674 Op2Used.insert(MaxOpIdx2);
2675 ShallowScoreAtThisLevel += MaxTmpScore;
2676 }
2677 }
2678 return ShallowScoreAtThisLevel;
2679 }
2680 };
2681 /// A helper data structure to hold the operands of a vector of instructions.
2682 /// This supports a fixed vector length for all operand vectors.
2683 class VLOperands {
2684 /// For each operand we need (i) the value, and (ii) the opcode that it
2685 /// would be attached to if the expression was in a left-linearized form.
2686 /// This is required to avoid illegal operand reordering.
2687 /// For example:
2688 /// \verbatim
2689 /// 0 Op1
2690 /// |/
2691 /// Op1 Op2 Linearized + Op2
2692 /// \ / ----------> |/
2693 /// - -
2694 ///
2695 /// Op1 - Op2 (0 + Op1) - Op2
2696 /// \endverbatim
2697 ///
2698 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2699 ///
2700 /// Another way to think of this is to track all the operations across the
2701 /// path from the operand all the way to the root of the tree and to
2702 /// calculate the operation that corresponds to this path. For example, the
2703 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2704 /// corresponding operation is a '-' (which matches the one in the
2705 /// linearized tree, as shown above).
2706 ///
2707 /// For lack of a better term, we refer to this operation as Accumulated
2708 /// Path Operation (APO).
2709 struct OperandData {
2710 OperandData() = default;
2711 OperandData(Value *V, bool APO, bool IsUsed)
2712 : V(V), APO(APO), IsUsed(IsUsed) {}
2713 /// The operand value.
2714 Value *V = nullptr;
2715 /// TreeEntries only allow a single opcode, or an alternate sequence of
2716 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2717 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2718 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2719 /// (e.g., Add/Mul)
2720 bool APO = false;
2721 /// Helper data for the reordering function.
2722 bool IsUsed = false;
2723 };
2724
2725 /// During operand reordering, we are trying to select the operand at lane
2726 /// that matches best with the operand at the neighboring lane. Our
2727 /// selection is based on the type of value we are looking for. For example,
2728 /// if the neighboring lane has a load, we need to look for a load that is
2729 /// accessing a consecutive address. These strategies are summarized in the
2730 /// 'ReorderingMode' enumerator.
2731 enum class ReorderingMode {
2732 Load, ///< Matching loads to consecutive memory addresses
2733 Opcode, ///< Matching instructions based on opcode (same or alternate)
2734 Constant, ///< Matching constants
2735 Splat, ///< Matching the same instruction multiple times (broadcast)
2736 Failed, ///< We failed to create a vectorizable group
2737 };
2738
2739 using OperandDataVec = SmallVector<OperandData, 2>;
2740
2741 /// A vector of operand vectors.
2742 SmallVector<OperandDataVec, 4> OpsVec;
2743 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2744 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2745 unsigned ArgSize = 0;
2746
2747 const TargetLibraryInfo &TLI;
2748 const DataLayout &DL;
2749 ScalarEvolution &SE;
2750 const BoUpSLP &R;
2751 const Loop *L = nullptr;
2752
2753 /// \returns the operand data at \p OpIdx and \p Lane.
2754 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2755 return OpsVec[OpIdx][Lane];
2756 }
2757
2758 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2759 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2760 return OpsVec[OpIdx][Lane];
2761 }
2762
2763 /// Clears the used flag for all entries.
2764 void clearUsed() {
2765 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2766 OpIdx != NumOperands; ++OpIdx)
2767 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2768 ++Lane)
2769 OpsVec[OpIdx][Lane].IsUsed = false;
2770 }
2771
2772 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2773 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2774 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2775 }
2776
2777 /// \param Lane lane of the operands under analysis.
2778 /// \param OpIdx operand index in \p Lane lane we're looking for the best
2779 /// candidate for.
2780 /// \param Idx operand index of the current candidate value.
2781 /// \returns The additional score due to possible broadcasting of the
2782 /// elements in the lane. It is more profitable to have power-of-2 unique
2783 /// elements in the lane, as it will be vectorized with higher probability
2784 /// after removing duplicates. Currently the SLP vectorizer supports only
2785 /// vectorization of the power-of-2 number of unique scalars.
2786 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2787 const SmallBitVector &UsedLanes) const {
2788 Value *IdxLaneV = getData(Idx, Lane).V;
2789 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2790 isa<ExtractElementInst>(IdxLaneV))
2791 return 0;
2792 SmallDenseMap<Value *, unsigned, 4> Uniques;
2793 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2794 if (Ln == Lane)
2795 continue;
2796 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2797 if (!isa<Instruction>(OpIdxLnV))
2798 return 0;
2799 Uniques.try_emplace(OpIdxLnV, Ln);
2800 }
2801 unsigned UniquesCount = Uniques.size();
2802 auto IdxIt = Uniques.find(IdxLaneV);
2803 unsigned UniquesCntWithIdxLaneV =
2804 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2805 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2806 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2807 unsigned UniquesCntWithOpIdxLaneV =
2808 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2809 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2810 return 0;
2811 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2812 UniquesCntWithOpIdxLaneV,
2813 UniquesCntWithOpIdxLaneV -
2814 bit_floor(UniquesCntWithOpIdxLaneV)) -
2815 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2816 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2817 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2818 }
2819
2820 /// \param Lane lane of the operands under analysis.
2821 /// \param OpIdx operand index in \p Lane lane we're looking for the best
2822 /// candidate for.
2823 /// \param Idx operand index of the current candidate value.
2824 /// \returns The additional score for the scalar which users are all
2825 /// vectorized.
2826 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2827 Value *IdxLaneV = getData(Idx, Lane).V;
2828 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2829 // Do not care about number of uses for vector-like instructions
2830 // (extractelement/extractvalue with constant indices), they are extracts
2831 // themselves and already externally used. Vectorization of such
2832 // instructions does not add extra extractelement instruction, just may
2833 // remove it.
2834 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2835 isVectorLikeInstWithConstOps(OpIdxLaneV))
2836 return LookAheadHeuristics::ScoreAllUserVectorized;
2837 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2838 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2839 return 0;
2840 return R.areAllUsersVectorized(IdxLaneI)
2841 ? LookAheadHeuristics::ScoreAllUserVectorized
2842 : 0;
2843 }
2844
2845 /// Score scaling factor for fully compatible instructions but with
2846 /// different number of external uses. Allows better selection of the
2847 /// instructions with less external uses.
2848 static const int ScoreScaleFactor = 10;
2849
2850 /// \Returns the look-ahead score, which tells us how much the sub-trees
2851 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2852 /// score. This helps break ties in an informed way when we cannot decide on
2853 /// the order of the operands by just considering the immediate
2854 /// predecessors.
2855 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2856 int Lane, unsigned OpIdx, unsigned Idx,
2857 bool &IsUsed, const SmallBitVector &UsedLanes) {
2858 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2859 LookAheadMaxDepth);
2860 // Keep track of the instruction stack as we recurse into the operands
2861 // during the look-ahead score exploration.
2862 int Score =
2863 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2864 /*CurrLevel=*/1, MainAltOps);
2865 if (Score) {
2866 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2867 if (Score <= -SplatScore) {
2868 // Failed score.
2869 Score = 0;
2870 } else {
2871 Score += SplatScore;
2872 // Scale score to see the difference between different operands
2873 // and similar operands but all vectorized/not all vectorized
2874 // uses. It does not affect actual selection of the best
2875 // compatible operand in general, just allows to select the
2876 // operand with all vectorized uses.
2877 Score *= ScoreScaleFactor;
2878 Score += getExternalUseScore(Lane, OpIdx, Idx);
2879 IsUsed = true;
2880 }
2881 }
2882 return Score;
2883 }
2884
2885 /// Best defined scores per lanes between the passes. Used to choose the
2886 /// best operand (with the highest score) between the passes.
2887 /// The key - {Operand Index, Lane}.
2888 /// The value - the best score between the passes for the lane and the
2889 /// operand.
2891 BestScoresPerLanes;
2892
2893 // Search all operands in Ops[*][Lane] for the one that matches best
2894 // Ops[OpIdx][LastLane] and return its operand index.
2895 // If no good match can be found, return std::nullopt.
2896 std::optional<unsigned>
2897 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2898 ArrayRef<ReorderingMode> ReorderingModes,
2899 ArrayRef<Value *> MainAltOps,
2900 const SmallBitVector &UsedLanes) {
2901 unsigned NumOperands = getNumOperands();
2902
2903 // The operand of the previous lane at OpIdx.
2904 Value *OpLastLane = getData(OpIdx, LastLane).V;
2905
2906 // Our strategy mode for OpIdx.
2907 ReorderingMode RMode = ReorderingModes[OpIdx];
2908 if (RMode == ReorderingMode::Failed)
2909 return std::nullopt;
2910
2911 // The linearized opcode of the operand at OpIdx, Lane.
2912 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2913
2914 // The best operand index and its score.
2915 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2916 // are using the score to differentiate between the two.
2917 struct BestOpData {
2918 std::optional<unsigned> Idx;
2919 unsigned Score = 0;
2920 } BestOp;
2921 BestOp.Score =
2922 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2923 .first->second;
2924
2925 // Track if the operand must be marked as used. If the operand is set to
2926 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2927 // want to reestimate the operands again on the following iterations).
2928 bool IsUsed = RMode == ReorderingMode::Splat ||
2929 RMode == ReorderingMode::Constant ||
2930 RMode == ReorderingMode::Load;
2931 // Iterate through all unused operands and look for the best.
2932 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2933 // Get the operand at Idx and Lane.
2934 OperandData &OpData = getData(Idx, Lane);
2935 Value *Op = OpData.V;
2936 bool OpAPO = OpData.APO;
2937
2938 // Skip already selected operands.
2939 if (OpData.IsUsed)
2940 continue;
2941
2942 // Skip if we are trying to move the operand to a position with a
2943 // different opcode in the linearized tree form. This would break the
2944 // semantics.
2945 if (OpAPO != OpIdxAPO)
2946 continue;
2947
2948 // Look for an operand that matches the current mode.
2949 switch (RMode) {
2950 case ReorderingMode::Load:
2951 case ReorderingMode::Opcode: {
2952 bool LeftToRight = Lane > LastLane;
2953 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2954 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2955 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2956 OpIdx, Idx, IsUsed, UsedLanes);
2957 if (Score > static_cast<int>(BestOp.Score) ||
2958 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2959 Idx == OpIdx)) {
2960 BestOp.Idx = Idx;
2961 BestOp.Score = Score;
2962 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2963 }
2964 break;
2965 }
2966 case ReorderingMode::Constant:
2967 if (isa<Constant>(Op) ||
2968 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2969 BestOp.Idx = Idx;
2970 if (isa<Constant>(Op)) {
2972 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2974 }
2976 IsUsed = false;
2977 }
2978 break;
2979 case ReorderingMode::Splat:
2980 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2981 IsUsed = Op == OpLastLane;
2982 if (Op == OpLastLane) {
2983 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2984 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2986 }
2987 BestOp.Idx = Idx;
2988 }
2989 break;
2990 case ReorderingMode::Failed:
2991 llvm_unreachable("Not expected Failed reordering mode.");
2992 }
2993 }
2994
2995 if (BestOp.Idx) {
2996 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2997 return BestOp.Idx;
2998 }
2999 // If we could not find a good match return std::nullopt.
3000 return std::nullopt;
3001 }
3002
3003 /// Helper for reorderOperandVecs.
3004 /// \returns the lane that we should start reordering from. This is the one
3005 /// which has the least number of operands that can freely move about or is
3006 /// less profitable because it already has the most optimal set of operands.
3007 unsigned getBestLaneToStartReordering() const {
3008 unsigned Min = UINT_MAX;
3009 unsigned SameOpNumber = 0;
3010 // std::pair<unsigned, unsigned> is used to implement a simple voting
3011 // algorithm and choose the lane with the least number of operands that
3012 // can freely move about or is less profitable because it already has the
3013 // most optimal set of operands. The first unsigned is a counter for
3014 // voting, the second unsigned is the counter of lanes with instructions
3015 // with same/alternate opcodes and same parent basic block.
3017 // Try to be closer to the original results, if we have multiple lanes
3018 // with same cost. If 2 lanes have the same cost, use the one with the
3019 // highest index.
3020 for (int I = getNumLanes(); I > 0; --I) {
3021 unsigned Lane = I - 1;
3022 OperandsOrderData NumFreeOpsHash =
3023 getMaxNumOperandsThatCanBeReordered(Lane);
3024 // Compare the number of operands that can move and choose the one with
3025 // the least number.
3026 if (NumFreeOpsHash.NumOfAPOs < Min) {
3027 Min = NumFreeOpsHash.NumOfAPOs;
3028 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3029 HashMap.clear();
3030 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3031 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3032 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3033 // Select the most optimal lane in terms of number of operands that
3034 // should be moved around.
3035 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3036 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3037 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3038 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3039 auto [It, Inserted] =
3040 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3041 if (!Inserted)
3042 ++It->second.first;
3043 }
3044 }
3045 // Select the lane with the minimum counter.
3046 unsigned BestLane = 0;
3047 unsigned CntMin = UINT_MAX;
3048 for (const auto &Data : reverse(HashMap)) {
3049 if (Data.second.first < CntMin) {
3050 CntMin = Data.second.first;
3051 BestLane = Data.second.second;
3052 }
3053 }
3054 return BestLane;
3055 }
3056
3057 /// Data structure that helps to reorder operands.
3058 struct OperandsOrderData {
3059 /// The best number of operands with the same APOs, which can be
3060 /// reordered.
3061 unsigned NumOfAPOs = UINT_MAX;
3062 /// Number of operands with the same/alternate instruction opcode and
3063 /// parent.
3064 unsigned NumOpsWithSameOpcodeParent = 0;
3065 /// Hash for the actual operands ordering.
3066 /// Used to count operands, actually their position id and opcode
3067 /// value. It is used in the voting mechanism to find the lane with the
3068 /// fewest operands that can freely move about, or the least profitable one
3069 /// because it already has the most optimal set of operands. Can be
3070 /// replaced with SmallVector<unsigned> instead but hash code is faster
3071 /// and requires less memory.
3072 unsigned Hash = 0;
3073 };
3074 /// \returns the maximum number of operands that are allowed to be reordered
3075 /// for \p Lane and the number of compatible instructions (with the same
3076 /// parent/opcode). This is used as a heuristic for selecting the first lane
3077 /// to start operand reordering.
3078 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3079 unsigned CntTrue = 0;
3080 unsigned NumOperands = getNumOperands();
3081 // Operands with the same APO can be reordered. We therefore need to count
3082 // how many of them we have for each APO, like this: Cnt[APO] = x.
3083 // Since we only have two APOs, namely true and false, we can avoid using
3084 // a map. Instead we can simply count the number of operands that
3085 // correspond to one of them (in this case the 'true' APO), and calculate
3086 // the other by subtracting it from the total number of operands.
3087 // Operands with the same instruction opcode and parent are more
3088 // profitable since we don't need to move them in many cases, with a high
3089 // probability such lane already can be vectorized effectively.
3090 bool AllUndefs = true;
3091 unsigned NumOpsWithSameOpcodeParent = 0;
3092 Instruction *OpcodeI = nullptr;
3093 BasicBlock *Parent = nullptr;
3094 unsigned Hash = 0;
3095 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3096 const OperandData &OpData = getData(OpIdx, Lane);
3097 if (OpData.APO)
3098 ++CntTrue;
3099 // Use Boyer-Moore majority voting for finding the majority opcode and
3100 // the number of times it occurs.
3101 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3102 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3103 I->getParent() != Parent) {
3104 if (NumOpsWithSameOpcodeParent == 0) {
3105 NumOpsWithSameOpcodeParent = 1;
3106 OpcodeI = I;
3107 Parent = I->getParent();
3108 } else {
3109 --NumOpsWithSameOpcodeParent;
3110 }
3111 } else {
3112 ++NumOpsWithSameOpcodeParent;
3113 }
3114 }
3115 Hash = hash_combine(
3116 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3117 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3118 }
3119 if (AllUndefs)
3120 return {};
3121 OperandsOrderData Data;
3122 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3123 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3124 Data.Hash = Hash;
3125 return Data;
3126 }
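// Example for getMaxNumOperandsThatCanBeReordered(): for a lane 'a = x + y'
// both operands have APO == false, so NumOfAPOs = max(0, 2) = 2; for
// 'a = x - y' the RHS operand has APO == true, so NumOfAPOs = max(1, 1) = 1.
// Lanes built around non-commutative operations therefore report fewer freely
// movable operands and are preferred starting points for reordering.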
3127
3128 /// Go through the instructions in VL and append their operands.
3129 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3130 const InstructionsState &S) {
3131 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3132 assert((empty() || all_of(Operands,
3133 [this](const ValueList &VL) {
3134 return VL.size() == getNumLanes();
3135 })) &&
3136 "Expected same number of lanes");
3137 assert(S.valid() && "InstructionsState is invalid.");
3138 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3139 // arguments to the intrinsic produces the same result.
3140 Instruction *MainOp = S.getMainOp();
3141 unsigned NumOperands = MainOp->getNumOperands();
3142 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3143 OpsVec.resize(ArgSize);
3144 unsigned NumLanes = VL.size();
3145 for (OperandDataVec &Ops : OpsVec)
3146 Ops.resize(NumLanes);
3147 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3148 // Our tree has just 3 nodes: the root and two operands.
3149 // It is therefore trivial to get the APO. We only need to check the
3150 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3151 // operand. The LHS operand of both add and sub is never attached to an
3152 // inverse operation in the linearized form, therefore its APO is
3153 // false. The APO of the RHS is true only if V is an inverse operation.
3154
3155 // Since operand reordering is performed on groups of commutative
3156 // operations or alternating sequences (e.g., +, -), we can safely tell
3157 // the inverse operations by checking commutativity.
3158 auto *I = dyn_cast<Instruction>(VL[Lane]);
3159 if (!I && isa<PoisonValue>(VL[Lane])) {
3160 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3161 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3162 continue;
3163 }
3164 bool IsInverseOperation = false;
3165 if (S.isCopyableElement(VL[Lane])) {
3166 // The value is a copyable element.
3167 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3168 } else {
3169 assert(I && "Expected instruction");
3170 auto [SelectedOp, Ops] = convertTo(I, S);
3171 // We cannot check commutativity by the converted instruction
3172 // (SelectedOp) because isCommutative also examines def-use
3173 // relationships.
3174 IsInverseOperation = !isCommutative(SelectedOp, I);
3175 }
3176 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3177 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3178 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3179 }
3180 }
3181 }
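// Example for appendOperands(): for VL = {a = x + y, b = z - w} the appended
// operands are OpsVec[0] = {{x, APO:0}, {z, APO:0}} and
// OpsVec[1] = {{y, APO:0}, {w, APO:1}}: operand 0 always gets APO == false,
// and operand 1 gets APO == true only for the non-commutative (inverse) lane.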
3182
3183 /// \returns the number of operands.
3184 unsigned getNumOperands() const { return ArgSize; }
3185
3186 /// \returns the number of lanes.
3187 unsigned getNumLanes() const { return OpsVec[0].size(); }
3188
3189 /// \returns the operand value at \p OpIdx and \p Lane.
3190 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3191 return getData(OpIdx, Lane).V;
3192 }
3193
3194 /// \returns true if the data structure is empty.
3195 bool empty() const { return OpsVec.empty(); }
3196
3197 /// Clears the data.
3198 void clear() { OpsVec.clear(); }
3199
3200 /// \returns true if there are enough operands identical to \p Op to fill
3201 /// the whole vector (possibly mixed with constants or loop-invariant values).
3202 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3203 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3204 assert(Op == getValue(OpIdx, Lane) &&
3205 "Op is expected to be getValue(OpIdx, Lane).");
3206 // Small number of loads - try load matching.
3207 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3208 return false;
3209 bool OpAPO = getData(OpIdx, Lane).APO;
3210 bool IsInvariant = L && L->isLoopInvariant(Op);
3211 unsigned Cnt = 0;
3212 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3213 if (Ln == Lane)
3214 continue;
3215 // This is set to true if we found a candidate for broadcast at Lane.
3216 bool FoundCandidate = false;
3217 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3218 OperandData &Data = getData(OpI, Ln);
3219 if (Data.APO != OpAPO || Data.IsUsed)
3220 continue;
3221 Value *OpILane = getValue(OpI, Lane);
3222 bool IsConstantOp = isa<Constant>(OpILane);
3223 // Consider the broadcast candidate if:
3224 // 1. Same value is found in one of the operands.
3225 if (Data.V == Op ||
3226 // 2. The operand in the given lane is not constant but there is a
3227 // constant operand in another lane (which can be moved to the
3228 // given lane). In this case we can represent it as a simple
3229 // permutation of constant and broadcast.
3230 (!IsConstantOp &&
3231 ((Lns > 2 && isa<Constant>(Data.V)) ||
3232 // 2.1. If we have only 2 lanes, we need to check that the value in
3233 // the next lane does not build the same opcode sequence.
3234 (Lns == 2 &&
3235 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3236 isa<Constant>(Data.V)))) ||
3237 // 3. The operand in the current lane is loop invariant (can be
3238 // hoisted out) and another operand is also a loop invariant
3239 // (though not a constant). In this case the whole vector can be
3240 // hoisted out.
3241 // FIXME: need to teach the cost model about this case for better
3242 // estimation.
3243 (IsInvariant && !isa<Constant>(Data.V) &&
3244 !getSameOpcode({Op, Data.V}, TLI) &&
3245 L->isLoopInvariant(Data.V))) {
3246 FoundCandidate = true;
3247 Data.IsUsed = Data.V == Op;
3248 if (Data.V == Op)
3249 ++Cnt;
3250 break;
3251 }
3252 }
3253 if (!FoundCandidate)
3254 return false;
3255 }
3256 return getNumLanes() == 2 || Cnt > 1;
3257 }
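// Example for shouldBroadcast(): with 4 lanes and candidate values
// {x, x, 7, x} for the operand being checked, the scan over the other lanes
// finds x twice (Cnt == 2) and a movable constant in the remaining lane, so
// the operand is treated as a broadcast of x mixed with a constant insert.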
3258
3259 /// Checks if there is at least one operand in a lane other than \p Lane
3260 /// that is compatible with the operand \p Op.
3261 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3262 assert(Op == getValue(OpIdx, Lane) &&
3263 "Op is expected to be getValue(OpIdx, Lane).");
3264 bool OpAPO = getData(OpIdx, Lane).APO;
3265 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3266 if (Ln == Lane)
3267 continue;
3268 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3269 const OperandData &Data = getData(OpI, Ln);
3270 if (Data.APO != OpAPO || Data.IsUsed)
3271 return true;
3272 Value *OpILn = getValue(OpI, Ln);
3273 return (L && L->isLoopInvariant(OpILn)) ||
3274 (getSameOpcode({Op, OpILn}, TLI) &&
3275 allSameBlock({Op, OpILn}));
3276 }))
3277 return true;
3278 }
3279 return false;
3280 }
3281
3282 public:
3283 /// Initialize with all the operands of the instruction vector \p RootVL.
3284 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3285 const InstructionsState &S, const BoUpSLP &R)
3286 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3287 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3288 // Append all the operands of RootVL.
3289 appendOperands(RootVL, Operands, S);
3290 }
3291
3292 /// \returns a value vector with the operands across all lanes for the
3293 /// operand at \p OpIdx.
3294 ValueList getVL(unsigned OpIdx) const {
3295 ValueList OpVL(OpsVec[OpIdx].size());
3296 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3297 "Expected same num of lanes across all operands");
3298 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3299 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3300 return OpVL;
3301 }
3302
3303 // Performs operand reordering for 2 or more operands.
3304 // The original operands are in OrigOps[OpIdx][Lane].
3305 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3306 void reorder() {
3307 unsigned NumOperands = getNumOperands();
3308 unsigned NumLanes = getNumLanes();
3309 // Each operand has its own mode. We are using this mode to help us select
3310 // the instructions for each lane, so that they match best with the ones
3311 // we have selected so far.
3312 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3313
3314 // This is a greedy single-pass algorithm. We are going over each lane
3315 // once and deciding on the best order right away with no back-tracking.
3316 // However, in order to increase its effectiveness, we start with the lane
3317 // that has operands that can move the least. For example, given the
3318 // following lanes:
3319 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3320 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3321 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3322 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3323 // we will start at Lane 1, since the operands of the subtraction cannot
3324 // be reordered. Then we will visit the rest of the lanes in a circular
3325 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3326
3327 // Find the first lane that we will start our search from.
3328 unsigned FirstLane = getBestLaneToStartReordering();
3329
3330 // Initialize the modes.
3331 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3332 Value *OpLane0 = getValue(OpIdx, FirstLane);
3333 // Keep track if we have instructions with all the same opcode on one
3334 // side.
3335 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3336 // Check if OpLane0 should be broadcast.
3337 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3338 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3339 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3340 else if (isa<LoadInst>(OpILane0))
3341 ReorderingModes[OpIdx] = ReorderingMode::Load;
3342 else
3343 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3344 } else if (isa<Constant>(OpLane0)) {
3345 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3346 } else if (isa<Argument>(OpLane0)) {
3347 // Our best hope is a Splat. It may save some cost in some cases.
3348 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3349 } else {
3350 llvm_unreachable("Unexpected value kind.");
3351 }
3352 }
3353
3354 // Check that we don't have the same operands. There is no need to
3355 // reorder if the operands are just a perfect or shuffled diamond match.
3356 // Do not skip reordering for possible broadcasts or a non-power-of-2
3357 // number of scalars (just for now).
3358 auto &&SkipReordering = [this]() {
3359 SmallPtrSet<Value *, 4> UniqueValues;
3360 ArrayRef<OperandData> Op0 = OpsVec.front();
3361 for (const OperandData &Data : Op0)
3362 UniqueValues.insert(Data.V);
3363 for (ArrayRef<OperandData> Op :
3364 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3365 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3366 return !UniqueValues.contains(Data.V);
3367 }))
3368 return false;
3369 }
3370 // TODO: Check if we can remove a check for non-power-2 number of
3371 // scalars after full support of non-power-2 vectorization.
3372 return UniqueValues.size() != 2 &&
3373 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3374 UniqueValues.size());
3375 };
3376
3377 // If the initial strategy fails for any of the operand indexes, then we
3378 // perform reordering again in a second pass. This helps avoid assigning
3379 // high priority to the failed strategy, and should improve reordering for
3380 // the non-failed operand indexes.
3381 for (int Pass = 0; Pass != 2; ++Pass) {
3382 // Check if there is no need to reorder operands because they are a
3383 // perfect or shuffled diamond match.
3384 // Need to do it to avoid extra external use cost counting for
3385 // shuffled matches, which may cause regressions.
3386 if (SkipReordering())
3387 break;
3388 // Skip the second pass if the first pass did not fail.
3389 bool StrategyFailed = false;
3390 // Mark all operand data as free to use.
3391 clearUsed();
3392 // We keep the original operand order for the FirstLane, so reorder the
3393 // rest of the lanes. We are visiting the nodes in a circular fashion,
3394 // using FirstLane as the center point and increasing the radius
3395 // distance.
3396 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3397 for (unsigned I = 0; I < NumOperands; ++I)
3398 MainAltOps[I].push_back(getData(I, FirstLane).V);
3399
3400 SmallBitVector UsedLanes(NumLanes);
3401 UsedLanes.set(FirstLane);
3402 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3403 // Visit the lane on the right and then the lane on the left.
3404 for (int Direction : {+1, -1}) {
3405 int Lane = FirstLane + Direction * Distance;
3406 if (Lane < 0 || Lane >= (int)NumLanes)
3407 continue;
3408 UsedLanes.set(Lane);
3409 int LastLane = Lane - Direction;
3410 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3411 "Out of bounds");
3412 // Look for a good match for each operand.
3413 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3414 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3415 std::optional<unsigned> BestIdx =
3416 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3417 MainAltOps[OpIdx], UsedLanes);
3418 // By not selecting a value, we allow the operands that follow to
3419 // select a better matching value. We will get a non-null value in
3420 // the next run of getBestOperand().
3421 if (BestIdx) {
3422 // Swap the current operand with the one returned by
3423 // getBestOperand().
3424 swap(OpIdx, *BestIdx, Lane);
3425 } else {
3426 // Enable the second pass.
3427 StrategyFailed = true;
3428 }
3429 // Try to get the alternate opcode and follow it during analysis.
3430 if (MainAltOps[OpIdx].size() != 2) {
3431 OperandData &AltOp = getData(OpIdx, Lane);
3432 InstructionsState OpS =
3433 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3434 if (OpS && OpS.isAltShuffle())
3435 MainAltOps[OpIdx].push_back(AltOp.V);
3436 }
3437 }
3438 }
3439 }
3440 // Skip second pass if the strategy did not fail.
3441 if (!StrategyFailed)
3442 break;
3443 }
3444 }
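// In the running example above, starting from Lane 1 (operand 0 = C[1],
// operand 1 = B[1]) the intent is that the operands of the commutative
// additions get swapped so that operand 0 collects the C[i] values and
// operand 1 the B[i] values, leaving a vectorizable C[i] column combined with
// an alternating add/sub over the B[i] column.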
3445
3446#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3447 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3448 switch (RMode) {
3449 case ReorderingMode::Load:
3450 return "Load";
3451 case ReorderingMode::Opcode:
3452 return "Opcode";
3453 case ReorderingMode::Constant:
3454 return "Constant";
3455 case ReorderingMode::Splat:
3456 return "Splat";
3457 case ReorderingMode::Failed:
3458 return "Failed";
3459 }
3460 llvm_unreachable("Unimplemented Reordering Type");
3461 }
3462
3463 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3464 raw_ostream &OS) {
3465 return OS << getModeStr(RMode);
3466 }
3467
3468 /// Debug print.
3469 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3470 printMode(RMode, dbgs());
3471 }
3472
3473 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3474 return printMode(RMode, OS);
3475 }
3476
3477 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3478 const unsigned Indent = 2;
3479 unsigned Cnt = 0;
3480 for (const OperandDataVec &OpDataVec : OpsVec) {
3481 OS << "Operand " << Cnt++ << "\n";
3482 for (const OperandData &OpData : OpDataVec) {
3483 OS.indent(Indent) << "{";
3484 if (Value *V = OpData.V)
3485 OS << *V;
3486 else
3487 OS << "null";
3488 OS << ", APO:" << OpData.APO << "}\n";
3489 }
3490 OS << "\n";
3491 }
3492 return OS;
3493 }
3494
3495 /// Debug print.
3496 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3497#endif
3498 };
3499
3500 /// Evaluate each pair in \p Candidates and return the index into \p
3501 /// Candidates of the pair with the highest score, deemed to have the best
3502 /// chance to form the root of a profitable tree to vectorize. Return
3503 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
3504 /// \param Limit The lower limit of the score considered to be good enough.
3505 std::optional<int>
3506 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3507 int Limit = LookAheadHeuristics::ScoreFail) const {
3508 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3509 RootLookAheadMaxDepth);
3510 int BestScore = Limit;
3511 std::optional<int> Index;
3512 for (int I : seq<int>(0, Candidates.size())) {
3513 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3514 Candidates[I].second,
3515 /*U1=*/nullptr, /*U2=*/nullptr,
3516 /*CurrLevel=*/1, {});
3517 if (Score > BestScore) {
3518 BestScore = Score;
3519 Index = I;
3520 }
3521 }
3522 return Index;
3523 }
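// Example for findBestRootPair(): given Candidates = {(a, b), (c, d)}, the
// pair with the highest look-ahead score determines the returned index,
// provided that score is above Limit; std::nullopt means no pair scored above
// Limit.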
3524
3525 /// Checks if the instruction is marked for deletion.
3526 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3527
3528 /// Removes an instruction from its block and eventually deletes it.
3529 /// It's like Instruction::eraseFromParent() except that the actual deletion
3530 /// is delayed until BoUpSLP is destructed.
3531 void eraseInstruction(Instruction *I) {
3532 DeletedInstructions.insert(I);
3533 }
3534
3535 /// Remove instructions from the parent function and clear the operands of
3536 /// the \p DeadVals instructions, marking trivially dead operands for deletion.
3537 template <typename T>
3538 void removeInstructionsAndOperands(
3539 ArrayRef<T *> DeadVals,
3540 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3541 SmallVector<WeakTrackingVH> DeadInsts;
3542 for (T *V : DeadVals) {
3543 auto *I = cast<Instruction>(V);
3544 DeletedInstructions.insert(I);
3545 }
3546 DenseSet<Value *> Processed;
3547 for (T *V : DeadVals) {
3548 if (!V || !Processed.insert(V).second)
3549 continue;
3550 auto *I = cast<Instruction>(V);
3551 salvageDebugInfo(*I);
3552 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3553 for (Use &U : I->operands()) {
3554 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3555 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3556 isInstructionTriviallyDead(OpI, TLI) &&
3557 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3558 return Entry->VectorizedValue == OpI;
3559 })))
3560 DeadInsts.push_back(OpI);
3561 }
3562 I->dropAllReferences();
3563 }
3564 for (T *V : DeadVals) {
3565 auto *I = cast<Instruction>(V);
3566 if (!I->getParent())
3567 continue;
3568 assert((I->use_empty() || all_of(I->uses(),
3569 [&](Use &U) {
3570 return isDeleted(
3571 cast<Instruction>(U.getUser()));
3572 })) &&
3573 "trying to erase instruction with users.");
3574 I->removeFromParent();
3575 SE->forgetValue(I);
3576 }
3577 // Process the dead instruction list until empty.
3578 while (!DeadInsts.empty()) {
3579 Value *V = DeadInsts.pop_back_val();
3580 Instruction *VI = cast_or_null<Instruction>(V);
3581 if (!VI || !VI->getParent())
3582 continue;
3583 assert(isInstructionTriviallyDead(VI, TLI) &&
3584 "Live instruction found in dead worklist!");
3585 assert(VI->use_empty() && "Instructions with uses are not dead.");
3586
3587 // Don't lose the debug info while deleting the instructions.
3588 salvageDebugInfo(*VI);
3589
3590 // Null out all of the instruction's operands to see if any operand
3591 // becomes dead as we go.
3592 for (Use &OpU : VI->operands()) {
3593 Value *OpV = OpU.get();
3594 if (!OpV)
3595 continue;
3596 OpU.set(nullptr);
3597
3598 if (!OpV->use_empty())
3599 continue;
3600
3601 // If the operand is an instruction that became dead as we nulled out
3602 // the operand, and if it is 'trivially' dead, delete it in a future
3603 // loop iteration.
3604 if (auto *OpI = dyn_cast<Instruction>(OpV))
3605 if (!DeletedInstructions.contains(OpI) &&
3606 (!OpI->getType()->isVectorTy() ||
3607 none_of(VectorValuesAndScales,
3608 [&](const std::tuple<Value *, unsigned, bool> &V) {
3609 return std::get<0>(V) == OpI;
3610 })) &&
3611 isInstructionTriviallyDead(OpI, TLI))
3612 DeadInsts.push_back(OpI);
3613 }
3614
3615 VI->removeFromParent();
3616 eraseInstruction(VI);
3617 SE->forgetValue(VI);
3618 }
3619 }
3620
3621 /// Checks if the instruction was already analyzed for being possible
3622 /// reduction root.
3623 bool isAnalyzedReductionRoot(Instruction *I) const {
3624 return AnalyzedReductionsRoots.count(I);
3625 }
3626 /// Register given instruction as already analyzed for being possible
3627 /// reduction root.
3628 void analyzedReductionRoot(Instruction *I) {
3629 AnalyzedReductionsRoots.insert(I);
3630 }
3631 /// Checks if the provided list of reduced values was checked already for
3632 /// vectorization.
3633 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3634 return AnalyzedReductionVals.contains(hash_value(VL));
3635 }
3636 /// Adds the list of reduced values to list of already checked values for the
3637 /// vectorization.
3638 void analyzedReductionVals(ArrayRef<Value *> VL) {
3639 AnalyzedReductionVals.insert(hash_value(VL));
3640 }
3641 /// Clear the list of the analyzed reduction root instructions.
3642 void clearReductionData() {
3643 AnalyzedReductionsRoots.clear();
3644 AnalyzedReductionVals.clear();
3645 AnalyzedMinBWVals.clear();
3646 }
3647 /// Checks if the given value is gathered in one of the nodes.
3648 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3649 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3650 }
3651 /// Checks if the given value is gathered in one of the nodes.
3652 bool isGathered(const Value *V) const {
3653 return MustGather.contains(V);
3654 }
3655 /// Checks if the specified value was not scheduled.
3656 bool isNotScheduled(const Value *V) const {
3657 return NonScheduledFirst.contains(V);
3658 }
3659
3660 /// Check if the value is vectorized in the tree.
3661 bool isVectorized(const Value *V) const {
3662 assert(V && "V cannot be nullptr.");
3663 return ScalarToTreeEntries.contains(V);
3664 }
3665
3666 ~BoUpSLP();
3667
3668private:
3669 /// Determine if a node \p E can be demoted to a smaller type with a
3670 /// truncation. We collect the entries that will be demoted in ToDemote.
3671 /// \param E Node for analysis
3672 /// \param ToDemote indices of the nodes to be demoted.
3673 bool collectValuesToDemote(
3674 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3675 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3676 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3677 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3678
3679 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3680 /// UserTE, which allow reordering (i.e. the operands can be reordered
3681 /// because they have only one user and are reorderable).
3682 /// \param ReorderableGathers List of all gather nodes that require reordering
3683 /// (e.g., gather of extractelements or partially vectorizable loads).
3684 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3685 /// reordering, subset of \p NonVectorized.
3686 void buildReorderableOperands(
3687 TreeEntry *UserTE,
3688 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3689 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3690 SmallVectorImpl<TreeEntry *> &GatherOps);
3691
3692 /// Checks if the given \p TE is a gather node with clustered reused scalars
3693 /// and reorders it per given \p Mask.
3694 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3695
3696 /// Checks if all users of \p I are the part of the vectorization tree.
3697 bool areAllUsersVectorized(
3698 Instruction *I,
3699 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3700
3701 /// Return information about the vector formed for the specified index
3702 /// of a vector of (the same) instruction.
3703 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3704
3705 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3706 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3707 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3708 return const_cast<TreeEntry *>(
3709 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3710 }
3711
3712 /// Gets the root instruction for the given node. If the node is a strided
3713 /// load/store node with the reverse order, the root instruction is the last
3714 /// one.
3715 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3716
3717 /// \returns Cast context for the given graph node.
3718 TargetTransformInfo::CastContextHint
3719 getCastContextHint(const TreeEntry &TE) const;
3720
3721 /// \returns the cost of the vectorizable entry.
3722 InstructionCost getEntryCost(const TreeEntry *E,
3723 ArrayRef<Value *> VectorizedVals,
3724 SmallPtrSetImpl<Value *> &CheckedExtracts);
3725
3726 /// Checks if it is legal and profitable to build SplitVectorize node for the
3727 /// given \p VL.
3728 /// \param Op1 first homogeneous scalars.
3729 /// \param Op2 second homogeneous scalars.
3730 /// \param ReorderIndices indices to reorder the scalars.
3731 /// \returns true if the node was successfully built.
3732 bool canBuildSplitNode(ArrayRef<Value *> VL,
3733 const InstructionsState &LocalState,
3734 SmallVectorImpl<Value *> &Op1,
3735 SmallVectorImpl<Value *> &Op2,
3736 OrdersType &ReorderIndices) const;
3737
3738 /// This is the recursive part of buildTree.
3739 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3740 unsigned InterleaveFactor = 0);
3741
3742 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3743 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3744 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3745 /// returns false, setting \p CurrentOrder to either an empty vector or a
3746 /// non-identity permutation that allows reusing extract instructions.
3747 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3748 /// extract order.
3749 bool canReuseExtract(ArrayRef<Value *> VL,
3750 SmallVectorImpl<unsigned> &CurrentOrder,
3751 bool ResizeAllowed = false) const;
3752
3753 /// Vectorize a single entry in the tree.
3754 Value *vectorizeTree(TreeEntry *E);
3755
3756 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3757 /// \p E.
3758 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3759
3760 /// Create a new vector from a list of scalar values. Produces a sequence
3761 /// which exploits values reused across lanes, and arranges the inserts
3762 /// for ease of later optimization.
3763 template <typename BVTy, typename ResTy, typename... Args>
3764 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3765
3766 /// Create a new vector from a list of scalar values. Produces a sequence
3767 /// which exploits values reused across lanes, and arranges the inserts
3768 /// for ease of later optimization.
3769 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3770
3771 /// Returns the instruction in the bundle, which can be used as a base point
3772 /// for scheduling. Usually it is the last instruction in the bundle, except
3773 /// for the case when all operands are external (in this case, it is the first
3774 /// instruction in the list).
3775 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3776
3777 /// Tries to find extractelement instructions with constant indices from fixed
3778 /// vector type and gather such instructions into a bunch, which highly likely
3779 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3780 /// was successful, the matched scalars are replaced by poison values in \p VL
3781 /// for future analysis.
3782 std::optional<TargetTransformInfo::ShuffleKind>
3783 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3784 SmallVectorImpl<int> &Mask) const;
3785
3786 /// Tries to find extractelement instructions with constant indices from fixed
3787 /// vector type and gather such instructions into a bunch, which highly likely
3788 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3789 /// was successful, the matched scalars are replaced by poison values in \p VL
3790 /// for future analysis.
3791 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3792 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3793 SmallVectorImpl<int> &Mask,
3794 unsigned NumParts) const;
3795
3796 /// Checks if the gathered \p VL can be represented as a single register
3797 /// shuffle(s) of previous tree entries.
3798 /// \param TE Tree entry checked for permutation.
3799 /// \param VL List of scalars (a subset of the TE scalars), checked for
3800 /// permutations. Must form single-register vector.
3801 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3802 /// commands to build the mask using the original vector value, without
3803 /// relying on the potential reordering.
3804 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3805 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3806 std::optional<TargetTransformInfo::ShuffleKind>
3807 isGatherShuffledSingleRegisterEntry(
3808 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3809 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3810 bool ForOrder);
3811
3812 /// Checks if the gathered \p VL can be represented as multi-register
3813 /// shuffle(s) of previous tree entries.
3814 /// \param TE Tree entry checked for permutation.
3815 /// \param VL List of scalars (a subset of the TE scalars), checked for
3816 /// permutations.
3817 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3818 /// commands to build the mask using the original vector value, without
3819 /// relying on the potential reordering.
3820 /// \returns per-register series of ShuffleKind, if gathered values can be
3821 /// represented as shuffles of previous tree entries. \p Mask is filled with
3822 /// the shuffle mask (also on per-register base).
3823 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3824 isGatherShuffledEntry(
3825 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3826 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3827 unsigned NumParts, bool ForOrder = false);
3828
3829 /// \returns the cost of gathering (inserting) the values in \p VL into a
3830 /// vector.
3831 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3832 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3833 Type *ScalarTy) const;
3834
3835 /// Set the Builder insert point to one after the last instruction in
3836 /// the bundle
3837 void setInsertPointAfterBundle(const TreeEntry *E);
3838
3839 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3840 /// specified, the starting vector value is poison.
3841 Value *
3842 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3843 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3844
3845 /// \returns whether the VectorizableTree is fully vectorizable and will
3846 /// be beneficial even the tree height is tiny.
3847 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3848
3849 /// Run through the list of all gathered loads in the graph and try to find
3850 /// vector loads/masked gathers instead of regular gathers. Later these loads
3851 /// are reshuffled to build final gathered nodes.
3852 void tryToVectorizeGatheredLoads(
3853 const SmallMapVector<
3854 std::tuple<BasicBlock *, Value *, Type *>,
3855 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3856 &GatheredLoads);
3857
3858 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3859 /// users of \p TE and collects the stores. It returns the map from the store
3860 /// pointers to the collected stores.
3861 DenseMap<Value *, SmallVector<StoreInst *>>
3862 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3863
3864 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3865 /// stores in \p StoresVec can form a vector instruction. If so it returns
3866 /// true and populates \p ReorderIndices with the shuffle indices of the
3867 /// stores when compared to the sorted vector.
3868 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3869 OrdersType &ReorderIndices) const;
3870
3871 /// Iterates through the users of \p TE, looking for scalar stores that can be
3872 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3873 /// their order and builds an order index vector for each store bundle. It
3874 /// returns all these order vectors found.
3875 /// We run this after the tree has formed, otherwise we may come across user
3876 /// instructions that are not yet in the tree.
3877 SmallVector<OrdersType, 1>
3878 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3879
3880 /// Tries to reorder the gathering node for better vectorization
3881 /// opportunities.
3882 void reorderGatherNode(TreeEntry &TE);
3883
3884 class TreeEntry {
3885 public:
3886 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3887 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3888
3889 /// \returns Common mask for reorder indices and reused scalars.
3890 SmallVector<int> getCommonMask() const {
3891 if (State == TreeEntry::SplitVectorize)
3892 return {};
3893 SmallVector<int> Mask;
3894 inversePermutation(ReorderIndices, Mask);
3895 ::addMask(Mask, ReuseShuffleIndices);
3896 return Mask;
3897 }
3898
3899 /// \returns The mask for split nodes.
3900 SmallVector<int> getSplitMask() const {
3901 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3902 "Expected only split vectorize node.");
3903 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3904 unsigned CommonVF = std::max<unsigned>(
3905 CombinedEntriesWithIndices.back().second,
3906 Scalars.size() - CombinedEntriesWithIndices.back().second);
3907 for (auto [Idx, I] : enumerate(ReorderIndices))
3908 Mask[I] =
3909 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3910 ? CommonVF - CombinedEntriesWithIndices.back().second
3911 : 0);
3912 return Mask;
3913 }
3914
3915 /// Updates (reorders) SplitVectorize node according to the given mask \p
3916 /// Mask and order \p MaskOrder.
3917 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3918 ArrayRef<int> MaskOrder);
3919
3920 /// \returns true if the scalars in VL are equal to this entry.
3921 bool isSame(ArrayRef<Value *> VL) const {
3922 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3923 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3924 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3925 return VL.size() == Mask.size() &&
3926 std::equal(VL.begin(), VL.end(), Mask.begin(),
3927 [Scalars](Value *V, int Idx) {
3928 return (isa<UndefValue>(V) &&
3929 Idx == PoisonMaskElem) ||
3930 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3931 });
3932 };
3933 if (!ReorderIndices.empty()) {
3934 // TODO: implement matching if the nodes are just reordered, still can
3935 // treat the vector as the same if the list of scalars matches VL
3936 // directly, without reordering.
3937 SmallVector<int> Mask;
3938 inversePermutation(ReorderIndices, Mask);
3939 if (VL.size() == Scalars.size())
3940 return IsSame(Scalars, Mask);
3941 if (VL.size() == ReuseShuffleIndices.size()) {
3942 ::addMask(Mask, ReuseShuffleIndices);
3943 return IsSame(Scalars, Mask);
3944 }
3945 return false;
3946 }
3947 return IsSame(Scalars, ReuseShuffleIndices);
3948 }
3949
3950 /// \returns true if current entry has same operands as \p TE.
3951 bool hasEqualOperands(const TreeEntry &TE) const {
3952 if (TE.getNumOperands() != getNumOperands())
3953 return false;
3954 SmallBitVector Used(getNumOperands());
3955 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3956 unsigned PrevCount = Used.count();
3957 for (unsigned K = 0; K < E; ++K) {
3958 if (Used.test(K))
3959 continue;
3960 if (getOperand(K) == TE.getOperand(I)) {
3961 Used.set(K);
3962 break;
3963 }
3964 }
3965 // Check if we actually found the matching operand.
3966 if (PrevCount == Used.count())
3967 return false;
3968 }
3969 return true;
3970 }
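// Note that hasEqualOperands() is order-insensitive: an entry with operands
// {X, Y} has equal operands to a \p TE with operands {Y, X}, since each
// operand list of \p TE is matched against any not-yet-used operand list of
// this entry.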
3971
3972 /// \return Final vectorization factor for the node. Defined by the total
3973 /// number of vectorized scalars, including those used several times in the
3974 /// entry and counted in the \a ReuseShuffleIndices, if any.
3975 unsigned getVectorFactor() const {
3976 if (!ReuseShuffleIndices.empty())
3977 return ReuseShuffleIndices.size();
3978 return Scalars.size();
3979 };
3980
3981 /// Checks if the current node is a gather node.
3982 bool isGather() const { return State == NeedToGather; }
3983
3984 /// A vector of scalars.
3985 ValueList Scalars;
3986
3987 /// The Scalars are vectorized into this value. It is initialized to Null.
3988 WeakTrackingVH VectorizedValue = nullptr;
3989
3990 /// Do we need to gather this sequence or vectorize it
3991 /// (either with vector instruction or with scatter/gather
3992 /// intrinsics for store/load)?
3993 enum EntryState {
3994 Vectorize, ///< The node is regularly vectorized.
3995 ScatterVectorize, ///< Masked scatter/gather node.
3996 StridedVectorize, ///< Strided loads (and stores)
3997 CompressVectorize, ///< (Masked) load with compress.
3998 NeedToGather, ///< Gather/buildvector node.
3999 CombinedVectorize, ///< Vectorized node, combined with its user into more
4000 ///< complex node like select/cmp to minmax, mul/add to
4001 ///< fma, etc. Must be used for the following nodes in
4002 ///< the pattern, not the very first one.
4003 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4004 ///< independently and then combines back.
4005 };
4006 EntryState State;
4007
4008 /// List of combined opcodes supported by the vectorizer.
4009 enum CombinedOpcode {
4010 NotCombinedOp = -1,
4011 MinMax = Instruction::OtherOpsEnd + 1,
4012 FMulAdd,
4013 };
4014 CombinedOpcode CombinedOp = NotCombinedOp;
4015
4016 /// Does this sequence require some shuffling?
4017 SmallVector<int, 4> ReuseShuffleIndices;
4018
4019 /// Does this entry require reordering?
4020 SmallVector<unsigned, 4> ReorderIndices;
4021
4022 /// Points back to the VectorizableTree.
4023 ///
4024 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4025 /// to be a pointer and needs to be able to initialize the child iterator.
4026 /// Thus we need a reference back to the container to translate the indices
4027 /// to entries.
4028 VecTreeTy &Container;
4029
4030 /// The TreeEntry index containing the user of this entry.
4031 EdgeInfo UserTreeIndex;
4032
4033 /// The index of this treeEntry in VectorizableTree.
4034 unsigned Idx = 0;
4035
4036 /// For gather/buildvector/alt opcode nodes, which are combined from
4037 /// other nodes as a series of insertvector instructions.
4038 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4039
4040 private:
4041 /// The operands of each instruction in each lane Operands[op_index][lane].
4042 /// Note: This helps avoid the replication of the code that performs the
4043 /// reordering of operands during buildTreeRec() and vectorizeTree().
4044 SmallVector<ValueList, 2> Operands;
4045
4046 /// Copyable elements of the entry node.
4047 SmallPtrSet<const Value *, 4> CopyableElements;
4048
4049 /// MainOp and AltOp are recorded inside. S should be obtained from
4050 /// newTreeEntry.
4051 InstructionsState S = InstructionsState::invalid();
4052
4053 /// Interleaving factor for interleaved loads Vectorize nodes.
4054 unsigned InterleaveFactor = 0;
4055
4056 /// True if the node does not require scheduling.
4057 bool DoesNotNeedToSchedule = false;
4058
4059 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4060 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4061 if (Operands.size() < OpIdx + 1)
4062 Operands.resize(OpIdx + 1);
4063 assert(Operands[OpIdx].empty() && "Already resized?");
4064 assert(OpVL.size() <= Scalars.size() &&
4065 "Number of operands is greater than the number of scalars.");
4066 Operands[OpIdx].resize(OpVL.size());
4067 copy(OpVL, Operands[OpIdx].begin());
4068 }
4069
4070 public:
4071 /// Returns interleave factor for interleave nodes.
4072 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4073 /// Sets interleaving factor for the interleaving nodes.
4074 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4075
4076 /// Marks the node as one that does not require scheduling.
4077 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4078 /// Returns true if the node is marked as one that does not require
4079 /// scheduling.
4080 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4081
4082 /// Set this bundle's operands from \p Operands.
4083 void setOperands(ArrayRef<ValueList> Operands) {
4084 for (unsigned I : seq<unsigned>(Operands.size()))
4085 setOperand(I, Operands[I]);
4086 }
4087
4088 /// Reorders operands of the node to the given mask \p Mask.
4089 void reorderOperands(ArrayRef<int> Mask) {
4090 for (ValueList &Operand : Operands)
4091 reorderScalars(Operand, Mask);
4092 }
4093
4094 /// \returns the \p OpIdx operand of this TreeEntry.
4095 ValueList &getOperand(unsigned OpIdx) {
4096 assert(OpIdx < Operands.size() && "Off bounds");
4097 return Operands[OpIdx];
4098 }
4099
4100 /// \returns the \p OpIdx operand of this TreeEntry.
4101 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4102 assert(OpIdx < Operands.size() && "Off bounds");
4103 return Operands[OpIdx];
4104 }
4105
4106 /// \returns the number of operands.
4107 unsigned getNumOperands() const { return Operands.size(); }
4108
4109 /// \return the single \p OpIdx operand.
4110 Value *getSingleOperand(unsigned OpIdx) const {
4111 assert(OpIdx < Operands.size() && "Off bounds");
4112 assert(!Operands[OpIdx].empty() && "No operand available");
4113 return Operands[OpIdx][0];
4114 }
4115
4116 /// Some of the instructions in the list have alternate opcodes.
4117 bool isAltShuffle() const { return S.isAltShuffle(); }
4118
4119 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4120 return S.getMatchingMainOpOrAltOp(I);
4121 }
4122
4123 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4124 /// alternate) opcode as the main operation, the key is \p Op. Otherwise
4125 /// the key is the main operation.
4126 Value *isOneOf(Value *Op) const {
4127 auto *I = dyn_cast<Instruction>(Op);
4128 if (I && getMatchingMainOpOrAltOp(I))
4129 return Op;
4130 return S.getMainOp();
4131 }
4132
4133 void setOperations(const InstructionsState &S) {
4134 assert(S && "InstructionsState is invalid.");
4135 this->S = S;
4136 }
4137
4138 Instruction *getMainOp() const { return S.getMainOp(); }
4139
4140 Instruction *getAltOp() const { return S.getAltOp(); }
4141
4142 /// The main/alternate opcodes for the list of instructions.
4143 unsigned getOpcode() const { return S.getOpcode(); }
4144
4145 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4146
4147 bool hasState() const { return S.valid(); }
4148
4149 /// Add \p V to the list of copyable elements.
4150 void addCopyableElement(Value *V) {
4151 assert(S.isCopyableElement(V) && "Not a copyable element.");
4152 CopyableElements.insert(V);
4153 }
4154
4155 /// Returns true if \p V is a copyable element.
4156 bool isCopyableElement(Value *V) const {
4157 return CopyableElements.contains(V);
4158 }
4159
4160 /// Returns true if any scalar in the list is a copyable element.
4161 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4162
4163 /// Returns the state of the operations.
4164 const InstructionsState &getOperations() const { return S; }
4165
4166 /// When ReuseShuffleIndices is empty it just returns the position of \p V
4167 /// within the vector of Scalars. Otherwise, tries to remap via the reuse index.
4168 unsigned findLaneForValue(Value *V) const {
4169 unsigned FoundLane = getVectorFactor();
4170 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4171 std::advance(It, 1)) {
4172 if (*It != V)
4173 continue;
4174 FoundLane = std::distance(Scalars.begin(), It);
4175 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4176 if (!ReorderIndices.empty())
4177 FoundLane = ReorderIndices[FoundLane];
4178 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4179 if (ReuseShuffleIndices.empty())
4180 break;
4181 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4182 RIt != ReuseShuffleIndices.end()) {
4183 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4184 break;
4185 }
4186 }
4187 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4188 return FoundLane;
4189 }
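// Example for findLaneForValue(): with Scalars = {a, b}, empty ReorderIndices
// and ReuseShuffleIndices = {1, 0, 1, 0}, findLaneForValue(a) finds position 0
// within Scalars and remaps it to vector lane 1, the first position where
// ReuseShuffleIndices refers to index 0.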
4190
4191 /// Build a shuffle mask for graph entry which represents a merge of main
4192 /// and alternate operations.
4193 void
4194 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4195 SmallVectorImpl<int> &Mask,
4196 SmallVectorImpl<Value *> *OpScalars = nullptr,
4197 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4198
4199 /// Return true if this is a non-power-of-2 node.
4200 bool isNonPowOf2Vec() const {
4201 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4202 return IsNonPowerOf2;
4203 }
4204
4205 /// Return true if this is a node, which tries to vectorize number of
4206 /// elements, forming whole vectors.
4207 bool
4208 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4209 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4210 TTI, getValueType(Scalars.front()), Scalars.size());
4211 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4212 "Reshuffling not supported with non-power-of-2 vectors yet.");
4213 return IsNonPowerOf2;
4214 }
4215
4216 Value *getOrdered(unsigned Idx) const {
4217 assert(isGather() && "Must be used only for buildvectors/gathers.");
4218 if (ReorderIndices.empty())
4219 return Scalars[Idx];
4220 SmallVector<int> Mask;
4221 inversePermutation(ReorderIndices, Mask);
4222 return Scalars[Mask[Idx]];
4223 }
4224
4225#ifndef NDEBUG
4226 /// Debug printer.
4227 LLVM_DUMP_METHOD void dump() const {
4228 dbgs() << Idx << ".\n";
4229 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4230 dbgs() << "Operand " << OpI << ":\n";
4231 for (const Value *V : Operands[OpI])
4232 dbgs().indent(2) << *V << "\n";
4233 }
4234 dbgs() << "Scalars: \n";
4235 for (Value *V : Scalars)
4236 dbgs().indent(2) << *V << "\n";
4237 dbgs() << "State: ";
4238 if (S && hasCopyableElements())
4239 dbgs() << "[[Copyable]] ";
4240 switch (State) {
4241 case Vectorize:
4242 if (InterleaveFactor > 0) {
4243 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4244 << "\n";
4245 } else {
4246 dbgs() << "Vectorize\n";
4247 }
4248 break;
4249 case ScatterVectorize:
4250 dbgs() << "ScatterVectorize\n";
4251 break;
4252 case StridedVectorize:
4253 dbgs() << "StridedVectorize\n";
4254 break;
4255 case CompressVectorize:
4256 dbgs() << "CompressVectorize\n";
4257 break;
4258 case NeedToGather:
4259 dbgs() << "NeedToGather\n";
4260 break;
4261 case CombinedVectorize:
4262 dbgs() << "CombinedVectorize\n";
4263 break;
4264 case SplitVectorize:
4265 dbgs() << "SplitVectorize\n";
4266 break;
4267 }
4268 if (S) {
4269 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4270 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4271 } else {
4272 dbgs() << "MainOp: NULL\n";
4273 dbgs() << "AltOp: NULL\n";
4274 }
4275 dbgs() << "VectorizedValue: ";
4276 if (VectorizedValue)
4277 dbgs() << *VectorizedValue << "\n";
4278 else
4279 dbgs() << "NULL\n";
4280 dbgs() << "ReuseShuffleIndices: ";
4281 if (ReuseShuffleIndices.empty())
4282 dbgs() << "Empty";
4283 else
4284 for (int ReuseIdx : ReuseShuffleIndices)
4285 dbgs() << ReuseIdx << ", ";
4286 dbgs() << "\n";
4287 dbgs() << "ReorderIndices: ";
4288 for (unsigned ReorderIdx : ReorderIndices)
4289 dbgs() << ReorderIdx << ", ";
4290 dbgs() << "\n";
4291 dbgs() << "UserTreeIndex: ";
4292 if (UserTreeIndex)
4293 dbgs() << UserTreeIndex;
4294 else
4295 dbgs() << "<invalid>";
4296 dbgs() << "\n";
4297 if (!CombinedEntriesWithIndices.empty()) {
4298 dbgs() << "Combined entries: ";
4299 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4300 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4301 });
4302 dbgs() << "\n";
4303 }
4304 }
4305#endif
4306 };
4307
4308#ifndef NDEBUG
4309 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4310 InstructionCost VecCost, InstructionCost ScalarCost,
4311 StringRef Banner) const {
4312 dbgs() << "SLP: " << Banner << ":\n";
4313 E->dump();
4314 dbgs() << "SLP: Costs:\n";
4315 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4316 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4317 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4318 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4319 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4320 }
4321#endif
4322
4323 /// Create a new gather TreeEntry
4324 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4325 const InstructionsState &S,
4326 const EdgeInfo &UserTreeIdx,
4327 ArrayRef<int> ReuseShuffleIndices = {}) {
4328 auto Invalid = ScheduleBundle::invalid();
4329 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4330 }
4331
4332 /// Create a new VectorizableTree entry.
4333 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4334 const InstructionsState &S,
4335 const EdgeInfo &UserTreeIdx,
4336 ArrayRef<int> ReuseShuffleIndices = {},
4337 ArrayRef<unsigned> ReorderIndices = {},
4338 unsigned InterleaveFactor = 0) {
4339 TreeEntry::EntryState EntryState =
4340 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4341 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4342 ReuseShuffleIndices, ReorderIndices);
4343 if (E && InterleaveFactor > 0)
4344 E->setInterleave(InterleaveFactor);
4345 return E;
4346 }
4347
4348 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4349 TreeEntry::EntryState EntryState,
4350 ScheduleBundle &Bundle, const InstructionsState &S,
4351 const EdgeInfo &UserTreeIdx,
4352 ArrayRef<int> ReuseShuffleIndices = {},
4353 ArrayRef<unsigned> ReorderIndices = {}) {
4354 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4355 EntryState == TreeEntry::SplitVectorize)) ||
4356 (Bundle && EntryState != TreeEntry::NeedToGather &&
4357 EntryState != TreeEntry::SplitVectorize)) &&
4358 "Need to vectorize gather entry?");
4359 // Gathered loads still gathered? Do not create entry, use the original one.
4360 if (GatheredLoadsEntriesFirst.has_value() &&
4361 EntryState == TreeEntry::NeedToGather && S &&
4362 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4363 !UserTreeIdx.UserTE)
4364 return nullptr;
4365 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4366 TreeEntry *Last = VectorizableTree.back().get();
4367 Last->Idx = VectorizableTree.size() - 1;
4368 Last->State = EntryState;
4369 if (UserTreeIdx.UserTE)
4370 OperandsToTreeEntry.try_emplace(
4371 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4372 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4373 // for non-power-of-two vectors.
4374 assert(
4375 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4376 ReuseShuffleIndices.empty()) &&
4377 "Reshuffling scalars not yet supported for nodes with padding");
4378 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4379 ReuseShuffleIndices.end());
4380 if (ReorderIndices.empty()) {
4381 Last->Scalars.assign(VL.begin(), VL.end());
4382 if (S)
4383 Last->setOperations(S);
4384 } else {
4385 // Reorder scalars and build final mask.
4386 Last->Scalars.assign(VL.size(), nullptr);
4387 transform(ReorderIndices, Last->Scalars.begin(),
4388 [VL](unsigned Idx) -> Value * {
4389 if (Idx >= VL.size())
4390 return UndefValue::get(VL.front()->getType());
4391 return VL[Idx];
4392 });
4393 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4394 if (S)
4395 Last->setOperations(S);
4396 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4397 }
4398 if (EntryState == TreeEntry::SplitVectorize) {
4399 assert(S && "Split nodes must have operations.");
4400 Last->setOperations(S);
4401 SmallPtrSet<Value *, 4> Processed;
4402 for (Value *V : VL) {
4403 auto *I = dyn_cast<Instruction>(V);
4404 if (!I)
4405 continue;
4406 auto It = ScalarsInSplitNodes.find(V);
4407 if (It == ScalarsInSplitNodes.end()) {
4408 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4409 (void)Processed.insert(V);
4410 } else if (Processed.insert(V).second) {
4411 assert(!is_contained(It->getSecond(), Last) &&
4412 "Value already associated with the node.");
4413 It->getSecond().push_back(Last);
4414 }
4415 }
4416 } else if (!Last->isGather()) {
4417 if (isa<PHINode>(S.getMainOp()) ||
4418 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4419 (!S.areInstructionsWithCopyableElements() &&
4420 doesNotNeedToSchedule(VL)) ||
4421 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4422 Last->setDoesNotNeedToSchedule();
4423 SmallPtrSet<Value *, 4> Processed;
4424 for (Value *V : VL) {
4425 if (isa<PoisonValue>(V))
4426 continue;
4427 if (S.isCopyableElement(V)) {
4428 Last->addCopyableElement(V);
4429 continue;
4430 }
4431 auto It = ScalarToTreeEntries.find(V);
4432 if (It == ScalarToTreeEntries.end()) {
4433 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4434 (void)Processed.insert(V);
4435 } else if (Processed.insert(V).second) {
4436 assert(!is_contained(It->getSecond(), Last) &&
4437 "Value already associated with the node.");
4438 It->getSecond().push_back(Last);
4439 }
4440 }
4441 // Update the scheduler bundle to point to this TreeEntry.
4442 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4443 "Bundle and VL out of sync");
4444 if (!Bundle.getBundle().empty()) {
4445#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4446 auto *BundleMember = Bundle.getBundle().begin();
4447 SmallPtrSet<Value *, 4> Processed;
4448 for (Value *V : VL) {
4449 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4450 continue;
4451 ++BundleMember;
4452 }
4453 assert(BundleMember == Bundle.getBundle().end() &&
4454 "Bundle and VL out of sync");
4455#endif
4456 Bundle.setTreeEntry(Last);
4457 }
4458 } else {
4459 // Build a map for gathered scalars to the nodes where they are used.
4460 bool AllConstsOrCasts = true;
4461 for (Value *V : VL) {
4462 if (S && S.areInstructionsWithCopyableElements() &&
4463 S.isCopyableElement(V))
4464 Last->addCopyableElement(V);
4465 if (!isConstant(V)) {
4466 auto *I = dyn_cast<CastInst>(V);
4467 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4468 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4469 !UserTreeIdx.UserTE->isGather())
4470 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4471 }
4472 }
4473 if (AllConstsOrCasts)
4474 CastMaxMinBWSizes =
4475 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4476 MustGather.insert_range(VL);
4477 }
4478
4479 if (UserTreeIdx.UserTE)
4480 Last->UserTreeIndex = UserTreeIdx;
4481 return Last;
4482 }
4483
4484 /// -- Vectorization State --
4485 /// Holds all of the tree entries.
4486 TreeEntry::VecTreeTy VectorizableTree;
4487
4488#ifndef NDEBUG
4489 /// Debug printer.
4490 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4491 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4492 VectorizableTree[Id]->dump();
4493 dbgs() << "\n";
4494 }
4495 }
4496#endif
4497
4498 /// Get list of vector entries, associated with the value \p V.
4499 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4500 assert(V && "V cannot be nullptr.");
4501 auto It = ScalarToTreeEntries.find(V);
4502 if (It == ScalarToTreeEntries.end())
4503 return {};
4504 return It->getSecond();
4505 }
4506
4507 /// Get list of split vector entries, associated with the value \p V.
4508 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4509 assert(V && "V cannot be nullptr.");
4510 auto It = ScalarsInSplitNodes.find(V);
4511 if (It == ScalarsInSplitNodes.end())
4512 return {};
4513 return It->getSecond();
4514 }
4515
4516 /// Returns first vector node for value \p V, matching values \p VL.
4517 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4518 bool SameVF = false) const {
4519 assert(V && "V cannot be nullptr.");
4520 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4521 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4522 return TE;
4523 return nullptr;
4524 }
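// Illustrative usage sketch (not part of the original source; VL is a
// hypothetical list of scalars):
//   if (TreeEntry *TE = getSameValuesTreeEntry(VL.front(), VL,
//                                              /*SameVF=*/true))
//     // Reuse the existing vector node instead of building a new one.
//     return TE;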
4525
4526 /// Checks that the operand node of the alternate node does not generate
4527 /// a buildvector sequence. If it does, it is probably not worth building an
4528 /// alternate shuffle, if the number of buildvector operands plus the
4529 /// alternate instruction exceeds the number of buildvector instructions.
4530 /// \param S the instructions state of the analyzed values.
4531 /// \param VL list of the instructions with alternate opcodes.
4532 bool areAltOperandsProfitable(const InstructionsState &S,
4533 ArrayRef<Value *> VL) const;
4534
4535 /// Contains all the outputs of legality analysis for a list of values to
4536 /// vectorize.
4537 class ScalarsVectorizationLegality {
4538 InstructionsState S;
4539 bool IsLegal;
4540 bool TryToFindDuplicates;
4541 bool TrySplitVectorize;
4542
4543 public:
4544 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4545 bool TryToFindDuplicates = true,
4546 bool TrySplitVectorize = false)
4547 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4548 TrySplitVectorize(TrySplitVectorize) {
4549 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4550 "Inconsistent state");
4551 }
4552 const InstructionsState &getInstructionsState() const { return S; };
4553 bool isLegal() const { return IsLegal; }
4554 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4555 bool trySplitVectorize() const { return TrySplitVectorize; }
4556 };
4557
4558 /// Checks if the specified list of the instructions/values can be vectorized
4559 /// in general.
4560 ScalarsVectorizationLegality
4561 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4562 const EdgeInfo &UserTreeIdx,
4563 bool TryCopyableElementsVectorization) const;
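// Illustrative usage sketch (not part of the original source): a caller is
// expected to consult the legality result before creating a tree node, e.g.:
//   ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
//       VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
//   if (!Legality.isLegal()) {
//     if (Legality.trySplitVectorize())
//       ; // Try to build a split node instead.
//     else if (Legality.tryToFindDuplicates())
//       ; // Gather, possibly after de-duplicating the scalars.
//   }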
4564
4565 /// Checks if the specified list of the instructions/values can be vectorized
4566 /// and fills required data before actual scheduling of the instructions.
4567 TreeEntry::EntryState getScalarsVectorizationState(
4568 const InstructionsState &S, ArrayRef<Value *> VL,
4569 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4570 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4571
4572 /// Maps a specific scalar to its tree entry(ies).
4573 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4574
4575 /// Maps the operand index and entry to the corresponding tree entry.
4576 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4577 OperandsToTreeEntry;
4578
4579 /// Scalars, used in split vectorize nodes.
4580 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4581
4582 /// Maps a value to the proposed vectorizable size.
4583 SmallDenseMap<Value *, unsigned> InstrElementSize;
4584
4585 /// A list of scalars that we found that we need to keep as scalars.
4586 ValueSet MustGather;
4587
4588 /// A set of first non-schedulable values.
4589 ValueSet NonScheduledFirst;
4590
4591 /// A map between the vectorized entries and the last instructions in the
4592 /// bundles. The bundles are built in use order, not in the def order of the
4593 /// instructions, so we cannot rely directly on the last instruction in the
4594 /// bundle being the last instruction in program order during the
4595 /// vectorization process: the basic blocks are modified, so these
4596 /// instructions need to be pre-gathered beforehand.
4597 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4598
4599 /// List of gather nodes, depending on other gather/vector nodes, which should
4600 /// be emitted after the vector instruction emission process to correctly
4601 /// handle order of the vector instructions and shuffles.
4602 SetVector<const TreeEntry *> PostponedGathers;
4603
4604 using ValueToGatherNodesMap =
4605 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4606 ValueToGatherNodesMap ValueToGatherNodes;
4607
4608 /// A list of the load entries (node indices) which can be vectorized using
4609 /// a strided or masked gather approach, but which we attempt to represent as
4610 /// contiguous loads.
4611 SetVector<unsigned> LoadEntriesToVectorize;
4612
4613 /// true if graph nodes transforming mode is on.
4614 bool IsGraphTransformMode = false;
4615
4616 /// The index of the first gathered load entry in the VectorizableTree.
4617 std::optional<unsigned> GatheredLoadsEntriesFirst;
4618
4619 /// Maps compress entries to their mask data for the final codegen.
4620 SmallDenseMap<const TreeEntry *,
4621 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4622 CompressEntryToData;
4623
4624 /// This POD struct describes one external user in the vectorized tree.
4625 struct ExternalUser {
4626 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4627 : Scalar(S), User(U), E(E), Lane(L) {}
4628
4629 /// Which scalar in our function.
4630 Value *Scalar = nullptr;
4631
4632 /// Which user that uses the scalar.
4633 llvm::User *User = nullptr;
4634
4635 /// Vector node, the value is part of.
4636 const TreeEntry &E;
4637
4638 /// Which lane does the scalar belong to.
4639 unsigned Lane;
4640 };
4641 using UserList = SmallVector<ExternalUser, 16>;
4642
4643 /// Checks if two instructions may access the same memory.
4644 ///
4645 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4646 /// is invariant in the calling loop.
4647 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4648 Instruction *Inst2) {
4649 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4650 // First check if the result is already in the cache.
4651 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4652 auto Res = AliasCache.try_emplace(Key);
4653 if (!Res.second)
4654 return Res.first->second;
4655 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4656 // Store the result in the cache.
4657 Res.first->getSecond() = Aliased;
4658 return Aliased;
4659 }
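// Illustrative sketch (not part of the original source; Store1/Store2 are
// hypothetical store instructions): because the result is keyed by the
// (Inst1, Inst2) pair, repeated queries hit BatchAA only once.
//   MemoryLocation Loc1 = MemoryLocation::get(Store1);
//   bool MayAlias = isAliased(Loc1, Store1, Store2); // queries BatchAA
//   bool Again = isAliased(Loc1, Store1, Store2);    // served from AliasCache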
4660
4661 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4662
4663 /// Cache for alias results.
4664 /// TODO: consider moving this to the AliasAnalysis itself.
4665 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4666
4667 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4668 // globally through SLP because we don't perform any action which
4669 // invalidates capture results.
4670 BatchAAResults BatchAA;
4671
4672 /// Temporary store for deleted instructions. Instructions will be deleted
4673 /// eventually when the BoUpSLP is destructed. The deferral is required to
4674 /// ensure that there are no incorrect collisions in the AliasCache, which
4675 /// can happen if a new instruction is allocated at the same address as a
4676 /// previously deleted instruction.
4677 DenseSet<Instruction *> DeletedInstructions;
4678
4679 /// Set of the instructions already analyzed for reductions.
4680 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4681
4682 /// Set of hashes for the list of reduction values already being analyzed.
4683 DenseSet<size_t> AnalyzedReductionVals;
4684
4685 /// Values that have already been analyzed for minimal bitwidth and found to
4686 /// be non-profitable.
4687 DenseSet<Value *> AnalyzedMinBWVals;
4688
4689 /// A list of values that need to be extracted out of the tree.
4690 /// This list holds pairs of (Internal Scalar : External User). External User
4691 /// can be nullptr, which means that this Internal Scalar will be used later,
4692 /// after vectorization.
4693 UserList ExternalUses;
4694
4695 /// A list of GEPs which can be replaced by scalar GEPs instead of
4696 /// extractelement instructions.
4697 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4698
4699 /// A list of scalars to be extracted without a specific user because of too
4700 /// many uses.
4701 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4702
4703 /// Values used only by @llvm.assume calls.
4704 SmallPtrSet<const Value *, 32> EphValues;
4705
4706 /// Holds all of the instructions that we gathered, shuffle instructions and
4707 /// extractelements.
4708 SetVector<Instruction *> GatherShuffleExtractSeq;
4709
4710 /// A list of blocks that we are going to CSE.
4711 DenseSet<BasicBlock *> CSEBlocks;
4712
4713 /// List of hashes of vectors of loads, which are known to be non-vectorizable.
4714 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4715
4716 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4717 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4718 /// single instruction, while ScheduleBundle represents a batch of
4719 /// instructions that are going to be grouped together. ScheduleCopyableData
4720 /// models an extra user for "copyable" instructions.
4721 class ScheduleEntity {
4722 friend class ScheduleBundle;
4723 friend class ScheduleData;
4724 friend class ScheduleCopyableData;
4725
4726 protected:
4727 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4728 Kind getKind() const { return K; }
4729 ScheduleEntity(Kind K) : K(K) {}
4730
4731 private:
4732 /// Used for getting a "good" final ordering of instructions.
4733 int SchedulingPriority = 0;
4734 /// True if this instruction (or bundle) is scheduled (or considered as
4735 /// scheduled in the dry-run).
4736 bool IsScheduled = false;
4737 /// The kind of the ScheduleEntity.
4738 const Kind K = Kind::ScheduleData;
4739
4740 public:
4741 ScheduleEntity() = delete;
4742 /// Gets/sets the scheduling priority.
4743 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4744 int getSchedulingPriority() const { return SchedulingPriority; }
4745 bool isReady() const {
4746 if (const auto *SD = dyn_cast<ScheduleData>(this))
4747 return SD->isReady();
4748 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4749 return CD->isReady();
4750 return cast<ScheduleBundle>(this)->isReady();
4751 }
4752 /// Returns true if the dependency information has been calculated.
4753 /// Note that dependency validity can vary between instructions within
4754 /// a single bundle.
4755 bool hasValidDependencies() const {
4756 if (const auto *SD = dyn_cast<ScheduleData>(this))
4757 return SD->hasValidDependencies();
4758 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4759 return CD->hasValidDependencies();
4760 return cast<ScheduleBundle>(this)->hasValidDependencies();
4761 }
4762 /// Gets the number of unscheduled dependencies.
4763 int getUnscheduledDeps() const {
4764 if (const auto *SD = dyn_cast<ScheduleData>(this))
4765 return SD->getUnscheduledDeps();
4766 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4767 return CD->getUnscheduledDeps();
4768 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4769 }
4770 /// Increments the number of unscheduled dependencies.
4771 int incrementUnscheduledDeps(int Incr) {
4772 if (auto *SD = dyn_cast<ScheduleData>(this))
4773 return SD->incrementUnscheduledDeps(Incr);
4774 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4775 }
4776 /// Gets the number of dependencies.
4777 int getDependencies() const {
4778 if (const auto *SD = dyn_cast<ScheduleData>(this))
4779 return SD->getDependencies();
4780 return cast<ScheduleCopyableData>(this)->getDependencies();
4781 }
4782 /// Gets the instruction.
4783 Instruction *getInst() const {
4784 if (const auto *SD = dyn_cast<ScheduleData>(this))
4785 return SD->getInst();
4786 return cast<ScheduleCopyableData>(this)->getInst();
4787 }
4788
4789 /// Gets/sets if the bundle is scheduled.
4790 bool isScheduled() const { return IsScheduled; }
4791 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4792
4793 static bool classof(const ScheduleEntity *) { return true; }
4794
4795#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4796 void dump(raw_ostream &OS) const {
4797 if (const auto *SD = dyn_cast<ScheduleData>(this))
4798 return SD->dump(OS);
4799 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4800 return CD->dump(OS);
4801 return cast<ScheduleBundle>(this)->dump(OS);
4802 }
4803
4804 LLVM_DUMP_METHOD void dump() const {
4805 dump(dbgs());
4806 dbgs() << '\n';
4807 }
4808#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4809 };
4810
4811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4812 friend inline raw_ostream &operator<<(raw_ostream &OS,
4813 const BoUpSLP::ScheduleEntity &SE) {
4814 SE.dump(OS);
4815 return OS;
4816 }
4817#endif
4818
4819 /// Contains all scheduling relevant data for an instruction.
4820 /// A ScheduleData either represents a single instruction or a member of an
4821 /// instruction bundle (= a group of instructions which is combined into a
4822 /// vector instruction).
4823 class ScheduleData final : public ScheduleEntity {
4824 public:
4825 // The initial value for the dependency counters. It means that the
4826 // dependencies are not calculated yet.
4827 enum { InvalidDeps = -1 };
4828
4829 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4830 static bool classof(const ScheduleEntity *Entity) {
4831 return Entity->getKind() == Kind::ScheduleData;
4832 }
4833
4834 void init(int BlockSchedulingRegionID, Instruction *I) {
4835 NextLoadStore = nullptr;
4836 IsScheduled = false;
4837 SchedulingRegionID = BlockSchedulingRegionID;
4838 clearDependencies();
4839 Inst = I;
4840 }
4841
4842 /// Verify basic self consistency properties
4843 void verify() {
4844 if (hasValidDependencies()) {
4845 assert(UnscheduledDeps <= Dependencies && "invariant");
4846 } else {
4847 assert(UnscheduledDeps == Dependencies && "invariant");
4848 }
4849
4850 if (IsScheduled) {
4851 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4852 "unexpected scheduled state");
4853 }
4854 }
4855
4856 /// Returns true if the dependency information has been calculated.
4857 /// Note that dependency validity can vary between instructions within
4858 /// a single bundle.
4859 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4860
4861 /// Returns true if it is ready for scheduling, i.e. it has no more
4862 /// unscheduled depending instructions/bundles.
4863 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4864
4865 /// Modifies the number of unscheduled dependencies for this instruction,
4866 /// and returns the number of remaining dependencies for the containing
4867 /// bundle.
4868 int incrementUnscheduledDeps(int Incr) {
4869 assert(hasValidDependencies() &&
4870 "increment of unscheduled deps would be meaningless");
4871 UnscheduledDeps += Incr;
4872 return UnscheduledDeps;
4873 }
4874
4875 /// Sets the number of unscheduled dependencies to the number of
4876 /// dependencies.
4877 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4878
4879 /// Clears all dependency information.
4880 void clearDependencies() {
4881 clearDirectDependencies();
4882 MemoryDependencies.clear();
4883 ControlDependencies.clear();
4884 }
4885
4886 /// Clears all direct dependencies only, except for control and memory
4887 /// dependencies.
4888 /// Required for copyable elements to correctly handle control/memory deps
4889 /// and avoid extra recalculation of such deps.
4890 void clearDirectDependencies() {
4891 Dependencies = InvalidDeps;
4892 resetUnscheduledDeps();
4893 IsScheduled = false;
4894 }
4895
4896 /// Gets the number of unscheduled dependencies.
4897 int getUnscheduledDeps() const { return UnscheduledDeps; }
4898 /// Gets the number of dependencies.
4899 int getDependencies() const { return Dependencies; }
4900 /// Initializes the number of dependencies.
4901 void initDependencies() { Dependencies = 0; }
4902 /// Increments the number of dependencies.
4903 void incDependencies() { Dependencies++; }
4904
4905 /// Gets scheduling region ID.
4906 int getSchedulingRegionID() const { return SchedulingRegionID; }
4907
4908 /// Gets the instruction.
4909 Instruction *getInst() const { return Inst; }
4910
4911 /// Gets the list of memory dependencies.
4912 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4913 return MemoryDependencies;
4914 }
4915 /// Adds a memory dependency.
4916 void addMemoryDependency(ScheduleData *Dep) {
4917 MemoryDependencies.push_back(Dep);
4918 }
4919 /// Gets the list of control dependencies.
4920 ArrayRef<ScheduleData *> getControlDependencies() const {
4921 return ControlDependencies;
4922 }
4923 /// Adds a control dependency.
4924 void addControlDependency(ScheduleData *Dep) {
4925 ControlDependencies.push_back(Dep);
4926 }
4927 /// Gets/sets the next load/store instruction in the block.
4928 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4929 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4930
4931 void dump(raw_ostream &OS) const { OS << *Inst; }
4932
4933 LLVM_DUMP_METHOD void dump() const {
4934 dump(dbgs());
4935 dbgs() << '\n';
4936 }
4937
4938 private:
4939 Instruction *Inst = nullptr;
4940
4941 /// Single linked list of all memory instructions (e.g. load, store, call)
4942 /// in the block - until the end of the scheduling region.
4943 ScheduleData *NextLoadStore = nullptr;
4944
4945 /// The dependent memory instructions.
4946 /// This list is derived on demand in calculateDependencies().
4947 SmallVector<ScheduleData *> MemoryDependencies;
4948
4949 /// List of instructions which this instruction could be control dependent
4950 /// on. Allowing such nodes to be scheduled below this one could introduce
4951 /// a runtime fault which didn't exist in the original program.
4952 /// ex: this is a load or udiv following a readonly call which inf loops
4953 SmallVector<ScheduleData *> ControlDependencies;
4954
4955 /// This ScheduleData is in the current scheduling region if this matches
4956 /// the current SchedulingRegionID of BlockScheduling.
4957 int SchedulingRegionID = 0;
4958
4959 /// The number of dependencies. Consists of the number of users of the
4960 /// instruction plus the number of dependent memory instructions (if any).
4961 /// This value is calculated on demand.
4962 /// If InvalidDeps, the number of dependencies is not calculated yet.
4963 int Dependencies = InvalidDeps;
4964
4965 /// The number of dependencies minus the number of dependencies of scheduled
4966 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4967 /// for scheduling.
4968 /// Note that this is negative as long as Dependencies is not calculated.
4969 int UnscheduledDeps = InvalidDeps;
4970 };
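// Illustrative worked example (not part of the original source): an
// instruction with two in-region users and one dependent memory instruction
// gets Dependencies == 3 once its dependencies are calculated. Scheduling
// each dependent instruction calls incrementUnscheduledDeps(-1); when
// UnscheduledDeps reaches 0, isReady() becomes true.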
4971
4972#ifndef NDEBUG
4973 friend inline raw_ostream &operator<<(raw_ostream &OS,
4974 const BoUpSLP::ScheduleData &SD) {
4975 SD.dump(OS);
4976 return OS;
4977 }
4978#endif
4979
4980 class ScheduleBundle final : public ScheduleEntity {
4981 /// The schedule data for the instructions in the bundle.
4982 SmallVector<ScheduleEntity *, 4> Bundle;
4983 /// True if this bundle is valid.
4984 bool IsValid = true;
4985 /// The TreeEntry that this instruction corresponds to.
4986 TreeEntry *TE = nullptr;
4987 ScheduleBundle(bool IsValid)
4988 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4989
4990 public:
4991 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4992 static bool classof(const ScheduleEntity *Entity) {
4993 return Entity->getKind() == Kind::ScheduleBundle;
4994 }
4995
4996 /// Verify basic self consistency properties
4997 void verify() const {
4998 for (const ScheduleEntity *SD : Bundle) {
4999 if (SD->hasValidDependencies()) {
5000 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5001 "invariant");
5002 } else {
5003 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5004 "invariant");
5005 }
5006
5007 if (isScheduled()) {
5008 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5009 "unexpected scheduled state");
5010 }
5011 }
5012 }
5013
5014 /// Returns the number of unscheduled dependencies in the bundle.
5015 int unscheduledDepsInBundle() const {
5016 assert(*this && "bundle must not be empty");
5017 int Sum = 0;
5018 for (const ScheduleEntity *BundleMember : Bundle) {
5019 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5020 return ScheduleData::InvalidDeps;
5021 Sum += BundleMember->getUnscheduledDeps();
5022 }
5023 return Sum;
5024 }
5025
5026 /// Returns true if the dependency information has been calculated.
5027 /// Note that dependency validity can vary between instructions within
5028 /// a single bundle.
5029 bool hasValidDependencies() const {
5030 return all_of(Bundle, [](const ScheduleEntity *SD) {
5031 return SD->hasValidDependencies();
5032 });
5033 }
5034
5035 /// Returns true if it is ready for scheduling, i.e. it has no more
5036 /// unscheduled depending instructions/bundles.
5037 bool isReady() const {
5038 assert(*this && "bundle must not be empty");
5039 return unscheduledDepsInBundle() == 0 && !isScheduled();
5040 }
5041
5042 /// Returns the bundle of scheduling data, associated with the current
5043 /// instruction.
5044 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5045 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5046 /// Adds an instruction to the bundle.
5047 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5048
5049 /// Gets/sets the associated tree entry.
5050 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5051 TreeEntry *getTreeEntry() const { return TE; }
5052
5053 static ScheduleBundle invalid() { return {false}; }
5054
5055 operator bool() const { return IsValid; }
5056
5057#ifndef NDEBUG
5058 void dump(raw_ostream &OS) const {
5059 if (!*this) {
5060 OS << "[]";
5061 return;
5062 }
5063 OS << '[';
5064 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5066 OS << "<Copyable>";
5067 OS << *SD->getInst();
5068 });
5069 OS << ']';
5070 }
5071
5072 LLVM_DUMP_METHOD void dump() const {
5073 dump(dbgs());
5074 dbgs() << '\n';
5075 }
5076#endif // NDEBUG
5077 };
5078
5079#ifndef NDEBUG
5080 friend inline raw_ostream &operator<<(raw_ostream &OS,
5081 const BoUpSLP::ScheduleBundle &Bundle) {
5082 Bundle.dump(OS);
5083 return OS;
5084 }
5085#endif
5086
5087 /// Contains all scheduling relevant data for the copyable instruction.
5088 /// It models the virtual instructions, supposed to replace the original
5089 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5090 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5091 /// instruction %virt = add %0, 0.
5092 class ScheduleCopyableData final : public ScheduleEntity {
5093 /// The source schedule data for the instruction.
5094 Instruction *Inst = nullptr;
5095 /// The edge information for the instruction.
5096 const EdgeInfo EI;
5097 /// This ScheduleData is in the current scheduling region if this matches
5098 /// the current SchedulingRegionID of BlockScheduling.
5099 int SchedulingRegionID = 0;
5100 /// Bundle, this data is part of.
5101 ScheduleBundle &Bundle;
5102
5103 public:
5104 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5105 const EdgeInfo &EI, ScheduleBundle &Bundle)
5106 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5107 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5108 static bool classof(const ScheduleEntity *Entity) {
5109 return Entity->getKind() == Kind::ScheduleCopyableData;
5110 }
5111
5112 /// Verify basic self consistency properties
5113 void verify() {
5114 if (hasValidDependencies()) {
5115 assert(UnscheduledDeps <= Dependencies && "invariant");
5116 } else {
5117 assert(UnscheduledDeps == Dependencies && "invariant");
5118 }
5119
5120 if (IsScheduled) {
5121 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5122 "unexpected scheduled state");
5123 }
5124 }
5125
5126 /// Returns true if the dependency information has been calculated.
5127 /// Note that dependency validity can vary between instructions within
5128 /// a single bundle.
5129 bool hasValidDependencies() const {
5130 return Dependencies != ScheduleData::InvalidDeps;
5131 }
5132
5133 /// Returns true if it is ready for scheduling, i.e. it has no more
5134 /// unscheduled depending instructions/bundles.
5135 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5136
5137 /// Modifies the number of unscheduled dependencies for this instruction,
5138 /// and returns the number of remaining dependencies for the containing
5139 /// bundle.
5140 int incrementUnscheduledDeps(int Incr) {
5141 assert(hasValidDependencies() &&
5142 "increment of unscheduled deps would be meaningless");
5143 UnscheduledDeps += Incr;
5144 assert(UnscheduledDeps >= 0 && "invariant");
5145 return UnscheduledDeps;
5146 }
5147
5148 /// Sets the number of unscheduled dependencies to the number of
5149 /// dependencies.
5150 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5151
5152 /// Gets the number of unscheduled dependencies.
5153 int getUnscheduledDeps() const { return UnscheduledDeps; }
5154 /// Gets the number of dependencies.
5155 int getDependencies() const { return Dependencies; }
5156 /// Initializes the number of dependencies.
5157 void initDependencies() { Dependencies = 0; }
5158 /// Increments the number of dependencies.
5159 void incDependencies() { Dependencies++; }
5160
5161 /// Gets scheduling region ID.
5162 int getSchedulingRegionID() const { return SchedulingRegionID; }
5163
5164 /// Gets the instruction.
5165 Instruction *getInst() const { return Inst; }
5166
5167 /// Clears all dependency information.
5168 void clearDependencies() {
5169 Dependencies = ScheduleData::InvalidDeps;
5170 UnscheduledDeps = ScheduleData::InvalidDeps;
5171 IsScheduled = false;
5172 }
5173
5174 /// Gets the edge information.
5175 const EdgeInfo &getEdgeInfo() const { return EI; }
5176
5177 /// Gets the bundle.
5178 ScheduleBundle &getBundle() { return Bundle; }
5179 const ScheduleBundle &getBundle() const { return Bundle; }
5180
5181#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5182 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5183
5184 LLVM_DUMP_METHOD void dump() const {
5185 dump(dbgs());
5186 dbgs() << '\n';
5187 }
5188#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5189
5190 private:
5191 /// The number of dependencies. These nodes always have only a single
5192 /// dependency.
5193 int Dependencies = ScheduleData::InvalidDeps;
5194
5195 /// The number of dependencies minus the number of dependencies of scheduled
5196 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5197 /// for scheduling.
5198 /// Note that this is negative as long as Dependencies is not calculated.
5199 int UnscheduledDeps = ScheduleData::InvalidDeps;
5200 };
5201
5202#ifndef NDEBUG
5203 friend inline raw_ostream &
5204 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5205 SD.dump(OS);
5206 return OS;
5207 }
5208#endif
5209
5210 friend struct GraphTraits<BoUpSLP *>;
5211 friend struct DOTGraphTraits<BoUpSLP *>;
5212
5213 /// Contains all scheduling data for a basic block.
5214 /// It does not schedule instructions which are not memory read/write
5215 /// instructions and whose operands are either constants, or arguments, or
5216 /// phis, or instructions from other blocks, or whose users are phis or from
5217 /// other blocks. The resulting vector instructions can be placed at the
5218 /// beginning of the basic block without scheduling (if the operands do not
5219 /// need to be scheduled) or at the end of the block (if the users are
5220 /// outside of the block). This saves some compile time and memory used by
5221 /// the compiler.
5222 /// ScheduleData is assigned to each instruction in between the boundaries of
5223 /// the tree entry, even to those which are not part of the graph. It is
5224 /// required to correctly follow the dependencies between the instructions
5225 /// and to schedule them correctly. ScheduleData is not allocated for
5226 /// instructions which do not require scheduling, like phis, nodes with only
5227 /// extractelements/insertelements, or nodes with instructions with
5228 /// uses/operands outside of the block.
5229 struct BlockScheduling {
5230 BlockScheduling(BasicBlock *BB)
5231 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5232
5233 void clear() {
5234 ScheduledBundles.clear();
5235 ScheduledBundlesList.clear();
5236 ScheduleCopyableDataMap.clear();
5237 ScheduleCopyableDataMapByInst.clear();
5238 ScheduleCopyableDataMapByInstUser.clear();
5239 ScheduleCopyableDataMapByUsers.clear();
5240 ReadyInsts.clear();
5241 ScheduleStart = nullptr;
5242 ScheduleEnd = nullptr;
5243 FirstLoadStoreInRegion = nullptr;
5244 LastLoadStoreInRegion = nullptr;
5245 RegionHasStackSave = false;
5246
5247 // Reduce the maximum schedule region size by the size of the
5248 // previous scheduling run.
5249 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5250 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5251 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5252 ScheduleRegionSize = 0;
5253
5254 // Make a new scheduling region, i.e. all existing ScheduleData is not
5255 // in the new region yet.
5256 ++SchedulingRegionID;
5257 }
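// Illustrative note (not part of the original source): if the previous
// region scheduled N instructions, the next region may cover at most
// (previous limit - N) instructions, clamped to MinScheduleRegionSize, so
// the scheduling budget shrinks across repeated runs in the same block.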
5258
5259 ScheduleData *getScheduleData(Instruction *I) {
5260 if (!I)
5261 return nullptr;
5262 if (BB != I->getParent())
5263 // Avoid lookup if can't possibly be in map.
5264 return nullptr;
5265 ScheduleData *SD = ScheduleDataMap.lookup(I);
5266 if (SD && isInSchedulingRegion(*SD))
5267 return SD;
5268 return nullptr;
5269 }
5270
5271 ScheduleData *getScheduleData(Value *V) {
5272 return getScheduleData(dyn_cast<Instruction>(V));
5273 }
5274
5275 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5276 /// operand number) and value.
5277 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5278 const Value *V) const {
5279 if (ScheduleCopyableDataMap.empty())
5280 return nullptr;
5281 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5282 if (It == ScheduleCopyableDataMap.end())
5283 return nullptr;
5284 ScheduleCopyableData *SD = It->getSecond().get();
5285 if (!isInSchedulingRegion(*SD))
5286 return nullptr;
5287 return SD;
5288 }
5289
5290 /// Returns the ScheduleCopyableData for the given user \p User, operand
5291 /// number and operand \p V.
5292 SmallVector<ScheduleCopyableData *>
5293 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5294 const Value *V) {
5295 if (ScheduleCopyableDataMapByInstUser.empty())
5296 return {};
5297 const auto It = ScheduleCopyableDataMapByInstUser.find(
5298 std::make_pair(std::make_pair(User, OperandIdx), V));
5299 if (It == ScheduleCopyableDataMapByInstUser.end())
5300 return {};
5301 SmallVector<ScheduleCopyableData *> Res;
5302 for (ScheduleCopyableData *SD : It->getSecond()) {
5303 if (isInSchedulingRegion(*SD))
5304 Res.push_back(SD);
5305 }
5306 return Res;
5307 }
5308
5309 /// Returns true if all operands of the given instruction \p User are
5310 /// replaced by copyable data.
5311 /// \param User The user instruction.
5312 /// \param Op The operand, which might be replaced by the copyable data.
5313 /// \param SLP The SLP tree.
5314 /// \param NumOps The number of operands used. If the instruction uses the
5315 /// same operand several times, check for the first use, then the second,
5316 /// etc.
5317 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5318 Instruction *Op, BoUpSLP &SLP,
5319 unsigned NumOps) const {
5320 assert(NumOps > 0 && "No operands");
5321 if (ScheduleCopyableDataMap.empty())
5322 return false;
5323 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5324 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5325 for (const Use &U : User->operands()) {
5326 if (U.get() != Op)
5327 continue;
5328 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5329 if (Entries.empty())
5330 return false;
5331 // Check all tree entries, if they have operands replaced by copyable
5332 // data.
5333 for (TreeEntry *TE : Entries) {
5334 // Check if the user is commutative.
5335 // The commutatives are handled later, as their operands can be
5336 // reordered.
5337 // Same applies even for non-commutative cmps, because we can invert
5338 // their predicate potentially and, thus, reorder the operands.
5339 bool IsCommutativeUser =
5340 ::isCommutative(User) ||
5341 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5342 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5343 unsigned &OpCnt =
5344 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5345 EdgeInfo EI(TE, U.getOperandNo());
5346 if (!getScheduleCopyableData(EI, Op))
5347 continue;
5348 // Found copyable operand - continue.
5349 ++OpCnt;
5350 continue;
5351 }
5352 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5353 .first->getSecond();
5354 }
5355 }
5356 if (PotentiallyReorderedEntriesCount.empty())
5357 return all_of(OrderedEntriesCount,
5358 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5359 return P.second == NumOps;
5360 });
5361 // Check the commutative/cmp entries.
5362 for (auto &P : PotentiallyReorderedEntriesCount) {
5363 auto *It = find(P.first->Scalars, User);
5364 assert(It != P.first->Scalars.end() && "User is not in the tree entry");
5365 int Lane = std::distance(P.first->Scalars.begin(), It);
5366 assert(Lane >= 0 && "Lane is not found");
5367 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5368 Lane = P.first->ReorderIndices[Lane];
5369 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5370 "Couldn't find extract lane");
5371 SmallVector<unsigned> OpIndices;
5372 for (unsigned OpIdx :
5374 P.first->getMainOp()))) {
5375 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5376 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5377 --P.getSecond();
5378 }
5379 }
5380 return all_of(PotentiallyReorderedEntriesCount,
5381 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5382 return P.second == NumOps - 1;
5383 }) &&
5384 all_of(OrderedEntriesCount,
5385 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5386 return P.second == NumOps;
5387 });
5388 }
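// Illustrative example (not part of the original source): for a hypothetical
// non-commutative user "%u = sub %op, %op" belonging to a single tree entry,
// NumOps == 2 and the function returns true only if both operand edges
// (operand 0 and operand 1) of that entry model %op as copyable data.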
5389
5390 SmallVector<ScheduleCopyableData *>
5391 getScheduleCopyableData(const Instruction *I) const {
5392 if (ScheduleCopyableDataMapByInst.empty())
5393 return {};
5394 const auto It = ScheduleCopyableDataMapByInst.find(I);
5395 if (It == ScheduleCopyableDataMapByInst.end())
5396 return {};
5397 SmallVector<ScheduleCopyableData *> Res;
5398 for (ScheduleCopyableData *SD : It->getSecond()) {
5399 if (isInSchedulingRegion(*SD))
5400 Res.push_back(SD);
5401 }
5402 return Res;
5403 }
5404
5405 SmallVector<ScheduleCopyableData *>
5406 getScheduleCopyableDataUsers(const Instruction *User) const {
5407 if (ScheduleCopyableDataMapByUsers.empty())
5408 return {};
5409 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5410 if (It == ScheduleCopyableDataMapByUsers.end())
5411 return {};
5412 SmallVector<ScheduleCopyableData *> Res;
5413 for (ScheduleCopyableData *SD : It->getSecond()) {
5414 if (isInSchedulingRegion(*SD))
5415 Res.push_back(SD);
5416 }
5417 return Res;
5418 }
5419
5420 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5421 Instruction *I,
5422 int SchedulingRegionID,
5423 ScheduleBundle &Bundle) {
5424 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5425 ScheduleCopyableData *CD =
5426 ScheduleCopyableDataMap
5427 .try_emplace(std::make_pair(EI, I),
5428 std::make_unique<ScheduleCopyableData>(
5429 SchedulingRegionID, I, EI, Bundle))
5430 .first->getSecond()
5431 .get();
5432 ScheduleCopyableDataMapByInst[I].push_back(CD);
5433 if (EI.UserTE) {
5434 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5435 const auto *It = find(Op, I);
5436 assert(It != Op.end() && "Lane not set");
5437 SmallPtrSet<Instruction *, 4> Visited;
5438 do {
5439 int Lane = std::distance(Op.begin(), It);
5440 assert(Lane >= 0 && "Lane not set");
5441 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5442 !EI.UserTE->ReorderIndices.empty())
5443 Lane = EI.UserTE->ReorderIndices[Lane];
5444 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5445 "Couldn't find extract lane");
5446 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5447 if (!Visited.insert(In).second) {
5448 It = find(make_range(std::next(It), Op.end()), I);
5449 continue;
5450 }
5451 ScheduleCopyableDataMapByInstUser
5452 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5453 .first->getSecond()
5454 .push_back(CD);
5455 ScheduleCopyableDataMapByUsers.try_emplace(I)
5456 .first->getSecond()
5457 .insert(CD);
5458 // Remove extra deps for users which become non-immediate users of the
5459 // instruction. This may happen if a chain of the same copyable elements
5460 // appears in the tree.
5461 if (In == I) {
5462 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5463 if (ScheduleCopyableData *UserCD =
5464 getScheduleCopyableData(UserEI, In))
5465 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5466 }
5467 It = find(make_range(std::next(It), Op.end()), I);
5468 } while (It != Op.end());
5469 } else {
5470 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5471 CD);
5472 }
5473 return *CD;
5474 }
5475
5476 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5477 auto *I = dyn_cast<Instruction>(V);
5478 if (!I)
5479 return {};
5480 auto It = ScheduledBundles.find(I);
5481 if (It == ScheduledBundles.end())
5482 return {};
5483 return It->getSecond();
5484 }
5485
5486 /// Returns true if the entity is in the scheduling region.
5487 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5488 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5489 return Data->getSchedulingRegionID() == SchedulingRegionID;
5490 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5491 return CD->getSchedulingRegionID() == SchedulingRegionID;
5492 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5493 [&](const ScheduleEntity *BundleMember) {
5494 return isInSchedulingRegion(*BundleMember);
5495 });
5496 }
5497
5498 /// Marks an instruction as scheduled and puts all dependent ready
5499 /// instructions into the ready-list.
5500 template <typename ReadyListType>
5501 void schedule(const BoUpSLP &R, const InstructionsState &S,
5502 const EdgeInfo &EI, ScheduleEntity *Data,
5503 ReadyListType &ReadyList) {
5504 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5505 ArrayRef<ScheduleBundle *> Bundles) {
5506 // Handle the def-use chain dependencies.
5507
5508 // Decrement the unscheduled counter and insert to ready list if ready.
5509 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5510 if ((IsControl || Data->hasValidDependencies()) &&
5511 Data->incrementUnscheduledDeps(-1) == 0) {
5512 // There are no more unscheduled dependencies after
5513 // decrementing, so we can put the dependent instruction
5514 // into the ready list.
5515 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5516 ArrayRef<ScheduleBundle *> Bundles;
5517 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5518 CopyableBundle.push_back(&CD->getBundle());
5519 Bundles = CopyableBundle;
5520 } else {
5521 Bundles = getScheduleBundles(Data->getInst());
5522 }
5523 if (!Bundles.empty()) {
5524 for (ScheduleBundle *Bundle : Bundles) {
5525 if (Bundle->unscheduledDepsInBundle() == 0) {
5526 assert(!Bundle->isScheduled() &&
5527 "already scheduled bundle gets ready");
5528 ReadyList.insert(Bundle);
5530 << "SLP: gets ready: " << *Bundle << "\n");
5531 }
5532 }
5533 return;
5534 }
5535 assert(!Data->isScheduled() &&
5536 "already scheduled bundle gets ready");
5538 "Expected non-copyable data");
5539 ReadyList.insert(Data);
5540 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5541 }
5542 };
5543
5544 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5545 Instruction *I) {
5546 if (!ScheduleCopyableDataMap.empty()) {
5547 SmallVector<ScheduleCopyableData *> CopyableData =
5548 getScheduleCopyableData(User, OpIdx, I);
5549 for (ScheduleCopyableData *CD : CopyableData)
5550 DecrUnsched(CD, /*IsControl=*/false);
5551 if (!CopyableData.empty())
5552 return;
5553 }
5554 if (ScheduleData *OpSD = getScheduleData(I))
5555 DecrUnsched(OpSD, /*IsControl=*/false);
5556 };
5557
5558 // If BundleMember is a vector bundle, its operands may have been
5559 // reordered during buildTree(). We therefore need to get its operands
5560 // through the TreeEntry.
5561 if (!Bundles.empty()) {
5562 auto *In = BundleMember->getInst();
5563 // Count uses of each instruction operand.
5564 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5565 unsigned TotalOpCount = 0;
5566 if (isa<ScheduleCopyableData>(BundleMember)) {
5567 // Copyable data is used only once (uses itself).
5568 TotalOpCount = OperandsUses[In] = 1;
5569 } else {
5570 for (const Use &U : In->operands()) {
5571 if (auto *I = dyn_cast<Instruction>(U.get())) {
5572 auto Res = OperandsUses.try_emplace(I, 0);
5573 ++Res.first->getSecond();
5574 ++TotalOpCount;
5575 }
5576 }
5577 }
5578 // Decrement the unscheduled counter and insert to ready list if
5579 // ready.
5580 auto DecrUnschedForInst =
5581 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5582 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5583 &Checked) {
5584 if (!ScheduleCopyableDataMap.empty()) {
5585 const EdgeInfo EI = {UserTE, OpIdx};
5586 if (ScheduleCopyableData *CD =
5587 getScheduleCopyableData(EI, I)) {
5588 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5589 return;
5590 DecrUnsched(CD, /*IsControl=*/false);
5591 return;
5592 }
5593 }
5594 auto It = OperandsUses.find(I);
5595 assert(It != OperandsUses.end() && "Operand not found");
5596 if (It->second > 0) {
5597 --It->getSecond();
5598 assert(TotalOpCount > 0 && "No more operands to decrement");
5599 --TotalOpCount;
5600 if (ScheduleData *OpSD = getScheduleData(I)) {
5601 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5602 return;
5603 DecrUnsched(OpSD, /*IsControl=*/false);
5604 }
5605 }
5606 };
5607
5608 for (ScheduleBundle *Bundle : Bundles) {
5609 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5610 break;
5611 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5612 // Need to search for the lane since the tree entry can be
5613 // reordered.
5614 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5615 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5616 do {
5617 int Lane =
5618 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5619 assert(Lane >= 0 && "Lane not set");
5620 if (isa<StoreInst>(In) &&
5621 !Bundle->getTreeEntry()->ReorderIndices.empty())
5622 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5623 assert(Lane < static_cast<int>(
5624 Bundle->getTreeEntry()->Scalars.size()) &&
5625 "Couldn't find extract lane");
5626
5627 // Since vectorization tree is being built recursively this
5628 // assertion ensures that the tree entry has all operands set
5629 // before reaching this code. Couple of exceptions known at the
5630 // moment are extracts where their second (immediate) operand is
5631 // not added. Since immediates do not affect scheduler behavior
5632 // this is considered okay.
5633 assert(In &&
5634 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
5635 In->getNumOperands() ==
5636 Bundle->getTreeEntry()->getNumOperands() ||
5637 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5638 "Missed TreeEntry operands?");
5639
5640 bool IsNonSchedulableWithParentPhiNode =
5641 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5642 Bundle->getTreeEntry()->UserTreeIndex &&
5643 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5644 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5645 Instruction::PHI;
5646 // Count the number of unique phi nodes, which are the parent for
5647 // parent entry, and exit, if all the unique phis are processed.
5648 if (IsNonSchedulableWithParentPhiNode) {
5649 const TreeEntry *ParentTE =
5650 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5651 Value *User = ParentTE->Scalars[Lane];
5652 if (!ParentsUniqueUsers.insert(User).second)
5653 break;
5654 }
5655
5656 for (unsigned OpIdx :
5657 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5658 if (auto *I = dyn_cast<Instruction>(
5659 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5660 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5661 << *I << "\n");
5662 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5663 }
5664 // If parent node is schedulable, it will be handled correctly.
5665 if (!IsNonSchedulableWithParentPhiNode)
5666 break;
5667 It = std::find(std::next(It),
5668 Bundle->getTreeEntry()->Scalars.end(), In);
5669 } while (It != Bundle->getTreeEntry()->Scalars.end());
5670 }
5671 } else {
5672 // If BundleMember is a stand-alone instruction, no operand reordering
5673 // has taken place, so we directly access its operands.
5674 for (Use &U : BundleMember->getInst()->operands()) {
5675 if (auto *I = dyn_cast<Instruction>(U.get())) {
5677 << "SLP: check for readiness (def): " << *I << "\n");
5678 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5679 }
5680 }
5681 }
5682 // Handle the memory dependencies.
5683 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5684 if (!SD)
5685 return;
5686 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5687 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5688 if (!VisitedMemory.insert(MemoryDep).second)
5689 continue;
5690 // There are no more unscheduled dependencies after decrementing,
5691 // so we can put the dependent instruction into the ready list.
5692 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5693 << *MemoryDep << "\n");
5694 DecrUnsched(MemoryDep);
5695 }
5696 // Handle the control dependencies.
5697 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5698 for (ScheduleData *Dep : SD->getControlDependencies()) {
5699 if (!VisitedControl.insert(Dep).second)
5700 continue;
5701 // There are no more unscheduled dependencies after decrementing,
5702 // so we can put the dependent instruction into the ready list.
5704 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5705 DecrUnsched(Dep, /*IsControl=*/true);
5706 }
5707 };
5708 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5709 SD->setScheduled(/*Scheduled=*/true);
5710 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5711 SmallVector<ScheduleBundle *> Bundles;
5712 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5713 Instruction *In = SD->getInst();
5714 if (R.isVectorized(In)) {
5715 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5716 for (TreeEntry *TE : Entries) {
5718 In->getNumOperands() != TE->getNumOperands())
5719 continue;
5720 auto &BundlePtr =
5721 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5722 BundlePtr->setTreeEntry(TE);
5723 BundlePtr->add(SD);
5724 Bundles.push_back(BundlePtr.get());
5725 }
5726 }
5727 ProcessBundleMember(SD, Bundles);
5728 } else {
5729 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5730 Bundle.setScheduled(/*Scheduled=*/true);
5731 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5732 auto AreAllBundlesScheduled =
5733 [&](const ScheduleEntity *SD,
5734 ArrayRef<ScheduleBundle *> SDBundles) {
5735 if (isa<ScheduleCopyableData>(SD))
5736 return true;
5737 return !SDBundles.empty() &&
5738 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5739 return SDBundle->isScheduled();
5740 });
5741 };
5742 for (ScheduleEntity *SD : Bundle.getBundle()) {
5743 ArrayRef<ScheduleBundle *> SDBundles;
5744 if (!isa<ScheduleCopyableData>(SD))
5745 SDBundles = getScheduleBundles(SD->getInst());
5746 if (AreAllBundlesScheduled(SD, SDBundles)) {
5747 SD->setScheduled(/*Scheduled=*/true);
5748 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5749 : SDBundles);
5750 }
5751 }
5752 }
5753 }
5754
5755 /// Verify basic self consistency properties of the data structure.
5756 void verify() {
5757 if (!ScheduleStart)
5758 return;
5759
5760 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5761 ScheduleStart->comesBefore(ScheduleEnd) &&
5762 "Not a valid scheduling region?");
5763
5764 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5765 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5766 if (!Bundles.empty()) {
5767 for (ScheduleBundle *Bundle : Bundles) {
5768 assert(isInSchedulingRegion(*Bundle) &&
5769 "primary schedule data not in window?");
5770 Bundle->verify();
5771 }
5772 continue;
5773 }
5774 auto *SD = getScheduleData(I);
5775 if (!SD)
5776 continue;
5777 assert(isInSchedulingRegion(*SD) &&
5778 "primary schedule data not in window?");
5779 SD->verify();
5780 }
5781
5782 assert(all_of(ReadyInsts,
5783 [](const ScheduleEntity *Bundle) {
5784 return Bundle->isReady();
5785 }) &&
5786 "item in ready list not ready?");
5787 }
5788
5789 /// Put all instructions into the ReadyList which are ready for scheduling.
5790 template <typename ReadyListType>
5791 void initialFillReadyList(ReadyListType &ReadyList) {
5792 SmallPtrSet<ScheduleBundle *, 16> Visited;
5793 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5794 ScheduleData *SD = getScheduleData(I);
5795 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5796 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5797 !Bundles.empty()) {
5798 for (ScheduleBundle *Bundle : Bundles) {
5799 if (!Visited.insert(Bundle).second)
5800 continue;
5801 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5802 ReadyList.insert(Bundle);
5803 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5804 << *Bundle << "\n");
5805 }
5806 }
5807 continue;
5808 }
5809 ReadyList.insert(SD);
5811 << "SLP: initially in ready list: " << *SD << "\n");
5812 }
5813 }
5814 }
5815
5816 /// Build a bundle from the ScheduleData nodes corresponding to the
5817 /// scalar instruction for each lane.
5818 /// \param VL The list of scalar instructions.
5819 /// \param S The state of the instructions.
5820 /// \param EI The edge in the SLP graph or the user node/operand number.
5821 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5822 const InstructionsState &S, const EdgeInfo &EI);
5823
5824 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5825 /// cyclic dependencies. This is only a dry-run, no instructions are
5826 /// actually moved at this stage.
5827 /// \returns the scheduling bundle. The returned Optional value is not
5828 /// std::nullopt if \p VL is allowed to be scheduled.
5829 std::optional<ScheduleBundle *>
5830 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5831 const InstructionsState &S, const EdgeInfo &EI);
5832
5833 /// Allocates schedule data chunk.
5834 ScheduleData *allocateScheduleDataChunks();
5835
5836 /// Extends the scheduling region so that V is inside the region.
5837 /// \returns true if the region size is within the limit.
5838 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5839
5840 /// Initialize the ScheduleData structures for new instructions in the
5841 /// scheduling region.
5842 void initScheduleData(Instruction *FromI, Instruction *ToI,
5843 ScheduleData *PrevLoadStore,
5844 ScheduleData *NextLoadStore);
5845
5846 /// Updates the dependency information of a bundle and of all instructions/
5847 /// bundles which depend on the original bundle.
5848 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5849 BoUpSLP *SLP,
5850 ArrayRef<ScheduleData *> ControlDeps = {});
5851
5852 /// Sets all instruction in the scheduling region to un-scheduled.
5853 void resetSchedule();
5854
5855 BasicBlock *BB;
5856
5857 /// Simple memory allocation for ScheduleData.
5858 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5859
5860 /// The size of a ScheduleData array in ScheduleDataChunks.
5861 int ChunkSize;
5862
5863 /// The allocator position in the current chunk, which is the last entry
5864 /// of ScheduleDataChunks.
5865 int ChunkPos;
5866
5867 /// Attaches ScheduleData to Instruction.
5868 /// Note that the mapping survives during all vectorization iterations, i.e.
5869 /// ScheduleData structures are recycled.
5870 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5871
5872 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5873 /// number) and the operand instruction, represented as copyable element.
5874 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5875 std::unique_ptr<ScheduleCopyableData>>
5876 ScheduleCopyableDataMap;
5877
5878 /// Represents the mapping between an instruction and all related
5879 /// ScheduleCopyableData (for all uses in the tree represented as copyable
5880 /// element). The SLP tree may contain several representations of the same
5881 /// instruction.
5882 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5883 ScheduleCopyableDataMapByInst;
5884
5885 /// Represents the mapping between a user value and operand number, the
5886 /// operand value and all related ScheduleCopyableData. The relation is 1:n,
5887 /// because the same user may reference the same operand in different tree
5888 /// entries and the operand may be modeled by different copyable data elements.
5889 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5890 SmallVector<ScheduleCopyableData *>>
5891 ScheduleCopyableDataMapByInstUser;
5892
5893 /// Represents the mapping between an instruction and all related
5894 /// ScheduleCopyableData. It represents the mapping between the actual
5895 /// instruction and the last copyable data element in the chain. E.g., if
5896 /// the graph models the following instructions:
5897 /// %0 = non-add instruction ...
5898 /// ...
5899 /// %4 = add %3, 1
5900 /// %5 = add %4, 1
5901 /// %6 = insertelement poison, %0, 0
5902 /// %7 = insertelement %6, %5, 1
5903 /// And the graph is modeled as:
5904 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5905 /// -> [1, 0] -> [%1, 0]
5906 ///
5907 /// this map will map %0 only to the copyable element <1>, which is the last
5908 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5909 /// keep the map to <0>, not the %0.
5910 SmallDenseMap<const Instruction *,
5911 SmallSetVector<ScheduleCopyableData *, 4>>
5912 ScheduleCopyableDataMapByUsers;
5913
5914 /// Attaches ScheduleBundle to Instruction.
5915 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5916 ScheduledBundles;
5917 /// The list of ScheduleBundles.
5918 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5919
5920 /// The ready-list for scheduling (only used for the dry-run).
5921 SetVector<ScheduleEntity *> ReadyInsts;
5922
5923 /// The first instruction of the scheduling region.
5924 Instruction *ScheduleStart = nullptr;
5925
5926 /// The first instruction _after_ the scheduling region.
5927 Instruction *ScheduleEnd = nullptr;
5928
5929 /// The first memory accessing instruction in the scheduling region
5930 /// (can be null).
5931 ScheduleData *FirstLoadStoreInRegion = nullptr;
5932
5933 /// The last memory accessing instruction in the scheduling region
5934 /// (can be null).
5935 ScheduleData *LastLoadStoreInRegion = nullptr;
5936
5937 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5938 /// region? Used to optimize the dependence calculation for the
5939 /// common case where there isn't.
5940 bool RegionHasStackSave = false;
5941
5942 /// The current size of the scheduling region.
5943 int ScheduleRegionSize = 0;
5944
5945 /// The maximum size allowed for the scheduling region.
5946 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5947
5948 /// The ID of the scheduling region. For a new vectorization iteration this
5949 /// is incremented, which "removes" all ScheduleData from the region.
5950 /// Make sure that the initial SchedulingRegionID is greater than the
5951 /// initial SchedulingRegionID in ScheduleData (which is 0).
5952 int SchedulingRegionID = 1;
5953 };
5954
5955 /// Attaches the BlockScheduling structures to basic blocks.
5956 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5957
5958 /// Performs the "real" scheduling. Done before vectorization is actually
5959 /// performed in a basic block.
5960 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5961
5962 /// List of users to ignore during scheduling and that don't need extracting.
5963 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5964
5965 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5966 /// sorted SmallVectors of unsigned.
5967 struct OrdersTypeDenseMapInfo {
5968 static OrdersType getEmptyKey() {
5969 OrdersType V;
5970 V.push_back(~1U);
5971 return V;
5972 }
5973
5974 static OrdersType getTombstoneKey() {
5975 OrdersType V;
5976 V.push_back(~2U);
5977 return V;
5978 }
5979
5980 static unsigned getHashValue(const OrdersType &V) {
5981 return static_cast<unsigned>(hash_combine_range(V));
5982 }
5983
5984 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5985 return LHS == RHS;
5986 }
5987 };
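// For illustration, this trait allows sorted index vectors to be used directly
// as hash-map keys, e.g. (hypothetical usage sketch):
//   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
//   ++OrdersUses[OrdersType{1, 0, 3, 2}]; // counts how often this order occurs
// The empty and tombstone keys are the single-element vectors {~1U} and {~2U},
// so real orders must never consist of those sentinel values.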
5988
5989 // Analysis and block reference.
5990 Function *F;
5991 ScalarEvolution *SE;
5992 TargetTransformInfo *TTI;
5993 TargetLibraryInfo *TLI;
5994 LoopInfo *LI;
5995 DominatorTree *DT;
5996 AssumptionCache *AC;
5997 DemandedBits *DB;
5998 const DataLayout *DL;
5999 OptimizationRemarkEmitter *ORE;
6000
6001 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6002 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6003
6004 /// Instruction builder to construct the vectorized tree.
6005 IRBuilder<TargetFolder> Builder;
6006
6007 /// A map from tree entries of scalar integer values to the smallest bit width
6008 /// with which they can legally be represented. The values map to (width, signed)
6009 /// pairs, where "width" indicates the minimum bit width and "signed" is true if
6010 /// the value must be sign-extended, rather than zero-extended, back to its
6011 /// original width.
6012 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6013
6014 /// Final size of the reduced vector, if the current graph represents the
6015 /// input for the reduction and it was possible to narrow the size of the
6016 /// reduction.
6017 unsigned ReductionBitWidth = 0;
6018
6019 /// Canonical graph size before the transformations.
6020 unsigned BaseGraphSize = 1;
6021
6022 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6023 /// type sizes, used in the tree.
6024 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6025
6026 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
6027 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6028 DenseSet<unsigned> ExtraBitWidthNodes;
6029};
6030
6031} // end namespace slpvectorizer
6032
6033template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
6037 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6038 SecondInfo::getEmptyKey());
6039 }
6040
6042 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6043 SecondInfo::getTombstoneKey());
6044 }
6045
6046 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6047 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6048 SecondInfo::getHashValue(Val.EdgeIdx));
6049 }
6050
6051 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6052 const BoUpSLP::EdgeInfo &RHS) {
6053 return LHS == RHS;
6054 }
6055};
6056
6057template <> struct GraphTraits<BoUpSLP *> {
6058 using TreeEntry = BoUpSLP::TreeEntry;
6059
6060 /// NodeRef has to be a pointer per the GraphWriter.
6062
6063 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6064
6065 /// Add the VectorizableTree to the index iterator to be able to return
6066 /// TreeEntry pointers.
6068 : public iterator_adaptor_base<
6069 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6071
6075
6076 NodeRef operator*() { return I->UserTE; }
6077 };
6078
6080 return R.VectorizableTree[0].get();
6081 }
6082
6084 return {&N->UserTreeIndex, N->Container};
6085 }
6086
6088 return {&N->UserTreeIndex + 1, N->Container};
6089 }
6090
6091 /// For the node iterator we just need to turn the TreeEntry iterator into a
6092 /// TreeEntry* iterator so that it dereferences to NodeRef.
6094 using ItTy = ContainerTy::iterator;
6095 ItTy It;
6096
6097 public:
6098 nodes_iterator(const ItTy &It2) : It(It2) {}
6099 NodeRef operator*() { return It->get(); }
6101 ++It;
6102 return *this;
6103 }
6104 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6105 };
6106
6108 return nodes_iterator(R->VectorizableTree.begin());
6109 }
6110
6112 return nodes_iterator(R->VectorizableTree.end());
6113 }
6114
6115 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6116};
6117
6118template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6119 using TreeEntry = BoUpSLP::TreeEntry;
6120
6121 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6122
6123 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6124 std::string Str;
6125 raw_string_ostream OS(Str);
6126 OS << Entry->Idx << ".\n";
6127 if (isSplat(Entry->Scalars))
6128 OS << "<splat> ";
6129 for (auto *V : Entry->Scalars) {
6130 OS << *V;
6131 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6132 return EU.Scalar == V;
6133 }))
6134 OS << " <extract>";
6135 OS << "\n";
6136 }
6137 return Str;
6138 }
6139
6140 static std::string getNodeAttributes(const TreeEntry *Entry,
6141 const BoUpSLP *) {
6142 if (Entry->isGather())
6143 return "color=red";
6144 if (Entry->State == TreeEntry::ScatterVectorize ||
6145 Entry->State == TreeEntry::StridedVectorize ||
6146 Entry->State == TreeEntry::CompressVectorize)
6147 return "color=blue";
6148 return "";
6149 }
6150};
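// For illustration, these specializations plug the SLP graph into LLVM's
// generic graph utilities, so the tree can be rendered with the standard
// GraphWriter helpers, e.g. (hypothetical debugging helper):
//   #include "llvm/Support/GraphWriter.h"
//   void debugDumpSLPGraph(BoUpSLP &R) { ViewGraph(&R, "slp-tree"); }
// Gather nodes are drawn in red and scatter/strided/compress-vectorized nodes
// in blue, per getNodeAttributes() above.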
6151
6152} // end namespace llvm
6153
6156 for (auto *I : DeletedInstructions) {
6157 if (!I->getParent()) {
6158 // Temporarily insert instructions back to erase them from their parent and
6159 // from memory later.
6160 if (isa<PHINode>(I))
6161 // Phi nodes must be the very first instructions in the block.
6162 I->insertBefore(F->getEntryBlock(),
6163 F->getEntryBlock().getFirstNonPHIIt());
6164 else
6165 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6166 continue;
6167 }
6168 for (Use &U : I->operands()) {
6169 auto *Op = dyn_cast<Instruction>(U.get());
6170 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6172 DeadInsts.emplace_back(Op);
6173 }
6174 I->dropAllReferences();
6175 }
6176 for (auto *I : DeletedInstructions) {
6177 assert(I->use_empty() &&
6178 "trying to erase instruction with users.");
6179 I->eraseFromParent();
6180 }
6181
6182 // Cleanup any dead scalar code feeding the vectorized instructions
6184
6185#ifdef EXPENSIVE_CHECKS
6186 // If we could guarantee that this call is not extremely slow, we could
6187 // remove the ifdef limitation (see PR47712).
6188 assert(!verifyFunction(*F, &dbgs()));
6189#endif
6190}
6191
6192/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6193/// contains the original mask for the scalars reused in the node. The
6194/// procedure transforms this mask in accordance with the given \p Mask.
6196 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6197 "Expected non-empty mask.");
6198 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6199 Prev.swap(Reuses);
6200 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6201 if (Mask[I] != PoisonMaskElem)
6202 Reuses[Mask[I]] = Prev[I];
6203}
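// Worked example: with Reuses = {1, 0, 3, 2} and Mask = {2, 3, 0, 1}, the loop
// performs Reuses[Mask[I]] = Prev[I] for each lane:
//   Reuses[2] = 1, Reuses[3] = 0, Reuses[0] = 3, Reuses[1] = 2,
// producing Reuses = {3, 2, 1, 0}. Lanes whose Mask entry is PoisonMaskElem
// contribute nothing, and destination slots that are never written keep their
// previous values.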
6204
6205/// Reorders the given \p Order according to the given \p Mask. \p Order is
6206/// the original order of the scalars. The procedure transforms the provided order
6207/// in accordance with the given \p Mask. If the resulting \p Order is just an
6208/// identity order, \p Order is cleared.
6210 bool BottomOrder = false) {
6211 assert(!Mask.empty() && "Expected non-empty mask.");
6212 unsigned Sz = Mask.size();
6213 if (BottomOrder) {
6214 SmallVector<unsigned> PrevOrder;
6215 if (Order.empty()) {
6216 PrevOrder.resize(Sz);
6217 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6218 } else {
6219 PrevOrder.swap(Order);
6220 }
6221 Order.assign(Sz, Sz);
6222 for (unsigned I = 0; I < Sz; ++I)
6223 if (Mask[I] != PoisonMaskElem)
6224 Order[I] = PrevOrder[Mask[I]];
6225 if (all_of(enumerate(Order), [&](const auto &Data) {
6226 return Data.value() == Sz || Data.index() == Data.value();
6227 })) {
6228 Order.clear();
6229 return;
6230 }
6231 fixupOrderingIndices(Order);
6232 return;
6233 }
6234 SmallVector<int> MaskOrder;
6235 if (Order.empty()) {
6236 MaskOrder.resize(Sz);
6237 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6238 } else {
6239 inversePermutation(Order, MaskOrder);
6240 }
6241 reorderReuses(MaskOrder, Mask);
6242 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6243 Order.clear();
6244 return;
6245 }
6246 Order.assign(Sz, Sz);
6247 for (unsigned I = 0; I < Sz; ++I)
6248 if (MaskOrder[I] != PoisonMaskElem)
6249 Order[MaskOrder[I]] = I;
6250 fixupOrderingIndices(Order);
6251}
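// Worked example (top-order path): an empty Order is treated as the identity
// permutation, so Order = {} combined with Mask = {1, 0, 3, 2} produces
// Order = {1, 0, 3, 2}. Whenever the combined permutation turns out to be the
// identity, Order is cleared instead, which callers interpret as "no
// reordering required".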
6252
6253std::optional<BoUpSLP::OrdersType>
6254BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6255 bool TopToBottom, bool IgnoreReorder) {
6256 assert(TE.isGather() && "Expected gather node only.");
6257 // Try to find subvector extract/insert patterns and reorder only such
6258 // patterns.
6259 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6260 Type *ScalarTy = GatheredScalars.front()->getType();
6261 size_t NumScalars = GatheredScalars.size();
6262 if (!isValidElementType(ScalarTy))
6263 return std::nullopt;
6264 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6265 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6266 SmallVector<int> ExtractMask;
6267 SmallVector<int> Mask;
6270 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6272 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6273 /*ForOrder=*/true);
6274 // No shuffled operands - ignore.
6275 if (GatherShuffles.empty() && ExtractShuffles.empty())
6276 return std::nullopt;
6277 OrdersType CurrentOrder(NumScalars, NumScalars);
6278 if (GatherShuffles.size() == 1 &&
6279 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6280 Entries.front().front()->isSame(TE.Scalars)) {
6281 // If the node is fully matched during whole-tree rotation, there is no need
6282 // to consider the matching order; the whole tree is rotated.
6283 if (TopToBottom)
6284 return std::nullopt;
6285 // No need to keep the order for the same user node.
6286 if (Entries.front().front()->UserTreeIndex.UserTE ==
6287 TE.UserTreeIndex.UserTE)
6288 return std::nullopt;
6289 // No need to keep the order for the matched root node, if it can be freely
6290 // reordered.
6291 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6292 return std::nullopt;
6293 // If only 2 elements are shuffled and the matching node has reversed reuses,
6294 // there is no need to track the order; both orders work equally well.
6295 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6296 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6297 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6298 [](const auto &P) {
6299 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6300 }))
6301 return std::nullopt;
6302
6303 // Perfect match in the graph, will reuse the previously vectorized
6304 // node. Cost is 0.
6305 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6306 return CurrentOrder;
6307 }
6308 auto IsSplatMask = [](ArrayRef<int> Mask) {
6309 int SingleElt = PoisonMaskElem;
6310 return all_of(Mask, [&](int I) {
6311 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6312 SingleElt = I;
6313 return I == PoisonMaskElem || I == SingleElt;
6314 });
6315 };
6316 // Exclusive broadcast mask - ignore.
6317 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6318 (Entries.size() != 1 ||
6319 Entries.front().front()->ReorderIndices.empty())) ||
6320 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6321 return std::nullopt;
6322 SmallBitVector ShuffledSubMasks(NumParts);
6323 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6324 ArrayRef<int> Mask, int PartSz, int NumParts,
6325 function_ref<unsigned(unsigned)> GetVF) {
6326 for (int I : seq<int>(0, NumParts)) {
6327 if (ShuffledSubMasks.test(I))
6328 continue;
6329 const int VF = GetVF(I);
6330 if (VF == 0)
6331 continue;
6332 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6333 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6334 // Shuffle of at least 2 vectors - ignore.
6335 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6336 llvm::fill(Slice, NumScalars);
6337 ShuffledSubMasks.set(I);
6338 continue;
6339 }
6340 // Try to include as many elements from the mask as possible.
6341 int FirstMin = INT_MAX;
6342 bool SecondVecFound = false;
6343 for (int K : seq<int>(Limit)) {
6344 int Idx = Mask[I * PartSz + K];
6345 if (Idx == PoisonMaskElem) {
6346 Value *V = GatheredScalars[I * PartSz + K];
6347 if (isConstant(V) && !isa<PoisonValue>(V)) {
6348 SecondVecFound = true;
6349 break;
6350 }
6351 continue;
6352 }
6353 if (Idx < VF) {
6354 if (FirstMin > Idx)
6355 FirstMin = Idx;
6356 } else {
6357 SecondVecFound = true;
6358 break;
6359 }
6360 }
6361 FirstMin = (FirstMin / PartSz) * PartSz;
6362 // Shuffle of at least 2 vectors - ignore.
6363 if (SecondVecFound) {
6364 llvm::fill(Slice, NumScalars);
6365 ShuffledSubMasks.set(I);
6366 continue;
6367 }
6368 for (int K : seq<int>(Limit)) {
6369 int Idx = Mask[I * PartSz + K];
6370 if (Idx == PoisonMaskElem)
6371 continue;
6372 Idx -= FirstMin;
6373 if (Idx >= PartSz) {
6374 SecondVecFound = true;
6375 break;
6376 }
6377 if (CurrentOrder[I * PartSz + Idx] >
6378 static_cast<unsigned>(I * PartSz + K) &&
6379 CurrentOrder[I * PartSz + Idx] !=
6380 static_cast<unsigned>(I * PartSz + Idx))
6381 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6382 }
6383 // Shuffle of at least 2 vectors - ignore.
6384 if (SecondVecFound) {
6385 llvm::fill(Slice, NumScalars);
6386 ShuffledSubMasks.set(I);
6387 continue;
6388 }
6389 }
6390 };
6391 int PartSz = getPartNumElems(NumScalars, NumParts);
6392 if (!ExtractShuffles.empty())
6393 TransformMaskToOrder(
6394 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6395 if (!ExtractShuffles[I])
6396 return 0U;
6397 unsigned VF = 0;
6398 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6399 for (unsigned Idx : seq<unsigned>(Sz)) {
6400 int K = I * PartSz + Idx;
6401 if (ExtractMask[K] == PoisonMaskElem)
6402 continue;
6403 if (!TE.ReuseShuffleIndices.empty())
6404 K = TE.ReuseShuffleIndices[K];
6405 if (K == PoisonMaskElem)
6406 continue;
6407 if (!TE.ReorderIndices.empty())
6408 K = std::distance(TE.ReorderIndices.begin(),
6409 find(TE.ReorderIndices, K));
6410 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6411 if (!EI)
6412 continue;
6413 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6414 ->getElementCount()
6415 .getKnownMinValue());
6416 }
6417 return VF;
6418 });
6419 // Check special corner case - single shuffle of the same entry.
6420 if (GatherShuffles.size() == 1 && NumParts != 1) {
6421 if (ShuffledSubMasks.any())
6422 return std::nullopt;
6423 PartSz = NumScalars;
6424 NumParts = 1;
6425 }
6426 if (!Entries.empty())
6427 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6428 if (!GatherShuffles[I])
6429 return 0U;
6430 return std::max(Entries[I].front()->getVectorFactor(),
6431 Entries[I].back()->getVectorFactor());
6432 });
6433 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6434 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6435 return std::nullopt;
6436 return std::move(CurrentOrder);
6437}
6438
6439static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6440 const TargetLibraryInfo &TLI,
6441 bool CompareOpcodes = true) {
6444 return false;
6445 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6446 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6447 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6448 (!GEP2 || GEP2->getNumOperands() == 2) &&
6449 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6450 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6451 !CompareOpcodes ||
6452 (GEP1 && GEP2 &&
6453 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6454}
6455
6456/// Calculates minimal alignment as a common alignment.
6457template <typename T>
6459 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6460 for (Value *V : VL)
6461 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6462 return CommonAlignment;
6463}
6464
6465/// Check if \p Order represents reverse order.
6467 assert(!Order.empty() &&
6468 "Order is empty. Please check it before using isReverseOrder.");
6469 unsigned Sz = Order.size();
6470 return all_of(enumerate(Order), [&](const auto &Pair) {
6471 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6472 });
6473}
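// For illustration, with Sz = 4 both {3, 2, 1, 0} and {3, 2, 4, 0} (where the
// value 4 == Sz marks an undefined slot) are treated as reverse orders, while
// {2, 3, 0, 1} is not.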
6474
6475/// Checks if the provided list of pointers \p PointerOps represents strided
6476/// pointers for type \p ElemTy. If they are not, nullptr is returned.
6477/// Otherwise, the SCEV of the stride value is returned.
6478static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6479 const DataLayout &DL, ScalarEvolution &SE,
6480 SmallVectorImpl<unsigned> &SortedIndices) {
6482 const SCEV *PtrSCEVLowest = nullptr;
6483 const SCEV *PtrSCEVHighest = nullptr;
6484 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6485 // addresses).
6486 for (Value *Ptr : PointerOps) {
6487 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6488 if (!PtrSCEV)
6489 return nullptr;
6490 SCEVs.push_back(PtrSCEV);
6491 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6492 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6493 continue;
6494 }
6495 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6496 if (isa<SCEVCouldNotCompute>(Diff))
6497 return nullptr;
6498 if (Diff->isNonConstantNegative()) {
6499 PtrSCEVLowest = PtrSCEV;
6500 continue;
6501 }
6502 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6503 if (isa<SCEVCouldNotCompute>(Diff1))
6504 return nullptr;
6505 if (Diff1->isNonConstantNegative()) {
6506 PtrSCEVHighest = PtrSCEV;
6507 continue;
6508 }
6509 }
6510 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6511 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6512 if (isa<SCEVCouldNotCompute>(Dist))
6513 return nullptr;
6514 int Size = DL.getTypeStoreSize(ElemTy);
6515 auto TryGetStride = [&](const SCEV *Dist,
6516 const SCEV *Multiplier) -> const SCEV * {
6517 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6518 if (M->getOperand(0) == Multiplier)
6519 return M->getOperand(1);
6520 if (M->getOperand(1) == Multiplier)
6521 return M->getOperand(0);
6522 return nullptr;
6523 }
6524 if (Multiplier == Dist)
6525 return SE.getConstant(Dist->getType(), 1);
6526 return SE.getUDivExactExpr(Dist, Multiplier);
6527 };
6528 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6529 const SCEV *Stride = nullptr;
6530 if (Size != 1 || SCEVs.size() > 2) {
6531 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6532 Stride = TryGetStride(Dist, Sz);
6533 if (!Stride)
6534 return nullptr;
6535 }
6536 if (!Stride || isa<SCEVConstant>(Stride))
6537 return nullptr;
6538 // Iterate through all pointers and check if all distances are
6539 // unique multiples of Stride.
6540 using DistOrdPair = std::pair<int64_t, int>;
6541 auto Compare = llvm::less_first();
6542 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6543 int Cnt = 0;
6544 bool IsConsecutive = true;
6545 for (const SCEV *PtrSCEV : SCEVs) {
6546 unsigned Dist = 0;
6547 if (PtrSCEV != PtrSCEVLowest) {
6548 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6549 const SCEV *Coeff = TryGetStride(Diff, Stride);
6550 if (!Coeff)
6551 return nullptr;
6552 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6553 if (!SC || isa<SCEVCouldNotCompute>(SC))
6554 return nullptr;
6555 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6556 SE.getMulExpr(Stride, SC)))
6557 ->isZero())
6558 return nullptr;
6559 Dist = SC->getAPInt().getZExtValue();
6560 }
6561 // If the distance is not a multiple of the element size, or the element index is out of range, we can't vectorize.
6562 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6563 return nullptr;
6564 auto Res = Offsets.emplace(Dist, Cnt);
6565 if (!Res.second)
6566 return nullptr;
6567 // Consecutive order if the inserted element is the last one.
6568 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6569 ++Cnt;
6570 }
6571 if (Offsets.size() != SCEVs.size())
6572 return nullptr;
6573 SortedIndices.clear();
6574 if (!IsConsecutive) {
6575 // Fill SortedIndices array only if it is non-consecutive.
6576 SortedIndices.resize(PointerOps.size());
6577 Cnt = 0;
6578 for (const std::pair<int64_t, int> &Pair : Offsets) {
6579 SortedIndices[Cnt] = Pair.second;
6580 ++Cnt;
6581 }
6582 }
6583 return Stride;
6584}
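// Illustrative example (hypothetical IR values): four i32 loads from %p,
// %p + 4*%s, %p + 8*%s and %p + 12*%s bytes, with %s unknown at compile time,
// give Dist = 12*%s and Size = 4, from which the routine recovers Stride = %s,
// i.e. a runtime stride of %s elements. Each pointer's distance from the
// lowest one must then be a distinct multiple of the stride, and SortedIndices
// records the permutation that sorts the pointers by offset when they were not
// already visited in that order.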
6585
6586static std::pair<InstructionCost, InstructionCost>
6587getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6588 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6589 Type *ScalarTy, VectorType *VecTy);
6590
6591/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6592/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the insert
6593/// subvector pattern.
6594static InstructionCost
6596 VectorType *Tp, ArrayRef<int> Mask = {},
6598 int Index = 0, VectorType *SubTp = nullptr,
6600 VectorType *DstTy = Tp;
6601 if (!Mask.empty())
6602 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6603
6604 if (Kind != TTI::SK_PermuteTwoSrc)
6605 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6606 Args);
6607 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6608 int NumSubElts;
6610 Mask, NumSrcElts, NumSubElts, Index)) {
6611 if (Index + NumSubElts > NumSrcElts &&
6612 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6613 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6614 TTI::TCK_RecipThroughput, Index, Tp);
6615 }
6616 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6617 Args);
6618}
6619
6620/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6621/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6622/// instead of a scalar.
6623static InstructionCost
6625 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6626 bool Extract, TTI::TargetCostKind CostKind,
6627 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6629 "ScalableVectorType is not supported.");
6630 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6631 getNumElements(Ty) &&
6632 "Incorrect usage.");
6633 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6634 assert(SLPReVec && "Only supported by REVEC.");
6635 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6636 // of CreateInsertElement.
6637 unsigned ScalarTyNumElements = VecTy->getNumElements();
6639 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6640 if (!DemandedElts[I])
6641 continue;
6642 if (Insert)
6644 I * ScalarTyNumElements, VecTy);
6645 if (Extract)
6647 I * ScalarTyNumElements, VecTy);
6648 }
6649 return Cost;
6650 }
6651 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6652 CostKind, ForPoisonSrc, VL);
6653}
6654
6655/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6656/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6658 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6659 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6660 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6661 if (Opcode == Instruction::ExtractElement) {
6662 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6663 assert(SLPReVec && "Only supported by REVEC.");
6664 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6666 cast<VectorType>(Val), {}, CostKind,
6667 Index * VecTy->getNumElements(), VecTy);
6668 }
6669 }
6670 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6671 ScalarUserAndIdx);
6672}
6673
6674/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6675/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6677 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6678 VectorType *VecTy, unsigned Index,
6680 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6681 assert(SLPReVec && "Only supported by REVEC.");
6682 auto *SubTp =
6683 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6685 Index * ScalarTy->getNumElements(), SubTp) +
6686 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6687 CostKind);
6688 }
6689 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6690}
6691
6692/// Creates a subvector insert. Generates the shuffle using \p Generator or
6693/// using a default shuffle.
6695 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6696 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6697 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6698 return Vec;
6699 const unsigned SubVecVF = getNumElements(V->getType());
6700 // Create a shuffle; insertvector requires that the index is a multiple of
6701 // the subvector length.
6702 const unsigned VecVF = getNumElements(Vec->getType());
6703 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6704 if (isa<PoisonValue>(Vec)) {
6705 auto *Begin = std::next(Mask.begin(), Index);
6706 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6707 Vec = Builder.CreateShuffleVector(V, Mask);
6708 return Vec;
6709 }
6710 std::iota(Mask.begin(), Mask.end(), 0);
6711 std::iota(std::next(Mask.begin(), Index),
6712 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6713 if (Generator)
6714 return Generator(Vec, V, Mask);
6715 // 1. Resize V to the size of Vec.
6716 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6717 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6718 V = Builder.CreateShuffleVector(V, ResizeMask);
6719 // 2. Insert V into Vec.
6720 return Builder.CreateShuffleVector(Vec, V, Mask);
6721}
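// Worked example: inserting a 4-element subvector V into an 8-element vector
// Vec at Index = 4 (with no custom Generator) first widens V with the resize
// mask {0, 1, 2, 3, poison, poison, poison, poison} and then blends it in with
// the two-source mask {0, 1, 2, 3, 8, 9, 10, 11}, where indices >= 8 select
// lanes of the widened V. If Vec is poison, a single one-source shuffle of V
// with the placement mask is emitted instead.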
6722
6723/// Generates a subvector extract using a default shuffle.
6725 unsigned SubVecVF, unsigned Index) {
6726 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6727 std::iota(Mask.begin(), Mask.end(), Index);
6728 return Builder.CreateShuffleVector(Vec, Mask);
6729}
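// Worked example: extracting a 4-element subvector starting at Index = 8 emits
// a shufflevector with mask {8, 9, 10, 11}, i.e. lanes 8..11 of Vec become
// lanes 0..3 of the result.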
6730
6731/// Builds a compress-like shuffle mask for the given \p PointerOps, ordered
6732/// with \p Order.
6733/// \return true if the mask represents a strided access, false otherwise.
6735 ArrayRef<unsigned> Order, Type *ScalarTy,
6736 const DataLayout &DL, ScalarEvolution &SE,
6737 SmallVectorImpl<int> &CompressMask) {
6738 const unsigned Sz = PointerOps.size();
6739 CompressMask.assign(Sz, PoisonMaskElem);
6740 // The first element is always set to 0.
6741 CompressMask[0] = 0;
6742 // Check if the mask represents strided access.
6743 std::optional<unsigned> Stride = 0;
6744 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6745 for (unsigned I : seq<unsigned>(1, Sz)) {
6746 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6747 std::optional<int64_t> OptPos =
6748 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6749 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6750 return false;
6751 unsigned Pos = static_cast<unsigned>(*OptPos);
6752 CompressMask[I] = Pos;
6753 if (!Stride)
6754 continue;
6755 if (*Stride == 0) {
6756 *Stride = Pos;
6757 continue;
6758 }
6759 if (Pos != *Stride * I)
6760 Stride.reset();
6761 }
6762 return Stride.has_value();
6763}
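// Worked example: if the (ordered) pointers sit at element offsets
// {0, 2, 4, 6} from the first one, the routine builds
// CompressMask = {0, 2, 4, 6} and returns true, since lane I sits at offset
// 2 * I (stride 2). Offsets {0, 1, 3, 6} still yield
// CompressMask = {0, 1, 3, 6} for a load + compress shuffle, but the function
// returns false because no uniform stride exists.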
6764
6765/// Checks if the \p VL can be transformed to a (masked)load + compress or
6766/// (masked) interleaved load.
6768 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6771 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6772 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6773 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6774 VectorType *&LoadVecTy) {
6775 InterleaveFactor = 0;
6776 Type *ScalarTy = VL.front()->getType();
6777 const size_t Sz = VL.size();
6778 auto *VecTy = getWidenedType(ScalarTy, Sz);
6780 SmallVector<int> Mask;
6781 if (!Order.empty())
6782 inversePermutation(Order, Mask);
6783 // Check external uses.
6784 for (const auto [I, V] : enumerate(VL)) {
6785 if (AreAllUsersVectorized(V))
6786 continue;
6787 InstructionCost ExtractCost =
6788 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6789 Mask.empty() ? I : Mask[I]);
6790 InstructionCost ScalarCost =
6791 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6792 if (ExtractCost <= ScalarCost)
6793 return false;
6794 }
6795 Value *Ptr0;
6796 Value *PtrN;
6797 if (Order.empty()) {
6798 Ptr0 = PointerOps.front();
6799 PtrN = PointerOps.back();
6800 } else {
6801 Ptr0 = PointerOps[Order.front()];
6802 PtrN = PointerOps[Order.back()];
6803 }
6804 std::optional<int64_t> Diff =
6805 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6806 if (!Diff)
6807 return false;
6808 const size_t MaxRegSize =
6810 .getFixedValue();
6811 // Check for very large distances between elements.
6812 if (*Diff / Sz >= MaxRegSize / 8)
6813 return false;
6814 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6815 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6816 Align CommonAlignment = LI->getAlign();
6817 IsMasked = !isSafeToLoadUnconditionally(
6818 Ptr0, LoadVecTy, CommonAlignment, DL,
6819 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6820 &TLI);
6821 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6822 LI->getPointerAddressSpace()))
6823 return false;
6824 // TODO: perform the analysis of each scalar load for better
6825 // safe-load-unconditionally analysis.
6826 bool IsStrided =
6827 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6828 assert(CompressMask.size() >= 2 && "At least two elements are required");
6829 SmallVector<Value *> OrderedPointerOps(PointerOps);
6830 if (!Order.empty())
6831 reorderScalars(OrderedPointerOps, Mask);
6832 auto [ScalarGEPCost, VectorGEPCost] =
6833 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6834 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6835 // The cost of scalar loads.
6836 InstructionCost ScalarLoadsCost =
6837 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6838 [&](InstructionCost C, Value *V) {
6839 return C + TTI.getInstructionCost(cast<Instruction>(V),
6840 CostKind);
6841 }) +
6842 ScalarGEPCost;
6843 APInt DemandedElts = APInt::getAllOnes(Sz);
6844 InstructionCost GatherCost =
6845 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6846 /*Insert=*/true,
6847 /*Extract=*/false, CostKind) +
6848 ScalarLoadsCost;
6849 InstructionCost LoadCost = 0;
6850 if (IsMasked) {
6851 LoadCost =
6852 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6853 LI->getPointerAddressSpace(), CostKind);
6854 } else {
6855 LoadCost =
6856 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6857 LI->getPointerAddressSpace(), CostKind);
6858 }
6859 if (IsStrided && !IsMasked && Order.empty()) {
6860 // Check for potential segmented(interleaved) loads.
6861 VectorType *AlignedLoadVecTy = getWidenedType(
6862 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6863 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6864 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6865 &TLI))
6866 AlignedLoadVecTy = LoadVecTy;
6867 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6868 CommonAlignment,
6869 LI->getPointerAddressSpace())) {
6870 InstructionCost InterleavedCost =
6871 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6872 Instruction::Load, AlignedLoadVecTy,
6873 CompressMask[1], {}, CommonAlignment,
6874 LI->getPointerAddressSpace(), CostKind, IsMasked);
6875 if (InterleavedCost < GatherCost) {
6876 InterleaveFactor = CompressMask[1];
6877 LoadVecTy = AlignedLoadVecTy;
6878 return true;
6879 }
6880 }
6881 }
6882 InstructionCost CompressCost = ::getShuffleCost(
6883 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6884 if (!Order.empty()) {
6885 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6886 for (unsigned I : seq<unsigned>(Sz)) {
6887 NewMask[I] = CompressMask[Mask[I]];
6888 }
6889 CompressMask.swap(NewMask);
6890 }
6891 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6892 return TotalVecCost < GatherCost;
6893}
6894
6895/// Checks if the \p VL can be transformed to a (masked)load + compress or
6896/// (masked) interleaved load.
6897static bool
6900 const DataLayout &DL, ScalarEvolution &SE,
6901 AssumptionCache &AC, const DominatorTree &DT,
6902 const TargetLibraryInfo &TLI,
6903 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6904 bool IsMasked;
6905 unsigned InterleaveFactor;
6906 SmallVector<int> CompressMask;
6907 VectorType *LoadVecTy;
6908 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6909 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6910 CompressMask, LoadVecTy);
6911}
6912
6913/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6914/// PointerOps:
6915/// 1. Target with strided load support is detected.
6916/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6917/// potential stride <= MaxProfitableLoadStride and the potential stride is
6918/// power-of-2 (to avoid perf regressions for the very small number of loads)
6919/// and max distance > number of loads, or potential stride is -1.
6920/// 3. The loads are ordered, or the number of unordered loads is <=
6921/// MaxProfitableUnorderedLoads, or the loads are in reversed order (this check
6922/// avoids extra costs for very expensive shuffles).
6923/// 4. Any pointer operand is an instruction with users outside of the
6924/// current graph (for masked gathers, extra extractelement instructions
6925/// might be required). (See the illustrative example following this function.)
6927 Align Alignment, const int64_t Diff,
6928 const size_t Sz) const {
6929 if (Diff % (Sz - 1) != 0)
6930 return false;
6931
6932 // Try to generate strided load node.
6933 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6934 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6935 return !isVectorized(U) && !MustGather.contains(U);
6936 });
6937 });
6938
6939 const uint64_t AbsoluteDiff = std::abs(Diff);
6940 auto *VecTy = getWidenedType(ScalarTy, Sz);
6941 if (IsAnyPointerUsedOutGraph ||
6942 (AbsoluteDiff > Sz &&
6944 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6945 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6946 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6947 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6948 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6949 return false;
6950 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6951 return false;
6952 return true;
6953 }
6954 return false;
6955}
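// Illustrative example: for Sz = 4 loads whose sorted pointers span
// Diff = 12 elements, the implied stride is 12 / (4 - 1) = 4 elements; the
// candidate is accepted only if the profitability conditions above hold and
// the target reports isLegalStridedLoadStore() for the widened vector type.
// A span of Diff = -(Sz - 1), i.e. simply reversed consecutive loads, always
// qualifies as a strided-load candidate, subject to the same legality check.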
6956
6958 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
6959 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
6960 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
6961 const size_t Sz = PointerOps.size();
6962 if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
6963 return false;
6964
6965 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6966
6967 // Iterate through all pointers and check if all distances are
6968 // unique multiples of Stride.
6970 for (Value *Ptr : PointerOps) {
6971 int64_t Dist = 0;
6972 if (Ptr == PtrN)
6973 Dist = Diff;
6974 else if (Ptr != Ptr0)
6975 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
6976 // If a distance is not a multiple of the stride, or is repeated, we can't
6977 // vectorize.
6978 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6979 break;
6980 }
6981 if (Dists.size() == Sz) {
6982 Type *StrideTy = DL->getIndexType(Ptr0->getType());
6983 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6984 SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
6985 return true;
6986 }
6987 return false;
6988}
6989
6991 Type *ScalarTy, Align CommonAlignment,
6992 SmallVectorImpl<unsigned> &SortedIndices,
6993 StridedPtrInfo &SPtrInfo) const {
6994 const unsigned Sz = PointerOps.size();
6995 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
6996 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6997 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6998 return false;
6999 if (const SCEV *Stride =
7000 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
7001 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
7002 SPtrInfo.StrideSCEV = Stride;
7003 return true;
7004 }
7005 return false;
7006}
7007
7009 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7010 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7011 unsigned *BestVF, bool TryRecursiveCheck) const {
7012 // Check that a vectorized load would load the same memory as a scalar
7013 // load. For example, we don't want to vectorize loads that are smaller
7014 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
7015 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7016 // from such a struct, we read/write packed bits disagreeing with the
7017 // unvectorized version.
7018 if (BestVF)
7019 *BestVF = 0;
7021 return LoadsState::Gather;
7022 Type *ScalarTy = VL0->getType();
7023
7024 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7025 return LoadsState::Gather;
7026
7027 // Make sure all loads in the bundle are simple - we can't vectorize
7028 // atomic or volatile loads.
7029 PointerOps.clear();
7030 const size_t Sz = VL.size();
7031 PointerOps.resize(Sz);
7032 auto *POIter = PointerOps.begin();
7033 for (Value *V : VL) {
7034 auto *L = dyn_cast<LoadInst>(V);
7035 if (!L || !L->isSimple())
7036 return LoadsState::Gather;
7037 *POIter = L->getPointerOperand();
7038 ++POIter;
7039 }
7040
7041 Order.clear();
7042 // Check the order of pointer operands or that all pointers are the same.
7043 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7044
7045 auto *VecTy = getWidenedType(ScalarTy, Sz);
7046 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7047 if (!IsSorted) {
7048 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7049 SPtrInfo))
7051
7052 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7053 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7054 return LoadsState::Gather;
7055
7056 if (!all_of(PointerOps, [&](Value *P) {
7057 return arePointersCompatible(P, PointerOps.front(), *TLI);
7058 }))
7059 return LoadsState::Gather;
7060
7061 } else {
7062 Value *Ptr0;
7063 Value *PtrN;
7064 if (Order.empty()) {
7065 Ptr0 = PointerOps.front();
7066 PtrN = PointerOps.back();
7067 } else {
7068 Ptr0 = PointerOps[Order.front()];
7069 PtrN = PointerOps[Order.back()];
7070 }
7071 std::optional<int64_t> Diff =
7072 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7073 // Check that the sorted loads are consecutive.
7074 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7075 return LoadsState::Vectorize;
7076 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7077 *TLI, [&](Value *V) {
7078 return areAllUsersVectorized(
7079 cast<Instruction>(V), UserIgnoreList);
7080 }))
7082 Align Alignment =
7083 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7084 ->getAlign();
7085 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7086 *Diff, Ptr0, PtrN, SPtrInfo))
7088 }
7089 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7090 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7091 return LoadsState::Gather;
7092 // Compare the cost of loads + shuffles against the cost of strided/masked
7093 // gather loads. Returns true if the vectorized loads + shuffles
7094 // representation is better than just a gather.
7095 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7096 unsigned *BestVF,
7097 bool ProfitableGatherPointers) {
7098 if (BestVF)
7099 *BestVF = 0;
7100 // Compare masked gather cost and loads + insert subvector costs.
7102 auto [ScalarGEPCost, VectorGEPCost] =
7103 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7104 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7105 // Estimate the cost of masked gather GEP. If not a splat, roughly
7106 // estimate as a buildvector, otherwise estimate as splat.
7107 APInt DemandedElts = APInt::getAllOnes(Sz);
7108 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7109 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7110 if (static_cast<unsigned>(count_if(
7111 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7112 any_of(PointerOps, [&](Value *V) {
7113 return getUnderlyingObject(V) !=
7114 getUnderlyingObject(PointerOps.front());
7115 }))
7116 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7117 DemandedElts, /*Insert=*/true,
7118 /*Extract=*/false, CostKind);
7119 else
7120 VectorGEPCost +=
7122 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7123 /*Insert=*/true, /*Extract=*/false, CostKind) +
7124 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7125 // The cost of scalar loads.
7126 InstructionCost ScalarLoadsCost =
7127 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7128 [&](InstructionCost C, Value *V) {
7129 return C + TTI.getInstructionCost(
7131 }) +
7132 ScalarGEPCost;
7133 // The cost of masked gather.
7134 InstructionCost MaskedGatherCost =
7135 TTI.getGatherScatterOpCost(
7136 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7137 /*VariableMask=*/false, CommonAlignment, CostKind) +
7138 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7139 InstructionCost GatherCost =
7140 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7141 /*Insert=*/true,
7142 /*Extract=*/false, CostKind) +
7143 ScalarLoadsCost;
7144 // The list of loads is small, or a partial check was already performed -
7145 // directly compare the masked gather cost and the gather cost.
7146 constexpr unsigned ListLimit = 4;
7147 if (!TryRecursiveCheck || VL.size() < ListLimit)
7148 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7149
7150 // FIXME: The following code has not been updated for non-power-of-2
7151 // vectors (and not whole registers). The splitting logic here does not
7152 // cover the original vector if the vector factor is not a power of two.
7153 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7154 return false;
7155
7156 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7157 unsigned MinVF = getMinVF(2 * Sz);
7158 DemandedElts.clearAllBits();
7159 // Iterate through possible vectorization factors and check if vectorized +
7160 // shuffles is better than just gather.
7161 for (unsigned VF =
7162 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7163 VF >= MinVF;
7164 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7166 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7167 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7169 SmallVector<Value *> PointerOps;
7170 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7171 PointerOps, SPtrInfo, BestVF,
7172 /*TryRecursiveCheck=*/false);
7173 // Check that the sorted loads are consecutive.
7174 if (LS == LoadsState::Gather) {
7175 if (BestVF) {
7176 DemandedElts.setAllBits();
7177 break;
7178 }
7179 DemandedElts.setBits(Cnt, Cnt + VF);
7180 continue;
7181 }
7182 // If a reorder is needed, consider it a high-cost masked gather for now.
7183 if ((LS == LoadsState::Vectorize ||
7186 !Order.empty() && !isReverseOrder(Order))
7188 States.push_back(LS);
7189 }
7190 if (DemandedElts.isAllOnes())
7191 // All loads gathered - try smaller VF.
7192 continue;
7193 // Can be vectorized later as a series of loads/insertelements.
7194 InstructionCost VecLdCost = 0;
7195 if (!DemandedElts.isZero()) {
7196 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7197 /*Insert=*/true,
7198 /*Extract=*/false, CostKind) +
7199 ScalarGEPCost;
7200 for (unsigned Idx : seq<unsigned>(VL.size()))
7201 if (DemandedElts[Idx])
7202 VecLdCost +=
7203 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7204 }
7205 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7206 for (auto [I, LS] : enumerate(States)) {
7207 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7208 InstructionCost VectorGEPCost =
7209 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7210 ? 0
7211 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7212 LI0->getPointerOperand(),
7213 Instruction::GetElementPtr, CostKind, ScalarTy,
7214 SubVecTy)
7215 .second;
7216 if (LS == LoadsState::ScatterVectorize) {
7217 if (static_cast<unsigned>(
7218 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7219 PointerOps.size() - 1 ||
7220 any_of(PointerOps, [&](Value *V) {
7221 return getUnderlyingObject(V) !=
7222 getUnderlyingObject(PointerOps.front());
7223 }))
7224 VectorGEPCost += getScalarizationOverhead(
7225 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7226 /*Insert=*/true, /*Extract=*/false, CostKind);
7227 else
7228 VectorGEPCost +=
7230 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7231 /*Insert=*/true, /*Extract=*/false, CostKind) +
7232 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7233 CostKind);
7234 }
7235 switch (LS) {
7237 VecLdCost +=
7238 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7239 LI0->getPointerAddressSpace(), CostKind,
7241 VectorGEPCost;
7242 break;
7244 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7245 LI0->getPointerOperand(),
7246 /*VariableMask=*/false,
7247 CommonAlignment, CostKind) +
7248 VectorGEPCost;
7249 break;
7251 VecLdCost += TTI.getMaskedMemoryOpCost(
7252 Instruction::Load, SubVecTy, CommonAlignment,
7253 LI0->getPointerAddressSpace(), CostKind) +
7254 VectorGEPCost +
7256 {}, CostKind);
7257 break;
7259 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7260 LI0->getPointerOperand(),
7261 /*VariableMask=*/false,
7262 CommonAlignment, CostKind) +
7263 VectorGEPCost;
7264 break;
7265 case LoadsState::Gather:
7266 // Gathers are already calculated - ignore.
7267 continue;
7268 }
7269 SmallVector<int> ShuffleMask(VL.size());
7270 for (int Idx : seq<int>(0, VL.size()))
7271 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7272 if (I > 0)
7273 VecLdCost +=
7274 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7275 CostKind, I * VF, SubVecTy);
7276 }
7277 // If masked gather cost is higher - better to vectorize, so
7278 // consider it as a gather node. It will be better estimated
7279 // later.
7280 if (MaskedGatherCost >= VecLdCost &&
7281 VecLdCost - GatherCost < -SLPCostThreshold) {
7282 if (BestVF)
7283 *BestVF = VF;
7284 return true;
7285 }
7286 }
7287 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7288 };
7289 // TODO: need to improve analysis of the pointers, if not all of them are
7290 // GEPs or have > 2 operands, we end up with a gather node, which just
7291 // increases the cost.
7292 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7293 bool ProfitableGatherPointers =
7294 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7295 return L->isLoopInvariant(V);
7296 })) <= Sz / 2;
7297 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7299 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7300 (GEP && GEP->getNumOperands() == 2 &&
7301 isa<Constant, Instruction>(GEP->getOperand(1)));
7302 })) {
7303 // Check if potential masked gather can be represented as series
7304 // of loads + insertsubvectors.
7305 // If masked gather cost is higher - better to vectorize, so
7306 // consider it as a gather node. It will be better estimated
7307 // later.
7308 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7309 ProfitableGatherPointers))
7311 }
7312
7313 return LoadsState::Gather;
7314}
7315
7317 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7318 const DataLayout &DL, ScalarEvolution &SE,
7319 SmallVectorImpl<unsigned> &SortedIndices) {
7320 assert(
7321 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7322 "Expected list of pointer operands.");
7323 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7324 // Ptr into, sort and return the sorted indices with values next to one
7325 // another.
7327 std::pair<BasicBlock *, Value *>,
7329 Bases;
7330 Bases
7331 .try_emplace(std::make_pair(
7333 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7334
7335 SortedIndices.clear();
7336 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7337 auto Key = std::make_pair(BBs[Cnt + 1],
7339 bool Found = any_of(Bases.try_emplace(Key).first->second,
7340 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7341 std::optional<int64_t> Diff =
7342 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7343 ElemTy, Ptr, DL, SE,
7344 /*StrictCheck=*/true);
7345 if (!Diff)
7346 return false;
7347
7348 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7349 return true;
7350 });
7351
7352 if (!Found) {
7353 // If we haven't found enough to usefully cluster, return early.
7354 if (Bases.size() > VL.size() / 2 - 1)
7355 return false;
7356
7357 // Not found already - add a new Base
7358 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7359 }
7360 }
7361
7362 if (Bases.size() == VL.size())
7363 return false;
7364
7365 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7366 Bases.front().second.size() == VL.size()))
7367 return false;
7368
7369 // For each of the bases, sort the pointers by Offset and check if any of the
7370 // bases become consecutively allocated.
7371 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7372 SmallPtrSet<Value *, 13> FirstPointers;
7373 SmallPtrSet<Value *, 13> SecondPointers;
7374 Value *P1 = Ptr1;
7375 Value *P2 = Ptr2;
7376 unsigned Depth = 0;
7377 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7378 if (P1 == P2 || Depth > RecursionMaxDepth)
7379 return false;
7380 FirstPointers.insert(P1);
7381 SecondPointers.insert(P2);
7382 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7383 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7384 ++Depth;
7385 }
7386 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7387 "Unable to find matching root.");
7388 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7389 };
7390 for (auto &Base : Bases) {
7391 for (auto &Vec : Base.second) {
7392 if (Vec.size() > 1) {
7394 int64_t InitialOffset = std::get<1>(Vec[0]);
7395 bool AnyConsecutive =
7396 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7397 return std::get<1>(P.value()) ==
7398 int64_t(P.index()) + InitialOffset;
7399 });
7400 // Fill the SortedIndices array only if it looks worthwhile to sort the
7401 // pointers.
7402 if (!AnyConsecutive)
7403 return false;
7404 }
7405 }
7406 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7407 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7408 });
7409 }
7410
7411 for (auto &T : Bases)
7412 for (const auto &Vec : T.second)
7413 for (const auto &P : Vec)
7414 SortedIndices.push_back(std::get<2>(P));
7415
7416 assert(SortedIndices.size() == VL.size() &&
7417 "Expected SortedIndices to be the size of VL");
7418 return true;
7419}
7420
7421std::optional<BoUpSLP::OrdersType>
7422BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7423 assert(TE.isGather() && "Expected gather node only.");
7424 Type *ScalarTy = TE.Scalars[0]->getType();
7425
7427 Ptrs.reserve(TE.Scalars.size());
7429 BBs.reserve(TE.Scalars.size());
7430 for (Value *V : TE.Scalars) {
7431 auto *L = dyn_cast<LoadInst>(V);
7432 if (!L || !L->isSimple())
7433 return std::nullopt;
7434 Ptrs.push_back(L->getPointerOperand());
7435 BBs.push_back(L->getParent());
7436 }
7437
7438 BoUpSLP::OrdersType Order;
7439 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7440 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7441 return std::move(Order);
7442 return std::nullopt;
7443}
7444
7445/// Check if two insertelement instructions are from the same buildvector.
7448 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7449 // Instructions must be from the same basic block.
7450 if (VU->getParent() != V->getParent())
7451 return false;
7452 // Checks if 2 insertelements are from the same buildvector.
7453 if (VU->getType() != V->getType())
7454 return false;
7455 // Multiple used inserts are separate nodes.
7456 if (!VU->hasOneUse() && !V->hasOneUse())
7457 return false;
7458 auto *IE1 = VU;
7459 auto *IE2 = V;
7460 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7461 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7462 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7463 return false;
7464 // Go through the vector operand of insertelement instructions trying to find
7465 // either VU as the original vector for IE2 or V as the original vector for
7466 // IE1.
7467 SmallBitVector ReusedIdx(
7468 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7469 bool IsReusedIdx = false;
7470 do {
7471 if (IE2 == VU && !IE1)
7472 return VU->hasOneUse();
7473 if (IE1 == V && !IE2)
7474 return V->hasOneUse();
7475 if (IE1 && IE1 != V) {
7476 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7477 IsReusedIdx |= ReusedIdx.test(Idx1);
7478 ReusedIdx.set(Idx1);
7479 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7480 IE1 = nullptr;
7481 else
7482 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7483 }
7484 if (IE2 && IE2 != VU) {
7485 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7486 IsReusedIdx |= ReusedIdx.test(Idx2);
7487 ReusedIdx.set(Idx2);
7488 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7489 IE2 = nullptr;
7490 else
7491 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7492 }
7493 } while (!IsReusedIdx && (IE1 || IE2));
7494 return false;
7495}
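// Illustrative example (hypothetical IR, assuming GetBaseOperand returns the
// vector operand of an insertelement):
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i32 1
//   %v2 = insertelement <4 x float> %v1,    float %c, i32 2
// For VU = %v2 and V = %v0 the function returns true, since walking the vector
// operands from %v2 reaches %v0 without writing any lane twice. Inserts into
// unrelated vectors, or chains that redefine the same lane, return false.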
7496
7497/// Checks if the specified instruction \p I is an alternate operation for
7498/// the given \p MainOp and \p AltOp instructions.
7499static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7500 Instruction *AltOp,
7501 const TargetLibraryInfo &TLI);
7502
7503std::optional<BoUpSLP::OrdersType>
7504BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7505 bool IgnoreReorder) {
7506 // No need to reorder if the reuses need to be shuffled; the node still needs
7507 // to be shuffled anyway.
7508 if (!TE.ReuseShuffleIndices.empty()) {
7509 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7510 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7511 "Reshuffling scalars not yet supported for nodes with padding");
7512
7513 if (isSplat(TE.Scalars))
7514 return std::nullopt;
7515 // Check if reuse shuffle indices can be improved by reordering.
7516 // For this, check that the reuse mask is "clustered", i.e. each scalar value
7517 // is used once in each submask of size <number_of_scalars>.
7518 // Example: 4 scalar values.
7519 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7520 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7521 // element 3 is used twice in the second submask.
7522 unsigned Sz = TE.Scalars.size();
7523 if (TE.isGather()) {
7524 if (std::optional<OrdersType> CurrentOrder =
7525 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7526 SmallVector<int> Mask;
7527 fixupOrderingIndices(*CurrentOrder);
7528 inversePermutation(*CurrentOrder, Mask);
7529 ::addMask(Mask, TE.ReuseShuffleIndices);
7530 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7531 unsigned Sz = TE.Scalars.size();
7532 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7533 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7534 if (Idx != PoisonMaskElem)
7535 Res[Idx + K * Sz] = I + K * Sz;
7536 }
7537 return std::move(Res);
7538 }
7539 }
7540 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7541 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7542 2 * TE.getVectorFactor())) == 1)
7543 return std::nullopt;
7544 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7545 return std::nullopt;
7546 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7547 Sz)) {
7548 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7549 if (TE.ReorderIndices.empty())
7550 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7551 else
7552 inversePermutation(TE.ReorderIndices, ReorderMask);
7553 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7554 unsigned VF = ReorderMask.size();
7555 OrdersType ResOrder(VF, VF);
7556 unsigned NumParts = divideCeil(VF, Sz);
7557 SmallBitVector UsedVals(NumParts);
7558 for (unsigned I = 0; I < VF; I += Sz) {
7559 int Val = PoisonMaskElem;
7560 unsigned UndefCnt = 0;
7561 unsigned Limit = std::min(Sz, VF - I);
7562 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7563 [&](int Idx) {
7564 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7565 Val = Idx;
7566 if (Idx == PoisonMaskElem)
7567 ++UndefCnt;
7568 return Idx != PoisonMaskElem && Idx != Val;
7569 }) ||
7570 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7571 UndefCnt > Sz / 2)
7572 return std::nullopt;
7573 UsedVals.set(Val);
7574 for (unsigned K = 0; K < NumParts; ++K) {
7575 unsigned Idx = Val + Sz * K;
7576 if (Idx < VF && I + K < VF)
7577 ResOrder[Idx] = I + K;
7578 }
7579 }
7580 return std::move(ResOrder);
7581 }
7582 unsigned VF = TE.getVectorFactor();
7583 // Try to build the correct order for extractelement instructions.
7584 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7585 TE.ReuseShuffleIndices.end());
7586 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7587 all_of(TE.Scalars, [Sz](Value *V) {
7588 if (isa<PoisonValue>(V))
7589 return true;
7590 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7591 return Idx && *Idx < Sz;
7592 })) {
7593 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7594 "by BinaryOperator and CastInst.");
7595 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7596 if (TE.ReorderIndices.empty())
7597 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7598 else
7599 inversePermutation(TE.ReorderIndices, ReorderMask);
7600 for (unsigned I = 0; I < VF; ++I) {
7601 int &Idx = ReusedMask[I];
7602 if (Idx == PoisonMaskElem)
7603 continue;
7604 Value *V = TE.Scalars[ReorderMask[Idx]];
7605 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7606 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7607 }
7608 }
7609 // Build the order of VF size; the reuses shuffles need to be reordered, as they
7610 // are always of VF size.
7611 OrdersType ResOrder(VF);
7612 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7613 auto *It = ResOrder.begin();
7614 for (unsigned K = 0; K < VF; K += Sz) {
7615 OrdersType CurrentOrder(TE.ReorderIndices);
7616 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7617 if (SubMask.front() == PoisonMaskElem)
7618 std::iota(SubMask.begin(), SubMask.end(), 0);
7619 reorderOrder(CurrentOrder, SubMask);
7620 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7621 std::advance(It, Sz);
7622 }
7623 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7624 return Data.index() == Data.value();
7625 }))
7626 return std::nullopt; // No need to reorder.
7627 return std::move(ResOrder);
7628 }
7629 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7630 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7631 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7632 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7633 return std::nullopt;
7634 if (TE.State == TreeEntry::SplitVectorize ||
7635 ((TE.State == TreeEntry::Vectorize ||
7636 TE.State == TreeEntry::StridedVectorize ||
7637 TE.State == TreeEntry::CompressVectorize) &&
7639 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7640 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7641 "Alternate instructions are only supported by "
7642 "BinaryOperator and CastInst.");
7643 return TE.ReorderIndices;
7644 }
7645 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7646 TE.isAltShuffle()) {
7647 assert(TE.ReuseShuffleIndices.empty() &&
7648 "ReuseShuffleIndices should be "
7649 "empty for alternate instructions.");
7650 SmallVector<int> Mask;
7651 TE.buildAltOpShuffleMask(
7652 [&](Instruction *I) {
7653 assert(TE.getMatchingMainOpOrAltOp(I) &&
7654 "Unexpected main/alternate opcode");
7655 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7656 },
7657 Mask);
7658 const int VF = TE.getVectorFactor();
7659 OrdersType ResOrder(VF, VF);
7660 for (unsigned I : seq<unsigned>(VF)) {
7661 if (Mask[I] == PoisonMaskElem)
7662 continue;
7663 ResOrder[Mask[I] % VF] = I;
7664 }
7665 return std::move(ResOrder);
7666 }
7667 if (!TE.ReorderIndices.empty())
7668 return TE.ReorderIndices;
7669 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7670 if (!TE.ReorderIndices.empty())
7671 return TE.ReorderIndices;
7672
7673 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7674 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7675 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7676 continue;
7677 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7678 if (!II)
7679 continue;
7680 Instruction *BVHead = nullptr;
7681 BasicBlock *BB = II->getParent();
7682 while (II && II->hasOneUse() && II->getParent() == BB) {
7683 BVHead = II;
7684 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7685 }
7686 I = BVHead;
7687 }
7688
7689 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7690 assert(BB1 != BB2 && "Expected different basic blocks.");
7691 if (!DT->isReachableFromEntry(BB1))
7692 return false;
7693 if (!DT->isReachableFromEntry(BB2))
7694 return true;
7695 auto *NodeA = DT->getNode(BB1);
7696 auto *NodeB = DT->getNode(BB2);
7697 assert(NodeA && "Should only process reachable instructions");
7698 assert(NodeB && "Should only process reachable instructions");
7699 assert((NodeA == NodeB) ==
7700 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7701 "Different nodes should have different DFS numbers");
7702 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7703 };
7704 auto PHICompare = [&](unsigned I1, unsigned I2) {
7705 Value *V1 = TE.Scalars[I1];
7706 Value *V2 = TE.Scalars[I2];
7707 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7708 return false;
7709 if (isa<PoisonValue>(V1))
7710 return true;
7711 if (isa<PoisonValue>(V2))
7712 return false;
7713 if (V1->getNumUses() < V2->getNumUses())
7714 return true;
7715 if (V1->getNumUses() > V2->getNumUses())
7716 return false;
7717 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7718 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7719 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7720 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7721 FirstUserOfPhi2->getParent());
7722 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7723 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7724 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7725 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7726 if (IE1 && !IE2)
7727 return true;
7728 if (!IE1 && IE2)
7729 return false;
7730 if (IE1 && IE2) {
7731 if (UserBVHead[I1] && !UserBVHead[I2])
7732 return true;
7733 if (!UserBVHead[I1])
7734 return false;
7735 if (UserBVHead[I1] == UserBVHead[I2])
7736 return getElementIndex(IE1) < getElementIndex(IE2);
7737 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7738 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7739 UserBVHead[I2]->getParent());
7740 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7741 }
7742 if (EE1 && !EE2)
7743 return true;
7744 if (!EE1 && EE2)
7745 return false;
7746 if (EE1 && EE2) {
7747 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7748 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7749 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7750 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7751 if (!Inst2 && !P2)
7752 return Inst1 || P1;
7753 if (EE1->getOperand(0) == EE2->getOperand(0))
7754 return getElementIndex(EE1) < getElementIndex(EE2);
7755 if (!Inst1 && Inst2)
7756 return false;
7757 if (Inst1 && Inst2) {
7758 if (Inst1->getParent() != Inst2->getParent())
7759 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7760 return Inst1->comesBefore(Inst2);
7761 }
7762 if (!P1 && P2)
7763 return false;
7764 assert(P1 && P2 &&
7765 "Expected either instructions or arguments vector operands.");
7766 return P1->getArgNo() < P2->getArgNo();
7767 }
7768 return false;
7769 };
7770 OrdersType Phis(TE.Scalars.size());
7771 std::iota(Phis.begin(), Phis.end(), 0);
7772 stable_sort(Phis, PHICompare);
7773 if (isIdentityOrder(Phis))
7774 return std::nullopt; // No need to reorder.
7775 return std::move(Phis);
7776 }
7777 if (TE.isGather() &&
7778 (!TE.hasState() || !TE.isAltShuffle() ||
7779 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7780 allSameType(TE.Scalars)) {
7781 // TODO: add analysis of other gather nodes with extractelement
7782 // instructions and other values/instructions, not only undefs.
7783 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7785 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7786 all_of(TE.Scalars, [](Value *V) {
7787 auto *EE = dyn_cast<ExtractElementInst>(V);
7788 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7789 })) {
7790 // Check that gather of extractelements can be represented as
7791 // just a shuffle of a single vector.
7792 OrdersType CurrentOrder;
7793 bool Reuse =
7794 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7795 if (Reuse || !CurrentOrder.empty())
7796 return std::move(CurrentOrder);
7797 }
7798 // If the gather node is <undef, v, .., poison> and
7799 // insertelement poison, v, 0 [+ permute]
7800 // is cheaper than
7801 // insertelement poison, v, n - try to reorder.
7802 // If rotating the whole graph, exclude the permute cost, as the whole graph
7803 // might be transformed.
7804 int Sz = TE.Scalars.size();
7805 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7806 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7807 const auto *It = find_if_not(TE.Scalars, isConstant);
7808 if (It == TE.Scalars.begin())
7809 return OrdersType();
7810 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7811 if (It != TE.Scalars.end()) {
7812 OrdersType Order(Sz, Sz);
7813 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7814 Order[Idx] = 0;
7815 fixupOrderingIndices(Order);
7816 SmallVector<int> Mask;
7817 inversePermutation(Order, Mask);
7818 InstructionCost PermuteCost =
7819 TopToBottom
7820 ? 0
7821 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7822 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7823 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7824 PoisonValue::get(Ty), *It);
7825 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7826 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7827 PoisonValue::get(Ty), *It);
7828 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7829 OrdersType Order(Sz, Sz);
7830 Order[Idx] = 0;
7831 return std::move(Order);
7832 }
7833 }
7834 }
7835 if (isSplat(TE.Scalars))
7836 return std::nullopt;
7837 if (TE.Scalars.size() >= 3)
7838 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7839 return Order;
7840 // Check if we can include the order of vectorized loads. For masked gathers, do
7841 // extra analysis later, so include such nodes in a special list.
7842 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7843 SmallVector<Value *> PointerOps;
7844 StridedPtrInfo SPtrInfo;
7845 OrdersType CurrentOrder;
7846 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7847 CurrentOrder, PointerOps, SPtrInfo);
7850 return std::move(CurrentOrder);
7851 }
7852 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7853 // has been audited for correctness with non-power-of-two vectors.
7854 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7855 if (std::optional<OrdersType> CurrentOrder =
7856 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7857 return CurrentOrder;
7858 }
7859 return std::nullopt;
7860}
7861
7862/// Checks if the given mask is a "clustered" mask with the same clusters of
7863/// size \p Sz, which are not identity submasks.
7864 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7865 unsigned Sz) {
7866 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7867 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7868 return false;
7869 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7870 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7871 if (Cluster != FirstCluster)
7872 return false;
7873 }
7874 return true;
7875}
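// Illustrative masks for the helper above (Sz == 4, exposition only):
//   <1, 0, 3, 2, 1, 0, 3, 2> - repeated non-identity clusters -> true.
//   <0, 1, 2, 3, 0, 1, 2, 3> - identity first cluster         -> false.
//   <1, 0, 3, 2, 2, 3, 0, 1> - clusters differ                -> false.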
7876
7877void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7878 // Reorder reuses mask.
7879 reorderReuses(TE.ReuseShuffleIndices, Mask);
7880 const unsigned Sz = TE.Scalars.size();
7881 // For vectorized nodes and non-clustered reuses, no need to do anything else.
7882 if (!TE.isGather() ||
7884 Sz) ||
7885 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7886 return;
7887 SmallVector<int> NewMask;
7888 inversePermutation(TE.ReorderIndices, NewMask);
7889 addMask(NewMask, TE.ReuseShuffleIndices);
7890 // Clear reorder since it is going to be applied to the new mask.
7891 TE.ReorderIndices.clear();
7892 // Try to improve gathered nodes with clustered reuses, if possible.
7893 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7894 SmallVector<unsigned> NewOrder(Slice);
7895 inversePermutation(NewOrder, NewMask);
7896 reorderScalars(TE.Scalars, NewMask);
7897 // Fill the reuses mask with the identity submasks.
7898 for (auto *It = TE.ReuseShuffleIndices.begin(),
7899 *End = TE.ReuseShuffleIndices.end();
7900 It != End; std::advance(It, Sz))
7901 std::iota(It, std::next(It, Sz), 0);
7902}
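// Illustrative effect of the function above (exposition only): for Sz == 4 and
// an 8-wide reuse mask, the reuse mask is rewritten to the identity submasks
// <0, 1, 2, 3, 0, 1, 2, 3>, while the chosen reordering has been folded into
// the scalars of the gather node itself.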
7903
7904 static void combineOrders(MutableArrayRef<unsigned> Order,
7905 ArrayRef<unsigned> SecondaryOrder) {
7906 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7907 "Expected same size of orders");
7908 size_t Sz = Order.size();
7909 SmallBitVector UsedIndices(Sz);
7910 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7911 if (Order[Idx] != Sz)
7912 UsedIndices.set(Order[Idx]);
7913 }
7914 if (SecondaryOrder.empty()) {
7915 for (unsigned Idx : seq<unsigned>(0, Sz))
7916 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7917 Order[Idx] = Idx;
7918 } else {
7919 for (unsigned Idx : seq<unsigned>(0, Sz))
7920 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7921 !UsedIndices.test(SecondaryOrder[Idx]))
7922 Order[Idx] = SecondaryOrder[Idx];
7923 }
7924}
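// Worked example for the helper above (illustrative): with Sz == 4 and
// Order == {2, 4, 4, 0} (4 is the "unset" sentinel), an empty SecondaryOrder
// fills slot 1 with its own index, giving {2, 1, 4, 0}; slot 2 stays unset
// because index 2 is already used. With SecondaryOrder == {2, 3, 1, 0} the
// unset slots take the secondary values instead, giving {2, 3, 1, 0}.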
7925
7928 return false;
7929
7930 constexpr unsigned TinyVF = 2;
7931 constexpr unsigned TinyTree = 10;
7932 constexpr unsigned PhiOpsLimit = 12;
7933 constexpr unsigned GatherLoadsLimit = 2;
7934 if (VectorizableTree.size() <= TinyTree)
7935 return true;
7936 if (VectorizableTree.front()->hasState() &&
7937 !VectorizableTree.front()->isGather() &&
7938 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7939 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7940 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7941 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7942 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7943 VectorizableTree.front()->ReorderIndices.empty()) {
7944 // Check if the tree has only a single store and a single (unordered) load node;
7945 // other nodes are phis or geps/binops combined with phis, and/or a single
7946 // gather load node.
7947 if (VectorizableTree.front()->hasState() &&
7948 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7949 VectorizableTree.front()->Scalars.size() == TinyVF &&
7950 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7951 return false;
7952 // Single node which requires reordering - skip.
7953 if (VectorizableTree.front()->hasState() &&
7954 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7955 VectorizableTree.front()->ReorderIndices.empty()) {
7956 const unsigned ReorderedSplitsCnt =
7957 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7958 return TE->State == TreeEntry::SplitVectorize &&
7959 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7960 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7961 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7962 });
7963 if (ReorderedSplitsCnt <= 1 &&
7964 static_cast<unsigned>(count_if(
7965 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7966 return ((!TE->isGather() &&
7967 (TE->ReorderIndices.empty() ||
7968 (TE->UserTreeIndex.UserTE &&
7969 TE->UserTreeIndex.UserTE->State ==
7970 TreeEntry::Vectorize &&
7971 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7972 .empty()))) ||
7973 (TE->isGather() && TE->ReorderIndices.empty() &&
7974 (!TE->hasState() || TE->isAltShuffle() ||
7975 TE->getOpcode() == Instruction::Load ||
7976 TE->getOpcode() == Instruction::ZExt ||
7977 TE->getOpcode() == Instruction::SExt))) &&
7978 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7979 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7980 return !isConstant(V) && isVectorized(V);
7981 }));
7982 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7983 return false;
7984 }
7985 bool HasPhis = false;
7986 bool HasLoad = true;
7987 unsigned GatherLoads = 0;
7988 for (const std::unique_ptr<TreeEntry> &TE :
7989 ArrayRef(VectorizableTree).drop_front()) {
7990 if (TE->State == TreeEntry::SplitVectorize)
7991 continue;
7992 if (!TE->hasState()) {
7993 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7995 continue;
7996 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7998 continue;
7999 return true;
8000 }
8001 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8002 if (!TE->isGather()) {
8003 HasLoad = false;
8004 continue;
8005 }
8006 if (HasLoad)
8007 return true;
8008 ++GatherLoads;
8009 if (GatherLoads >= GatherLoadsLimit)
8010 return true;
8011 }
8012 if (TE->getOpcode() == Instruction::GetElementPtr ||
8013 Instruction::isBinaryOp(TE->getOpcode()))
8014 continue;
8015 if (TE->getOpcode() != Instruction::PHI &&
8016 (!TE->hasCopyableElements() ||
8017 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8018 TE->Scalars.size() / 2))
8019 return true;
8020 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8021 TE->getNumOperands() > PhiOpsLimit)
8022 return false;
8023 HasPhis = true;
8024 }
8025 return !HasPhis;
8026 }
8027 return true;
8028}
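// Illustrative reading of the limits above (exposition only): trees with at
// most TinyTree (10) entries are always considered worth reordering, while a
// 2-lane PHI root with more than PhiOpsLimit (12) operands is rejected to keep
// the compile time bounded.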
8029
8030void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8031 ArrayRef<int> MaskOrder) {
8032 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8033 SmallVector<int> NewMask(getVectorFactor());
8034 SmallVector<int> NewMaskOrder(getVectorFactor());
8035 std::iota(NewMask.begin(), NewMask.end(), 0);
8036 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8037 if (Idx == 0) {
8038 copy(Mask, NewMask.begin());
8039 copy(MaskOrder, NewMaskOrder.begin());
8040 } else {
8041 assert(Idx == 1 && "Expected either 0 or 1 index.");
8042 unsigned Offset = CombinedEntriesWithIndices.back().second;
8043 for (unsigned I : seq<unsigned>(Mask.size())) {
8044 NewMask[I + Offset] = Mask[I] + Offset;
8045 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8046 }
8047 }
8048 reorderScalars(Scalars, NewMask);
8049 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8050 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8051 ReorderIndices.clear();
8052}
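// Worked example (illustrative): for a split node with vector factor 8 whose
// second operand starts at offset 4, reordering operand Idx == 1 with
// Mask == {1, 0, 3, 2} expands to NewMask == {0, 1, 2, 3, 5, 4, 7, 6}, i.e.
// only the second half of the node is permuted.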
8053
8055 // Maps VF to the graph nodes.
8057 // ExtractElement gather nodes which can be vectorized and need to handle
8058 // their ordering.
8060
8061 // Phi nodes can have preferred ordering based on their result users
8063
8064 // AltShuffles can also have a preferred ordering that leads to fewer
8065 // instructions, e.g., the addsub instruction in x86.
8066 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8067
8068 // Maps a TreeEntry to the reorder indices of external users.
8070 ExternalUserReorderMap;
8071 // Find all reorderable nodes with the given VF.
8072 // Currently these are vectorized stores, loads, extracts + some gathering of
8073 // extracts.
8074 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8075 const std::unique_ptr<TreeEntry> &TE) {
8076 // Look for external users that will probably be vectorized.
8077 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8078 findExternalStoreUsersReorderIndices(TE.get());
8079 if (!ExternalUserReorderIndices.empty()) {
8080 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8081 ExternalUserReorderMap.try_emplace(TE.get(),
8082 std::move(ExternalUserReorderIndices));
8083 }
8084
8085 // Patterns like [fadd,fsub] can be combined into a single instruction in
8086 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8087 // to take into account their order when looking for the most used order.
8088 if (TE->hasState() && TE->isAltShuffle() &&
8089 TE->State != TreeEntry::SplitVectorize) {
8090 Type *ScalarTy = TE->Scalars[0]->getType();
8091 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8092 unsigned Opcode0 = TE->getOpcode();
8093 unsigned Opcode1 = TE->getAltOpcode();
8094 SmallBitVector OpcodeMask(
8095 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8096 // If this pattern is supported by the target then we consider the order.
8097 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8098 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8099 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8100 }
8101 // TODO: Check the reverse order too.
8102 }
8103
8104 bool IgnoreReorder =
8105 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8106 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8107 VectorizableTree.front()->getOpcode() == Instruction::Store);
8108 if (std::optional<OrdersType> CurrentOrder =
8109 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8110 // Do not include ordering for nodes used in the alt opcode vectorization,
8111 // better to reorder them during the bottom-to-top stage. If we follow the order
8112 // here, it causes reordering of the whole graph, though actually it is
8113 // profitable just to reorder the subgraph that starts from the alternate
8114 // opcode vectorization node. Such nodes already end up with a shuffle
8115 // instruction and it is enough to change this shuffle rather than
8116 // rotate the scalars for the whole graph.
8117 unsigned Cnt = 0;
8118 const TreeEntry *UserTE = TE.get();
8119 while (UserTE && Cnt < RecursionMaxDepth) {
8120 if (!UserTE->UserTreeIndex)
8121 break;
8122 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8123 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8124 UserTE->UserTreeIndex.UserTE->Idx != 0)
8125 return;
8126 UserTE = UserTE->UserTreeIndex.UserTE;
8127 ++Cnt;
8128 }
8129 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8130 if (!(TE->State == TreeEntry::Vectorize ||
8131 TE->State == TreeEntry::StridedVectorize ||
8132 TE->State == TreeEntry::SplitVectorize ||
8133 TE->State == TreeEntry::CompressVectorize) ||
8134 !TE->ReuseShuffleIndices.empty())
8135 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8136 if (TE->State == TreeEntry::Vectorize &&
8137 TE->getOpcode() == Instruction::PHI)
8138 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8139 }
8140 });
8141
8142 // Reorder the graph nodes according to their vectorization factor.
8143 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8144 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8145 auto It = VFToOrderedEntries.find(VF);
8146 if (It == VFToOrderedEntries.end())
8147 continue;
8148 // Try to find the most profitable order. We are just looking for the most
8149 // used order and reorder the scalar elements in the nodes according to this
8150 // most used order.
8151 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8152 // Delete VF entry upon exit.
8153 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8154
8155 // All operands are reordered and used only in this node - propagate the
8156 // most used order to the user node.
8159 OrdersUses;
8160 for (const TreeEntry *OpTE : OrderedEntries) {
8161 // No need to reorder these nodes; still need to extend and to use a shuffle,
8162 // just need to merge the reordering shuffle and the reuse shuffle.
8163 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8164 OpTE->State != TreeEntry::SplitVectorize)
8165 continue;
8166 // Count number of orders uses.
8167 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8168 &PhisToOrders]() -> const OrdersType & {
8169 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8170 auto It = GathersToOrders.find(OpTE);
8171 if (It != GathersToOrders.end())
8172 return It->second;
8173 }
8174 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8175 auto It = AltShufflesToOrders.find(OpTE);
8176 if (It != AltShufflesToOrders.end())
8177 return It->second;
8178 }
8179 if (OpTE->State == TreeEntry::Vectorize &&
8180 OpTE->getOpcode() == Instruction::PHI) {
8181 auto It = PhisToOrders.find(OpTE);
8182 if (It != PhisToOrders.end())
8183 return It->second;
8184 }
8185 return OpTE->ReorderIndices;
8186 }();
8187 // First consider the order of the external scalar users.
8188 auto It = ExternalUserReorderMap.find(OpTE);
8189 if (It != ExternalUserReorderMap.end()) {
8190 const auto &ExternalUserReorderIndices = It->second;
8191 // If the OpTE vector factor != number of scalars - use the natural order;
8192 // it is an attempt to reorder a node with reused scalars but with
8193 // external uses.
8194 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8195 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8196 ExternalUserReorderIndices.size();
8197 } else {
8198 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8199 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8200 }
8201 // No other useful reorder data in this entry.
8202 if (Order.empty())
8203 continue;
8204 }
8205 // Stores actually store the mask, not the order; need to invert it.
8206 if (OpTE->State == TreeEntry::Vectorize &&
8207 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8208 assert(!OpTE->isAltShuffle() &&
8209 "Alternate instructions are only supported by BinaryOperator "
8210 "and CastInst.");
8211 SmallVector<int> Mask;
8212 inversePermutation(Order, Mask);
8213 unsigned E = Order.size();
8214 OrdersType CurrentOrder(E, E);
8215 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8216 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8217 });
8218 fixupOrderingIndices(CurrentOrder);
8219 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8220 } else {
8221 ++OrdersUses.try_emplace(Order, 0).first->second;
8222 }
8223 }
8224 if (OrdersUses.empty())
8225 continue;
8226 // Choose the most used order.
8227 unsigned IdentityCnt = 0;
8228 unsigned FilledIdentityCnt = 0;
8229 OrdersType IdentityOrder(VF, VF);
8230 for (auto &Pair : OrdersUses) {
8231 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8232 if (!Pair.first.empty())
8233 FilledIdentityCnt += Pair.second;
8234 IdentityCnt += Pair.second;
8235 combineOrders(IdentityOrder, Pair.first);
8236 }
8237 }
8238 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8239 unsigned Cnt = IdentityCnt;
8240 for (auto &Pair : OrdersUses) {
8241 // Prefer the identity order. But, if a filled identity (non-empty order) is
8242 // found with the same number of uses as the new candidate order, we can
8243 // choose this candidate order.
8244 if (Cnt < Pair.second ||
8245 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8246 Cnt == Pair.second && !BestOrder.empty() &&
8247 isIdentityOrder(BestOrder))) {
8248 combineOrders(Pair.first, BestOrder);
8249 BestOrder = Pair.first;
8250 Cnt = Pair.second;
8251 } else {
8252 combineOrders(BestOrder, Pair.first);
8253 }
8254 }
8255 // Set order of the user node.
8256 if (isIdentityOrder(BestOrder))
8257 continue;
8258 fixupOrderingIndices(BestOrder);
8259 SmallVector<int> Mask;
8260 inversePermutation(BestOrder, Mask);
8261 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8262 unsigned E = BestOrder.size();
8263 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8264 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8265 });
8266 // Do an actual reordering, if profitable.
8267 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8268 // Just do the reordering for the nodes with the given VF.
8269 if (TE->Scalars.size() != VF) {
8270 if (TE->ReuseShuffleIndices.size() == VF) {
8271 assert(TE->State != TreeEntry::SplitVectorize &&
8272 "Split vectorized not expected.");
8273 // Need to reorder the reuses masks of the operands with smaller VF to
8274 // be able to find the match between the graph nodes and scalar
8275 // operands of the given node during vectorization/cost estimation.
8276 assert(
8277 (!TE->UserTreeIndex ||
8278 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8279 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8280 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8281 "All users must be of VF size.");
8282 if (SLPReVec) {
8283 assert(SLPReVec && "Only supported by REVEC.");
8284 // ShuffleVectorInst does not do reorderOperands (and it should not
8285 // because ShuffleVectorInst supports only a limited set of
8286 // patterns). Only do reorderNodeWithReuses if the user is not
8287 // ShuffleVectorInst.
8288 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8289 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8290 continue;
8291 }
8292 // Update ordering of the operands with the smaller VF than the given
8293 // one.
8294 reorderNodeWithReuses(*TE, Mask);
8295 // Update orders in user split vectorize nodes.
8296 if (TE->UserTreeIndex &&
8297 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8298 TE->UserTreeIndex.UserTE->reorderSplitNode(
8299 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8300 }
8301 continue;
8302 }
8303 if ((TE->State == TreeEntry::SplitVectorize &&
8304 TE->ReuseShuffleIndices.empty()) ||
8305 ((TE->State == TreeEntry::Vectorize ||
8306 TE->State == TreeEntry::StridedVectorize ||
8307 TE->State == TreeEntry::CompressVectorize) &&
8309 InsertElementInst>(TE->getMainOp()) ||
8310 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8311 assert(
8312 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8313 TE->ReuseShuffleIndices.empty())) &&
8314 "Alternate instructions are only supported by BinaryOperator "
8315 "and CastInst.");
8316 // Build correct orders for extract{element,value}, loads,
8317 // stores and alternate (split) nodes.
8318 reorderOrder(TE->ReorderIndices, Mask);
8319 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8320 TE->reorderOperands(Mask);
8321 } else {
8322 // Reorder the node and its operands.
8323 TE->reorderOperands(Mask);
8324 assert(TE->ReorderIndices.empty() &&
8325 "Expected empty reorder sequence.");
8326 reorderScalars(TE->Scalars, Mask);
8327 }
8328 if (!TE->ReuseShuffleIndices.empty()) {
8329 // Apply reversed order to keep the original ordering of the reused
8330 // elements to avoid extra reorder indices shuffling.
8331 OrdersType CurrentOrder;
8332 reorderOrder(CurrentOrder, MaskOrder);
8333 SmallVector<int> NewReuses;
8334 inversePermutation(CurrentOrder, NewReuses);
8335 addMask(NewReuses, TE->ReuseShuffleIndices);
8336 TE->ReuseShuffleIndices.swap(NewReuses);
8337 } else if (TE->UserTreeIndex &&
8338 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8339 // Update orders in user split vectorize nodes.
8340 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8341 Mask, MaskOrder);
8342 }
8343 }
8344}
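// Illustrative vote (exposition only): if, for VF == 4, two entries request the
// order {1, 0, 3, 2} and one requests the identity, the non-identity order wins
// (2 uses vs. IdentityCnt == 1) and the corresponding mask <1, 0, 3, 2> is then
// applied to the nodes of that vector factor in the loop above.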
8345
8346void BoUpSLP::buildReorderableOperands(
8347 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8348 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8349 SmallVectorImpl<TreeEntry *> &GatherOps) {
8350 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8351 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8352 return OpData.first == I &&
8353 (OpData.second->State == TreeEntry::Vectorize ||
8354 OpData.second->State == TreeEntry::StridedVectorize ||
8355 OpData.second->State == TreeEntry::CompressVectorize ||
8356 OpData.second->State == TreeEntry::SplitVectorize);
8357 }))
8358 continue;
8359 // Do not request operands, if they do not exist.
8360 if (UserTE->hasState()) {
8361 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8362 UserTE->getOpcode() == Instruction::ExtractValue)
8363 continue;
8364 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8365 continue;
8366 if (UserTE->getOpcode() == Instruction::Store &&
8367 UserTE->State == TreeEntry::Vectorize && I == 1)
8368 continue;
8369 if (UserTE->getOpcode() == Instruction::Load &&
8370 (UserTE->State == TreeEntry::Vectorize ||
8371 UserTE->State == TreeEntry::StridedVectorize ||
8372 UserTE->State == TreeEntry::CompressVectorize))
8373 continue;
8374 }
8375 TreeEntry *TE = getOperandEntry(UserTE, I);
8376 assert(TE && "Expected operand entry.");
8377 if (!TE->isGather()) {
8378 // Add the node to the list of the ordered nodes with the identity
8379 // order.
8380 Edges.emplace_back(I, TE);
8381 // Add ScatterVectorize nodes to the list of operands, where just
8382 // reordering of the scalars is required. Similar to the gathers, so
8383 // simply add to the list of gathered ops.
8384 // If there are reused scalars, process this node as a regular vectorize
8385 // node, just reorder reuses mask.
8386 if (TE->State == TreeEntry::ScatterVectorize &&
8387 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8388 GatherOps.push_back(TE);
8389 continue;
8390 }
8391 if (ReorderableGathers.contains(TE))
8392 GatherOps.push_back(TE);
8393 }
8394}
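// Illustrative note (exposition only): for a vectorized store user only the
// stored-value operand is walked here; operand 1 (the pointer) is skipped by
// the checks above, so a store contributes at most one reorderable edge, while
// reorderable gather operands are collected separately in GatherOps.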
8395
8396void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8397 struct TreeEntryCompare {
8398 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8399 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8400 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8401 return LHS->Idx < RHS->Idx;
8402 }
8403 };
8405 DenseSet<const TreeEntry *> GathersToOrders;
8406 // Find all reorderable leaf nodes with the given VF.
8407 // Currently these are vectorized loads, extracts without alternate operands +
8408 // some gathering of extracts.
8410 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8411 if (TE->State != TreeEntry::Vectorize &&
8412 TE->State != TreeEntry::StridedVectorize &&
8413 TE->State != TreeEntry::CompressVectorize &&
8414 TE->State != TreeEntry::SplitVectorize)
8415 NonVectorized.insert(TE.get());
8416 if (std::optional<OrdersType> CurrentOrder =
8417 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8418 Queue.push(TE.get());
8419 if (!(TE->State == TreeEntry::Vectorize ||
8420 TE->State == TreeEntry::StridedVectorize ||
8421 TE->State == TreeEntry::CompressVectorize ||
8422 TE->State == TreeEntry::SplitVectorize) ||
8423 !TE->ReuseShuffleIndices.empty())
8424 GathersToOrders.insert(TE.get());
8425 }
8426 }
8427
8428 // 1. Propagate the order to the graph nodes which use only reordered nodes.
8429 // I.e., if the node has operands that are reordered, try to keep at least
8430 // one operand in the natural order and reorder the others + reorder the
8431 // user node itself.
8432 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8433 while (!Queue.empty()) {
8434 // 1. Filter out only reordered nodes.
8435 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8436 TreeEntry *TE = Queue.top();
8437 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8438 Queue.pop();
8439 SmallVector<TreeEntry *> OrderedOps(1, TE);
8440 while (!Queue.empty()) {
8441 TE = Queue.top();
8442 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8443 break;
8444 Queue.pop();
8445 OrderedOps.push_back(TE);
8446 }
8447 for (TreeEntry *TE : OrderedOps) {
8448 if (!(TE->State == TreeEntry::Vectorize ||
8449 TE->State == TreeEntry::StridedVectorize ||
8450 TE->State == TreeEntry::CompressVectorize ||
8451 TE->State == TreeEntry::SplitVectorize ||
8452 (TE->isGather() && GathersToOrders.contains(TE))) ||
8453 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8454 !Visited.insert(TE).second)
8455 continue;
8456 // Build a map between user nodes and their operand order to speed up the
8457 // search. The graph currently does not provide this dependency directly.
8458 Users.first = TE->UserTreeIndex.UserTE;
8459 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8460 }
8461 if (Users.first) {
8462 auto &Data = Users;
8463 if (Data.first->State == TreeEntry::SplitVectorize) {
8464 assert(
8465 Data.second.size() <= 2 &&
8466 "Expected not greater than 2 operands for split vectorize node.");
8467 if (any_of(Data.second,
8468 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8469 continue;
8470 // Update orders in user split vectorize nodes.
8471 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8472 "Expected exactly 2 entries.");
8473 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8474 TreeEntry &OpTE = *VectorizableTree[P.first];
8475 OrdersType Order = OpTE.ReorderIndices;
8476 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8477 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8478 continue;
8479 const auto BestOrder =
8480 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8481 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8482 continue;
8483 Order = *BestOrder;
8484 }
8485 fixupOrderingIndices(Order);
8486 SmallVector<int> Mask;
8487 inversePermutation(Order, Mask);
8488 const unsigned E = Order.size();
8489 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8490 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8491 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8492 });
8493 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8494 // Clear ordering of the operand.
8495 if (!OpTE.ReorderIndices.empty()) {
8496 OpTE.ReorderIndices.clear();
8497 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8498 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8499 } else {
8500 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8501 reorderScalars(OpTE.Scalars, Mask);
8502 }
8503 }
8504 if (Data.first->ReuseShuffleIndices.empty() &&
8505 !Data.first->ReorderIndices.empty()) {
8506 // Insert user node to the list to try to sink reordering deeper in
8507 // the graph.
8508 Queue.push(Data.first);
8509 }
8510 continue;
8511 }
8512 // Check that operands are used only in the User node.
8513 SmallVector<TreeEntry *> GatherOps;
8514 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8515 GatherOps);
8516 // All operands are reordered and used only in this node - propagate the
8517 // most used order to the user node.
8520 OrdersUses;
8521 // Do the analysis for each tree entry only once, otherwise the order of
8522 // the same node may be considered several times, though it might not be
8523 // profitable.
8526 for (const auto &Op : Data.second) {
8527 TreeEntry *OpTE = Op.second;
8528 if (!VisitedOps.insert(OpTE).second)
8529 continue;
8530 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8531 continue;
8532 const auto Order = [&]() -> const OrdersType {
8533 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8534 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8535 IgnoreReorder)
8536 .value_or(OrdersType(1));
8537 return OpTE->ReorderIndices;
8538 }();
8539 // The order is partially ordered, skip it in favor of fully non-ordered
8540 // orders.
8541 if (Order.size() == 1)
8542 continue;
8543
8544 // Check that the reordering does not increase the number of shuffles, i.e.
8545 // same-values nodes have the same parents or their parents have the same parents.
8546 if (!Order.empty() && !isIdentityOrder(Order)) {
8547 Value *Root = OpTE->hasState()
8548 ? OpTE->getMainOp()
8549 : *find_if_not(OpTE->Scalars, isConstant);
8550 auto GetSameNodesUsers = [&](Value *Root) {
8552 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8553 if (TE != OpTE && TE->UserTreeIndex &&
8554 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8555 TE->Scalars.size() == OpTE->Scalars.size() &&
8556 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8557 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8558 Res.insert(TE->UserTreeIndex.UserTE);
8559 }
8560 for (const TreeEntry *TE : getTreeEntries(Root)) {
8561 if (TE != OpTE && TE->UserTreeIndex &&
8562 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8563 TE->Scalars.size() == OpTE->Scalars.size() &&
8564 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8565 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8566 Res.insert(TE->UserTreeIndex.UserTE);
8567 }
8568 return Res.takeVector();
8569 };
8570 auto GetNumOperands = [](const TreeEntry *TE) {
8571 if (TE->State == TreeEntry::SplitVectorize)
8572 return TE->getNumOperands();
8573 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8574 return CI->arg_size();
8575 return TE->getNumOperands();
8576 };
8577 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8578 const TreeEntry *TE) {
8580 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8582 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8585 continue;
8586 const TreeEntry *Op = getOperandEntry(TE, Idx);
8587 if (Op->isGather() && Op->hasState()) {
8588 const TreeEntry *VecOp =
8589 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8590 if (VecOp)
8591 Op = VecOp;
8592 }
8593 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8594 return false;
8595 }
8596 return true;
8597 };
8598 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8599 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8600 if (!RevisitedOps.insert(UTE).second)
8601 return false;
8602 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8603 !UTE->ReuseShuffleIndices.empty() ||
8604 (UTE->UserTreeIndex &&
8605 UTE->UserTreeIndex.UserTE == Data.first) ||
8606 (Data.first->UserTreeIndex &&
8607 Data.first->UserTreeIndex.UserTE == UTE) ||
8608 (IgnoreReorder && UTE->UserTreeIndex &&
8609 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8610 NodeShouldBeReorderedWithOperands(UTE);
8611 }))
8612 continue;
8613 for (TreeEntry *UTE : Users) {
8615 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8617 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8620 continue;
8621 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8622 Visited.erase(Op);
8623 Queue.push(const_cast<TreeEntry *>(Op));
8624 }
8625 }
8626 }
8627 unsigned NumOps = count_if(
8628 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8629 return P.second == OpTE;
8630 });
8631 // Stores actually store the mask, not the order; need to invert it.
8632 if (OpTE->State == TreeEntry::Vectorize &&
8633 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8634 assert(!OpTE->isAltShuffle() &&
8635 "Alternate instructions are only supported by BinaryOperator "
8636 "and CastInst.");
8637 SmallVector<int> Mask;
8638 inversePermutation(Order, Mask);
8639 unsigned E = Order.size();
8640 OrdersType CurrentOrder(E, E);
8641 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8642 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8643 });
8644 fixupOrderingIndices(CurrentOrder);
8645 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8646 } else {
8647 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8648 }
8649 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8650 const auto AllowsReordering = [&](const TreeEntry *TE) {
8651 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8652 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8653 (IgnoreReorder && TE->Idx == 0))
8654 return true;
8655 if (TE->isGather()) {
8656 if (GathersToOrders.contains(TE))
8657 return !getReorderingData(*TE, /*TopToBottom=*/false,
8658 IgnoreReorder)
8659 .value_or(OrdersType(1))
8660 .empty();
8661 return true;
8662 }
8663 return false;
8664 };
8665 if (OpTE->UserTreeIndex) {
8666 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8667 if (!VisitedUsers.insert(UserTE).second)
8668 continue;
8669 // May reorder user node if it requires reordering, has reused
8670 // scalars, is an alternate op vectorize node or its op nodes require
8671 // reordering.
8672 if (AllowsReordering(UserTE))
8673 continue;
8674 // Check if users allow reordering.
8675 // Currently look up just 1 level of operands to avoid increasing
8676 // the compile time.
8677 // Profitable to reorder if definitely more operands allow
8678 // reordering than those with the natural order.
8680 if (static_cast<unsigned>(count_if(
8681 Ops, [UserTE, &AllowsReordering](
8682 const std::pair<unsigned, TreeEntry *> &Op) {
8683 return AllowsReordering(Op.second) &&
8684 Op.second->UserTreeIndex.UserTE == UserTE;
8685 })) <= Ops.size() / 2)
8686 ++Res.first->second;
8687 }
8688 }
8689 if (OrdersUses.empty()) {
8690 Visited.insert_range(llvm::make_second_range(Data.second));
8691 continue;
8692 }
8693 // Choose the most used order.
8694 unsigned IdentityCnt = 0;
8695 unsigned VF = Data.second.front().second->getVectorFactor();
8696 OrdersType IdentityOrder(VF, VF);
8697 for (auto &Pair : OrdersUses) {
8698 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8699 IdentityCnt += Pair.second;
8700 combineOrders(IdentityOrder, Pair.first);
8701 }
8702 }
8703 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8704 unsigned Cnt = IdentityCnt;
8705 for (auto &Pair : OrdersUses) {
8706 // Prefer the identity order. But, if a filled identity (non-empty
8707 // order) is found with the same number of uses as the new candidate order,
8708 // we can choose this candidate order.
8709 if (Cnt < Pair.second) {
8710 combineOrders(Pair.first, BestOrder);
8711 BestOrder = Pair.first;
8712 Cnt = Pair.second;
8713 } else {
8714 combineOrders(BestOrder, Pair.first);
8715 }
8716 }
8717 // Set order of the user node.
8718 if (isIdentityOrder(BestOrder)) {
8719 Visited.insert_range(llvm::make_second_range(Data.second));
8720 continue;
8721 }
8722 fixupOrderingIndices(BestOrder);
8723 // Erase operands from OrderedEntries list and adjust their orders.
8724 VisitedOps.clear();
8725 SmallVector<int> Mask;
8726 inversePermutation(BestOrder, Mask);
8727 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8728 unsigned E = BestOrder.size();
8729 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8730 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8731 });
8732 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8733 TreeEntry *TE = Op.second;
8734 if (!VisitedOps.insert(TE).second)
8735 continue;
8736 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8737 reorderNodeWithReuses(*TE, Mask);
8738 continue;
8739 }
8740 // Gathers are processed separately.
8741 if (TE->State != TreeEntry::Vectorize &&
8742 TE->State != TreeEntry::StridedVectorize &&
8743 TE->State != TreeEntry::CompressVectorize &&
8744 TE->State != TreeEntry::SplitVectorize &&
8745 (TE->State != TreeEntry::ScatterVectorize ||
8746 TE->ReorderIndices.empty()))
8747 continue;
8748 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8749 TE->ReorderIndices.empty()) &&
8750 "Non-matching sizes of user/operand entries.");
8751 reorderOrder(TE->ReorderIndices, Mask);
8752 if (IgnoreReorder && TE == VectorizableTree.front().get())
8753 IgnoreReorder = false;
8754 }
8755 // For gathers just need to reorder its scalars.
8756 for (TreeEntry *Gather : GatherOps) {
8757 assert(Gather->ReorderIndices.empty() &&
8758 "Unexpected reordering of gathers.");
8759 if (!Gather->ReuseShuffleIndices.empty()) {
8760 // Just reorder reuses indices.
8761 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8762 continue;
8763 }
8764 reorderScalars(Gather->Scalars, Mask);
8765 Visited.insert(Gather);
8766 }
8767 // Reorder operands of the user node and set the ordering for the user
8768 // node itself.
8769 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8770 return TE.isAltShuffle() &&
8771 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8772 TE.ReorderIndices.empty());
8773 };
8774 if (Data.first->State != TreeEntry::Vectorize ||
8776 Data.first->getMainOp()) ||
8777 IsNotProfitableAltCodeNode(*Data.first))
8778 Data.first->reorderOperands(Mask);
8779 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8780 IsNotProfitableAltCodeNode(*Data.first) ||
8781 Data.first->State == TreeEntry::StridedVectorize ||
8782 Data.first->State == TreeEntry::CompressVectorize) {
8783 reorderScalars(Data.first->Scalars, Mask);
8784 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8785 /*BottomOrder=*/true);
8786 if (Data.first->ReuseShuffleIndices.empty() &&
8787 !Data.first->ReorderIndices.empty() &&
8788 !IsNotProfitableAltCodeNode(*Data.first)) {
8789 // Insert user node to the list to try to sink reordering deeper in
8790 // the graph.
8791 Queue.push(Data.first);
8792 }
8793 } else {
8794 reorderOrder(Data.first->ReorderIndices, Mask);
8795 }
8796 }
8797 }
8798 // If the reordering is unnecessary, just remove the reorder.
8799 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8800 VectorizableTree.front()->ReuseShuffleIndices.empty())
8801 VectorizableTree.front()->ReorderIndices.clear();
8802}
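// Illustrative note (exposition only): TreeEntryCompare keys the queue by the
// user entry's index, so all reorderable operands of one user node are popped
// together and voted on as a group before the user itself is optionally pushed
// back to try to sink the chosen reordering deeper into the graph.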
8803
8804Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8805 if (Entry.hasState() &&
8806 (Entry.getOpcode() == Instruction::Store ||
8807 Entry.getOpcode() == Instruction::Load) &&
8808 Entry.State == TreeEntry::StridedVectorize &&
8809 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8810 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8811 return dyn_cast<Instruction>(Entry.Scalars.front());
8812}
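// Illustrative example (exposition only): for a strided load entry with
// scalars {l0, l1, l2, l3} and the reverse order {3, 2, 1, 0}, the root
// instruction is l3 (Scalars[ReorderIndices.front()]) rather than l0.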
8813
8814 void BoUpSLP::buildExternalUses(
8815 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8816 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8817 DenseMap<Value *, unsigned> ScalarToExtUses;
8818 SmallPtrSet<Value *, 4> ExternalUsers;
8819 // Collect the values that we need to extract from the tree.
8820 for (auto &TEPtr : VectorizableTree) {
8821 TreeEntry *Entry = TEPtr.get();
8822
8823 // No need to handle users of gathered values.
8824 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8825 continue;
8826
8827 // For each lane:
8828 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8829 Value *Scalar = Entry->Scalars[Lane];
8830 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8831 continue;
8832
8833 // All uses have already been replaced? No need to do it again.
8834 auto It = ScalarToExtUses.find(Scalar);
8835 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8836 continue;
8837
8838 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8839 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8840 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8841 << " from " << *Scalar << " for many users.\n");
8842 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8843 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8844 ExternalUsesWithNonUsers.insert(Scalar);
8845 continue;
8846 }
8847
8848 // Check if the scalar is externally used as an extra arg.
8849 const auto ExtI = ExternallyUsedValues.find(Scalar);
8850 if (ExtI != ExternallyUsedValues.end()) {
8851 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8852 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8853 << FoundLane << " from " << *Scalar << ".\n");
8854 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8855 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8856 continue;
8857 }
8858 for (User *U : Scalar->users()) {
8859 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8860
8861 Instruction *UserInst = dyn_cast<Instruction>(U);
8862 if (!UserInst || isDeleted(UserInst))
8863 continue;
8864
8865 // Ignore users in the user ignore list.
8866 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8867 continue;
8868
8869 // Skip in-tree scalars that become vectors
8870 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8871 !UseEntries.empty()) {
8872 // Some in-tree scalars will remain as scalars in vectorized
8873 // instructions. If that is the case, the one in FoundLane will
8874 // be used.
8875 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8876 isa<LoadInst, StoreInst>(UserInst)) ||
8877 isa<CallInst>(UserInst)) ||
8878 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8879 return UseEntry->State == TreeEntry::ScatterVectorize ||
8881 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8882 TTI);
8883 })) {
8884 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8885 << ".\n");
8886 assert(none_of(UseEntries,
8887 [](TreeEntry *UseEntry) {
8888 return UseEntry->isGather();
8889 }) &&
8890 "Bad state");
8891 continue;
8892 }
8893 U = nullptr;
8894 if (It != ScalarToExtUses.end()) {
8895 ExternalUses[It->second].User = nullptr;
8896 break;
8897 }
8898 }
8899
8900 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8901 U = nullptr;
8902 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8903 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8904 << " from lane " << FoundLane << " from " << *Scalar
8905 << ".\n");
8906 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8907 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8908 ExternalUsesWithNonUsers.insert(Scalar);
8909 if (!U)
8910 break;
8911 }
8912 }
8913 }
8914}
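// Illustrative scenario (exposition only): if a vectorized scalar %x also feeds
// a scalar user outside the tree, an entry {%x, user, lane} is recorded above so
// that an extractelement can later be emitted for it; once %x has UsesLimit or
// more users, the user is recorded as null, meaning "extract for all users".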
8915
8916 SmallVector<SmallVector<StoreInst *>>
8917 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8920 PtrToStoresMap;
8921 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8922 Value *V = TE->Scalars[Lane];
8923 // Don't iterate over the users of constant data.
8924 if (!isa<Instruction>(V))
8925 continue;
8926 // To save compilation time we don't visit if we have too many users.
8927 if (V->hasNUsesOrMore(UsesLimit))
8928 break;
8929
8930 // Collect stores per pointer object.
8931 for (User *U : V->users()) {
8932 auto *SI = dyn_cast<StoreInst>(U);
8933 // Test whether we can handle the store. V might be a global, which could
8934 // be used in a different function.
8935 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8936 !isValidElementType(SI->getValueOperand()->getType()))
8937 continue;
8938 // Skip entry if already vectorized.
8939 if (isVectorized(U))
8940 continue;
8941
8942 Value *Ptr =
8943 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8944 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8945 SI->getValueOperand()->getType(), Ptr}];
8946 // For now just keep one store per pointer object per lane.
8947 // TODO: Extend this to support multiple stores per pointer per lane
8948 if (StoresVec.size() > Lane)
8949 continue;
8950 if (!StoresVec.empty()) {
8951 std::optional<int64_t> Diff = getPointersDiff(
8952 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8953 SI->getValueOperand()->getType(),
8954 StoresVec.front()->getPointerOperand(), *DL, *SE,
8955 /*StrictCheck=*/true);
8956 // We failed to compare the pointers so just abandon this store.
8957 if (!Diff)
8958 continue;
8959 }
8960 StoresVec.push_back(SI);
8961 }
8962 }
8963 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8964 unsigned I = 0;
8965 for (auto &P : PtrToStoresMap) {
8966 Res[I].swap(P.second);
8967 ++I;
8968 }
8969 return Res;
8970}
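// Illustrative result (exposition only): for a 4-lane entry whose scalars each
// feed exactly one store into consecutive elements of the same underlying
// object, the map above ends up with a single bucket of four stores, which
// canFormVector() below can turn into a reorder-index vector.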
8971
8972bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8973 OrdersType &ReorderIndices) const {
8974 // We check whether the stores in StoresVec can form a vector by sorting them
8975 // and checking whether they are consecutive.
8976
8977 // To avoid calling getPointersDiff() while sorting we create a vector of
8978 // pairs {store, offset from first} and sort this instead.
8979 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8980 StoreInst *S0 = StoresVec[0];
8981 StoreOffsetVec.emplace_back(0, 0);
8982 Type *S0Ty = S0->getValueOperand()->getType();
8983 Value *S0Ptr = S0->getPointerOperand();
8984 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8985 StoreInst *SI = StoresVec[Idx];
8986 std::optional<int64_t> Diff =
8987 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8988 SI->getPointerOperand(), *DL, *SE,
8989 /*StrictCheck=*/true);
8990 StoreOffsetVec.emplace_back(*Diff, Idx);
8991 }
8992
8993 // Check if the stores are consecutive by checking if their difference is 1.
8994 if (StoreOffsetVec.size() != StoresVec.size())
8995 return false;
8996 sort(StoreOffsetVec, llvm::less_first());
8997 unsigned Idx = 0;
8998 int64_t PrevDist = 0;
8999 for (const auto &P : StoreOffsetVec) {
9000 if (Idx > 0 && P.first != PrevDist + 1)
9001 return false;
9002 PrevDist = P.first;
9003 ++Idx;
9004 }
9005
9006 // Calculate the shuffle indices according to their offset against the sorted
9007 // StoreOffsetVec.
9008 ReorderIndices.assign(StoresVec.size(), 0);
9009 bool IsIdentity = true;
9010 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9011 ReorderIndices[P.second] = I;
9012 IsIdentity &= P.second == I;
9013 }
9014 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9015 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9016 // same convention here.
9017 if (IsIdentity)
9018 ReorderIndices.clear();
9019
9020 return true;
9021}
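// Worked example (illustrative only): StoresVec = {S0, S1, S2, S3} with
// pointer offsets from S0 of {0, 2, 1, 3} elements. StoreOffsetVec becomes
// {(0,0), (2,1), (1,2), (3,3)}; after sorting by offset it is
// {(0,0), (1,2), (2,1), (3,3)}, and adjacent offsets differ by 1, so the
// stores are consecutive. The final loop then sets ReorderIndices[P.second]
// to the sorted position: ReorderIndices = {0, 2, 1, 3}, which is not the
// identity and is therefore kept.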
9022
9023#ifndef NDEBUG
9024static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9025 for (unsigned Idx : Order)
9026 dbgs() << Idx << ", ";
9027 dbgs() << "\n";
9028}
9029#endif
9030
9031SmallVector<BoUpSLP::OrdersType, 1>
9032BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9033 unsigned NumLanes = TE->Scalars.size();
9034
9035 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9036
9037 // Holds the reorder indices for each candidate store vector that is a user of
9038 // the current TreeEntry.
9039 SmallVector<OrdersType, 1> ExternalReorderIndices;
9040
9041 // Now inspect the stores collected per pointer and look for vectorization
9042 // candidates. For each candidate calculate the reorder index vector and push
9043 // it into `ExternalReorderIndices`
9044 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9045 // If we have fewer than NumLanes stores, then we can't form a vector.
9046 if (StoresVec.size() != NumLanes)
9047 continue;
9048
9049 // If the stores are not consecutive then abandon this StoresVec.
9050 OrdersType ReorderIndices;
9051 if (!canFormVector(StoresVec, ReorderIndices))
9052 continue;
9053
9054 // We now know that the scalars in StoresVec can form a vector instruction,
9055 // so set the reorder indices.
9056 ExternalReorderIndices.push_back(ReorderIndices);
9057 }
9058 return ExternalReorderIndices;
9059}
9060
9061void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9062 const SmallDenseSet<Value *> &UserIgnoreLst) {
9063 deleteTree();
9064 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9065 "TreeEntryToStridedPtrInfoMap is not cleared");
9066 UserIgnoreList = &UserIgnoreLst;
9067 if (!allSameType(Roots))
9068 return;
9069 buildTreeRec(Roots, 0, EdgeInfo());
9070}
9071
9072void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9073 deleteTree();
9074 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9075 "TreeEntryToStridedPtrInfoMap is not cleared");
9076 if (!allSameType(Roots))
9077 return;
9078 buildTreeRec(Roots, 0, EdgeInfo());
9079}
9080
9081/// Tries to find a subvector of loads and builds a new vector of only loads
9082/// if it can be profitable.
9083static void gatherPossiblyVectorizableLoads(
9084 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9085 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9086 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9087 bool AddNew = true) {
9088 if (VL.empty())
9089 return;
9090 Type *ScalarTy = getValueType(VL.front());
9091 if (!isValidElementType(ScalarTy))
9092 return;
9093 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9094 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9095 for (Value *V : VL) {
9096 auto *LI = dyn_cast<LoadInst>(V);
9097 if (!LI)
9098 continue;
9099 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9100 continue;
9101 bool IsFound = false;
9102 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9103 assert(LI->getParent() == Data.front().first->getParent() &&
9104 LI->getType() == Data.front().first->getType() &&
9105 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9106 getUnderlyingObject(Data.front().first->getPointerOperand(),
9107 RecursionMaxDepth) &&
9108 "Expected loads with the same type, same parent and same "
9109 "underlying pointer.");
9110 std::optional<int64_t> Dist = getPointersDiff(
9111 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9112 Data.front().first->getPointerOperand(), DL, SE,
9113 /*StrictCheck=*/true);
9114 if (!Dist)
9115 continue;
9116 auto It = Map.find(*Dist);
9117 if (It != Map.end() && It->second != LI)
9118 continue;
9119 if (It == Map.end()) {
9120 Data.emplace_back(LI, *Dist);
9121 Map.try_emplace(*Dist, LI);
9122 }
9123 IsFound = true;
9124 break;
9125 }
9126 if (!IsFound) {
9127 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9128 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9129 }
9130 }
9131 auto FindMatchingLoads =
9132 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9133 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9134 &GatheredLoads,
9135 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9136 int64_t &Offset, unsigned &Start) {
9137 if (Loads.empty())
9138 return GatheredLoads.end();
9139 LoadInst *LI = Loads.front().first;
9140 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9141 if (Idx < Start)
9142 continue;
9143 ToAdd.clear();
9144 if (LI->getParent() != Data.front().first->getParent() ||
9145 LI->getType() != Data.front().first->getType())
9146 continue;
9147 std::optional<int64_t> Dist =
9148 getPointersDiff(LI->getType(), LI->getPointerOperand(),
9149 Data.front().first->getType(),
9150 Data.front().first->getPointerOperand(), DL, SE,
9151 /*StrictCheck=*/true);
9152 if (!Dist)
9153 continue;
9154 SmallSet<int64_t, 4> DataDists;
9156 for (std::pair<LoadInst *, int64_t> P : Data) {
9157 DataDists.insert(P.second);
9158 DataLoads.insert(P.first);
9159 }
9160 // Found matching gathered loads - check if all loads are unique or
9161 // can be effectively vectorized.
9162 unsigned NumUniques = 0;
9163 for (auto [Cnt, Pair] : enumerate(Loads)) {
9164 bool Used = DataLoads.contains(Pair.first);
9165 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9166 ++NumUniques;
9167 ToAdd.insert(Cnt);
9168 } else if (Used) {
9169 Repeated.insert(Cnt);
9170 }
9171 }
9172 if (NumUniques > 0 &&
9173 (Loads.size() == NumUniques ||
9174 (Loads.size() - NumUniques >= 2 &&
9175 Loads.size() - NumUniques >= Loads.size() / 2 &&
9176 (has_single_bit(Data.size() + NumUniques) ||
9177 bit_ceil(Data.size()) <
9178 bit_ceil(Data.size() + NumUniques))))) {
9179 Offset = *Dist;
9180 Start = Idx + 1;
9181 return std::next(GatheredLoads.begin(), Idx);
9182 }
9183 }
9184 ToAdd.clear();
9185 return GatheredLoads.end();
9186 };
9187 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9188 unsigned Start = 0;
9189 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9190 int64_t Offset = 0;
9191 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9192 Offset, Start);
9193 while (It != GatheredLoads.end()) {
9194 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9195 for (unsigned Idx : LocalToAdd)
9196 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9197 ToAdd.insert_range(LocalToAdd);
9198 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9199 Start);
9200 }
9201 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9202 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9203 })) {
9204 auto AddNewLoads =
9205 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9206 for (unsigned Idx : seq<unsigned>(Data.size())) {
9207 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9208 continue;
9209 Loads.push_back(Data[Idx]);
9210 }
9211 };
9212 if (!AddNew) {
9213 LoadInst *LI = Data.front().first;
9214 It = find_if(
9215 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9216 return PD.front().first->getParent() == LI->getParent() &&
9217 PD.front().first->getType() == LI->getType();
9218 });
9219 while (It != GatheredLoads.end()) {
9220 AddNewLoads(*It);
9221 It = std::find_if(
9222 std::next(It), GatheredLoads.end(),
9223 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9224 return PD.front().first->getParent() == LI->getParent() &&
9225 PD.front().first->getType() == LI->getType();
9226 });
9227 }
9228 }
9229 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9230 AddNewLoads(GatheredLoads.emplace_back());
9231 }
9232 }
9233}
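// Illustrative trace (not from the original source) of the clustering above.
// Given VL = {load %a, load %a+2, load %b, load %a+1} (same block, same type,
// element-sized offsets), the first loop builds
//   ClusteredLoads = {{(load %a, 0), (load %a+2, 2), (load %a+1, 1)},
//                     {(load %b, 0)}}
// because %a and %b have different underlying objects. FindMatchingLoads then
// merges each cluster into an existing GatheredLoads group with a compatible
// base and non-conflicting offsets, or the cluster is appended as a new group.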
9234
9235void BoUpSLP::tryToVectorizeGatheredLoads(
9236 const SmallMapVector<
9237 std::tuple<BasicBlock *, Value *, Type *>,
9238 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9239 &GatheredLoads) {
9240 GatheredLoadsEntriesFirst = VectorizableTree.size();
9241
9242 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9243 LoadEntriesToVectorize.size());
9244 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9245 Set.insert_range(VectorizableTree[Idx]->Scalars);
9246
9247 // Sort loads by distance.
9248 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9249 const std::pair<LoadInst *, int64_t> &L2) {
9250 return L1.second > L2.second;
9251 };
9252
9253 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9254 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9255 Loads.size());
9256 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9257 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9258 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9259 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9260 };
9261
9262 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9263 BoUpSLP::ValueSet &VectorizedLoads,
9264 SmallVectorImpl<LoadInst *> &NonVectorized,
9265 bool Final, unsigned MaxVF) {
9266 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9267 unsigned StartIdx = 0;
9268 SmallVector<int> CandidateVFs;
9269 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9270 CandidateVFs.push_back(MaxVF);
9271 for (int NumElts = getFloorFullVectorNumberOfElements(
9272 *TTI, Loads.front()->getType(), MaxVF);
9273 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9274 *TTI, Loads.front()->getType(), NumElts - 1)) {
9275 CandidateVFs.push_back(NumElts);
9276 if (VectorizeNonPowerOf2 && NumElts > 2)
9277 CandidateVFs.push_back(NumElts - 1);
9278 }
9279
9280 if (Final && CandidateVFs.empty())
9281 return Results;
9282
9283 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9284 for (unsigned NumElts : CandidateVFs) {
9285 if (Final && NumElts > BestVF)
9286 continue;
9287 SmallVector<unsigned> MaskedGatherVectorized;
9288 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9289 ++Cnt) {
9290 ArrayRef<LoadInst *> Slice =
9291 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9292 if (VectorizedLoads.count(Slice.front()) ||
9293 VectorizedLoads.count(Slice.back()) ||
9294 areKnownNonVectorizableLoads(Slice))
9295 continue;
9296 // Check if it is profitable to try vectorizing gathered loads. It is
9297 // profitable if we have more than 3 consecutive loads or if we have
9298 // fewer, but all users are vectorized or deleted.
9299 bool AllowToVectorize = false;
9300 // Check if it is profitable to vectorize 2-elements loads.
9301 if (NumElts == 2) {
9302 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9303 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9304 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9305 for (LoadInst *LI : Slice) {
9306 // If single use/user - allow to vectorize.
9307 if (LI->hasOneUse())
9308 continue;
9309 // 1. Check if number of uses equals number of users.
9310 // 2. All users are deleted.
9311 // 3. The load broadcasts are not allowed or the load is not
9312 // broadcasted.
9313 if (static_cast<unsigned int>(std::distance(
9314 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9315 return false;
9316 if (!IsLegalBroadcastLoad)
9317 continue;
9318 if (LI->hasNUsesOrMore(UsesLimit))
9319 return false;
9320 for (User *U : LI->users()) {
9321 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9322 continue;
9323 for (const TreeEntry *UTE : getTreeEntries(U)) {
9324 for (int I : seq<int>(UTE->getNumOperands())) {
9325 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9326 return V == LI || isa<PoisonValue>(V);
9327 }))
9328 // Found legal broadcast - do not vectorize.
9329 return false;
9330 }
9331 }
9332 }
9333 }
9334 return true;
9335 };
9336 AllowToVectorize = CheckIfAllowed(Slice);
9337 } else {
9338 AllowToVectorize =
9339 (NumElts >= 3 ||
9340 any_of(ValueToGatherNodes.at(Slice.front()),
9341 [=](const TreeEntry *TE) {
9342 return TE->Scalars.size() == 2 &&
9343 ((TE->Scalars.front() == Slice.front() &&
9344 TE->Scalars.back() == Slice.back()) ||
9345 (TE->Scalars.front() == Slice.back() &&
9346 TE->Scalars.back() == Slice.front()));
9347 })) &&
9348 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9349 Slice.size());
9350 }
9351 if (AllowToVectorize) {
9352 SmallVector<Value *> PointerOps;
9353 OrdersType CurrentOrder;
9354 // Try to build vector load.
9355 ArrayRef<Value *> Values(
9356 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9357 StridedPtrInfo SPtrInfo;
9358 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9359 PointerOps, SPtrInfo, &BestVF);
9360 if (LS != LoadsState::Gather ||
9361 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9362 if (LS == LoadsState::ScatterVectorize) {
9363 if (MaskedGatherVectorized.empty() ||
9364 Cnt >= MaskedGatherVectorized.back() + NumElts)
9365 MaskedGatherVectorized.push_back(Cnt);
9366 continue;
9367 }
9368 if (LS != LoadsState::Gather) {
9369 Results.emplace_back(Values, LS);
9370 VectorizedLoads.insert_range(Slice);
9371 // If we vectorized initial block, no need to try to vectorize it
9372 // again.
9373 if (Cnt == StartIdx)
9374 StartIdx += NumElts;
9375 }
9376 // Check if the whole array was vectorized already - exit.
9377 if (StartIdx >= Loads.size())
9378 break;
9379 // Erase last masked gather candidate, if another candidate within
9380 // the range is found to be better.
9381 if (!MaskedGatherVectorized.empty() &&
9382 Cnt < MaskedGatherVectorized.back() + NumElts)
9383 MaskedGatherVectorized.pop_back();
9384 Cnt += NumElts - 1;
9385 continue;
9386 }
9387 }
9388 if (!AllowToVectorize || BestVF == 0)
9389 registerNonVectorizableLoads(Slice);
9390 }
9391 // Mark masked gathers candidates as vectorized, if any.
9392 for (unsigned Cnt : MaskedGatherVectorized) {
9393 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9394 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9395 ArrayRef<Value *> Values(
9396 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9397 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9398 VectorizedLoads.insert_range(Slice);
9399 // If we vectorized initial block, no need to try to vectorize it again.
9400 if (Cnt == StartIdx)
9401 StartIdx += NumElts;
9402 }
9403 }
9404 for (LoadInst *LI : Loads) {
9405 if (!VectorizedLoads.contains(LI))
9406 NonVectorized.push_back(LI);
9407 }
9408 return Results;
9409 };
9410 auto ProcessGatheredLoads =
9411 [&, &TTI = *TTI](
9412 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9413 bool Final = false) {
9414 SmallVector<LoadInst *> NonVectorized;
9415 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9416 GatheredLoads) {
9417 if (LoadsDists.size() <= 1) {
9418 NonVectorized.push_back(LoadsDists.back().first);
9419 continue;
9420 }
9421 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9422 LoadsDists);
9423 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9424 stable_sort(LocalLoadsDists, LoadSorter);
9425 SmallVector<LoadInst *> Loads;
9426 unsigned MaxConsecutiveDistance = 0;
9427 unsigned CurrentConsecutiveDist = 1;
9428 int64_t LastDist = LocalLoadsDists.front().second;
9429 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9430 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9431 if (isVectorized(L.first))
9432 continue;
9433 assert(LastDist >= L.second &&
9434 "Expected first distance always not less than second");
9435 if (static_cast<uint64_t>(LastDist - L.second) ==
9436 CurrentConsecutiveDist) {
9437 ++CurrentConsecutiveDist;
9438 MaxConsecutiveDistance =
9439 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9440 Loads.push_back(L.first);
9441 continue;
9442 }
9443 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9444 !Loads.empty())
9445 Loads.pop_back();
9446 CurrentConsecutiveDist = 1;
9447 LastDist = L.second;
9448 Loads.push_back(L.first);
9449 }
9450 if (Loads.size() <= 1)
9451 continue;
9452 if (AllowMaskedGather)
9453 MaxConsecutiveDistance = Loads.size();
9454 else if (MaxConsecutiveDistance < 2)
9455 continue;
9456 BoUpSLP::ValueSet VectorizedLoads;
9457 SmallVector<LoadInst *> SortedNonVectorized;
9458 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9459 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9460 Final, MaxConsecutiveDistance);
9461 if (!Results.empty() && !SortedNonVectorized.empty() &&
9462 OriginalLoads.size() == Loads.size() &&
9463 MaxConsecutiveDistance == Loads.size() &&
9464 any_of(Results,
9465 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9466 return P.second == LoadsState::ScatterVectorize;
9467 })) {
9468 VectorizedLoads.clear();
9469 SmallVector<LoadInst *> UnsortedNonVectorized;
9470 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9471 UnsortedResults =
9472 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9473 UnsortedNonVectorized, Final,
9474 OriginalLoads.size());
9475 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9476 SortedNonVectorized.swap(UnsortedNonVectorized);
9477 Results.swap(UnsortedResults);
9478 }
9479 }
9480 for (auto [Slice, _] : Results) {
9481 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9482 << Slice.size() << ")\n");
9483 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9484 for (Value *L : Slice)
9485 if (!isVectorized(L))
9486 SortedNonVectorized.push_back(cast<LoadInst>(L));
9487 continue;
9488 }
9489
9490 // Select maximum VF as a maximum of user gathered nodes and
9491 // distance between scalar loads in these nodes.
9492 unsigned MaxVF = Slice.size();
9493 unsigned UserMaxVF = 0;
9494 unsigned InterleaveFactor = 0;
9495 if (MaxVF == 2) {
9496 UserMaxVF = MaxVF;
9497 } else {
9498 // Find the distance between segments of the interleaved loads.
9499 std::optional<unsigned> InterleavedLoadsDistance = 0;
9500 unsigned Order = 0;
9501 std::optional<unsigned> CommonVF = 0;
9502 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9503 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9504 for (auto [Idx, V] : enumerate(Slice)) {
9505 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9506 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9507 unsigned Pos =
9508 EntryToPosition.try_emplace(E, Idx).first->second;
9509 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9510 if (CommonVF) {
9511 if (*CommonVF == 0) {
9512 CommonVF = E->Scalars.size();
9513 continue;
9514 }
9515 if (*CommonVF != E->Scalars.size())
9516 CommonVF.reset();
9517 }
9518 // Check if the load is part of an interleaved load.
9519 if (Pos != Idx && InterleavedLoadsDistance) {
9520 if (!DeinterleavedNodes.contains(E) &&
9521 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9522 if (isa<Constant>(V))
9523 return false;
9524 if (isVectorized(V))
9525 return true;
9526 const auto &Nodes = ValueToGatherNodes.at(V);
9527 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9528 !is_contained(Slice, V);
9529 })) {
9530 InterleavedLoadsDistance.reset();
9531 continue;
9532 }
9533 DeinterleavedNodes.insert(E);
9534 if (*InterleavedLoadsDistance == 0) {
9535 InterleavedLoadsDistance = Idx - Pos;
9536 continue;
9537 }
9538 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9539 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9540 InterleavedLoadsDistance.reset();
9541 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9542 }
9543 }
9544 }
9545 DeinterleavedNodes.clear();
9546 // Check if the large load represents an interleaved load operation.
9547 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9548 CommonVF.value_or(0) != 0) {
9549 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9550 unsigned VF = *CommonVF;
9551 OrdersType Order;
9552 SmallVector<Value *> PointerOps;
9553 StridedPtrInfo SPtrInfo;
9554 // Segmented load detected - vectorize at maximum vector factor.
9555 if (InterleaveFactor <= Slice.size() &&
9556 TTI.isLegalInterleavedAccessType(
9557 getWidenedType(Slice.front()->getType(), VF),
9558 InterleaveFactor,
9559 cast<LoadInst>(Slice.front())->getAlign(),
9560 cast<LoadInst>(Slice.front())
9561 ->getPointerAddressSpace()) &&
9562 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9563 SPtrInfo) == LoadsState::Vectorize) {
9564 UserMaxVF = InterleaveFactor * VF;
9565 } else {
9566 InterleaveFactor = 0;
9567 }
9568 }
9569 // Cannot represent the loads as consecutive vectorizable nodes -
9570 // just exit.
9571 unsigned ConsecutiveNodesSize = 0;
9572 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9573 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9574 [&, Slice = Slice](const auto &P) {
9575 const auto *It = find_if(Slice, [&](Value *V) {
9576 return std::get<1>(P).contains(V);
9577 });
9578 if (It == Slice.end())
9579 return false;
9580 const TreeEntry &TE =
9581 *VectorizableTree[std::get<0>(P)];
9582 ArrayRef<Value *> VL = TE.Scalars;
9583 OrdersType Order;
9584 SmallVector<Value *> PointerOps;
9585 StridedPtrInfo SPtrInfo;
9586 LoadsState State = canVectorizeLoads(
9587 VL, VL.front(), Order, PointerOps, SPtrInfo);
9588 if (State == LoadsState::ScatterVectorize ||
9589 State == LoadsState::StridedVectorize)
9590 return false;
9591 ConsecutiveNodesSize += VL.size();
9592 size_t Start = std::distance(Slice.begin(), It);
9593 size_t Sz = Slice.size() - Start;
9594 return Sz < VL.size() ||
9595 Slice.slice(Start, VL.size()) != VL;
9596 }))
9597 continue;
9598 // Try to build long masked gather loads.
9599 UserMaxVF = bit_ceil(UserMaxVF);
9600 if (InterleaveFactor == 0 &&
9601 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9602 [&, Slice = Slice](unsigned Idx) {
9603 OrdersType Order;
9604 SmallVector<Value *> PointerOps;
9605 StridedPtrInfo SPtrInfo;
9606 return canVectorizeLoads(
9607 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9608 Slice[Idx * UserMaxVF], Order, PointerOps,
9609 SPtrInfo) == LoadsState::ScatterVectorize;
9610 }))
9611 UserMaxVF = MaxVF;
9612 if (Slice.size() != ConsecutiveNodesSize)
9613 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9614 }
9615 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9616 bool IsVectorized = true;
9617 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9618 ArrayRef<Value *> SubSlice =
9619 Slice.slice(I, std::min(VF, E - I));
9620 if (isVectorized(SubSlice.front()))
9621 continue;
9622 // Check if the subslice is part of a to-be-vectorized entry, which is
9623 // not equal to this entry.
9624 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9625 [&](const auto &P) {
9626 return !SubSlice.equals(
9627 VectorizableTree[std::get<0>(P)]
9628 ->Scalars) &&
9629 set_is_subset(SubSlice, std::get<1>(P));
9630 }))
9631 continue;
9632 unsigned Sz = VectorizableTree.size();
9633 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9634 if (Sz == VectorizableTree.size()) {
9635 IsVectorized = false;
9636 // Try non-interleaved vectorization with smaller vector
9637 // factor.
9638 if (InterleaveFactor > 0) {
9639 VF = 2 * (MaxVF / InterleaveFactor);
9640 InterleaveFactor = 0;
9641 }
9642 continue;
9643 }
9644 }
9645 if (IsVectorized)
9646 break;
9647 }
9648 }
9649 NonVectorized.append(SortedNonVectorized);
9650 }
9651 return NonVectorized;
9652 };
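// Worked trace (illustrative only) of the consecutive-run detection inside
// ProcessGatheredLoads. With distances sorted in descending order
// {5, 4, 3, 1, 0} and masked gathers not supported:
//   5: starts a run (LastDist = 5, CurrentConsecutiveDist = 1)
//   4: 5 - 4 == 1  -> CurrentConsecutiveDist = 2
//   3: 5 - 3 == 2  -> CurrentConsecutiveDist = 3
//   1: 5 - 1 != 3  -> run restarts (LastDist = 1, CurrentConsecutiveDist = 1)
//   0: 1 - 0 == 1  -> CurrentConsecutiveDist = 2
// MaxConsecutiveDistance ends up as 3 and, when masked gathers are not
// allowed, is passed as MaxVF to GetVectorizedRanges.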
9653 for (const auto &GLs : GatheredLoads) {
9654 const auto &Ref = GLs.second;
9655 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9656 if (!Ref.empty() && !NonVectorized.empty() &&
9657 std::accumulate(
9658 Ref.begin(), Ref.end(), 0u,
9659 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9660 -> unsigned { return S + LoadsDists.size(); }) !=
9661 NonVectorized.size() &&
9662 IsMaskedGatherSupported(NonVectorized)) {
9663 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9664 FinalGatheredLoads;
9665 for (LoadInst *LI : NonVectorized) {
9666 // Reinsert non-vectorized loads into the other lists of loads with the
9667 // same base pointers.
9668 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9669 FinalGatheredLoads,
9670 /*AddNew=*/false);
9671 }
9672 // Final attempt to vectorize non-vectorized loads.
9673 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9674 }
9675 }
9676 // Try to vectorize postponed load entries, previously marked as gathered.
9677 for (unsigned Idx : LoadEntriesToVectorize) {
9678 const TreeEntry &E = *VectorizableTree[Idx];
9679 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9680 // Avoid reordering, if possible.
9681 if (!E.ReorderIndices.empty()) {
9682 // Build a mask out of the reorder indices and reorder scalars per this
9683 // mask.
9684 SmallVector<int> ReorderMask;
9685 inversePermutation(E.ReorderIndices, ReorderMask);
9686 reorderScalars(GatheredScalars, ReorderMask);
9687 }
9688 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9689 }
9690 // If no new entries were created, there are no gathered loads entries to
9691 // be handled.
9692 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9693 VectorizableTree.size())
9694 GatheredLoadsEntriesFirst.reset();
9695}
9696
9697/// Generates key/subkey pair for the given value to provide effective sorting
9698/// of the values and better detection of vectorizable value sequences. The
9699/// keys/subkeys can be used for better sorting of the values themselves (keys)
9700/// and within value subgroups (subkeys).
9701static std::pair<size_t, size_t> generateKeySubkey(
9702 Value *V, const TargetLibraryInfo *TLI,
9703 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9704 bool AllowAlternate) {
9705 hash_code Key = hash_value(V->getValueID() + 2);
9706 hash_code SubKey = hash_value(0);
9707 // Sort the loads by the distance between the pointers.
9708 if (auto *LI = dyn_cast<LoadInst>(V)) {
9709 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9710 if (LI->isSimple())
9711 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9712 else
9713 Key = SubKey = hash_value(LI);
9714 } else if (isVectorLikeInstWithConstOps(V)) {
9715 // Sort extracts by the vector operands.
9717 Key = hash_value(Value::UndefValueVal + 1);
9718 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9719 if (!isUndefVector(EI->getVectorOperand()).all() &&
9720 !isa<UndefValue>(EI->getIndexOperand()))
9721 SubKey = hash_value(EI->getVectorOperand());
9722 }
9723 } else if (auto *I = dyn_cast<Instruction>(V)) {
9724 // Sort other instructions just by the opcodes except for CMPInst.
9725 // For CMP also sort by the predicate kind.
9727 isValidForAlternation(I->getOpcode())) {
9728 if (AllowAlternate)
9729 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9730 else
9731 Key = hash_combine(hash_value(I->getOpcode()), Key);
9732 SubKey = hash_combine(
9733 hash_value(I->getOpcode()), hash_value(I->getType()),
9734 hash_value(isa<BinaryOperator>(I)
9735 ? I->getType()
9736 : cast<CastInst>(I)->getOperand(0)->getType()));
9737 // For casts, look through the only operand to improve compile time.
9738 if (isa<CastInst>(I)) {
9739 std::pair<size_t, size_t> OpVals =
9740 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9741 /*AllowAlternate=*/true);
9742 Key = hash_combine(OpVals.first, Key);
9743 SubKey = hash_combine(OpVals.first, SubKey);
9744 }
9745 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9746 CmpInst::Predicate Pred = CI->getPredicate();
9747 if (CI->isCommutative())
9748 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9749 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9750 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9751 hash_value(SwapPred),
9752 hash_value(CI->getOperand(0)->getType()));
9753 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9754 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9755 if (isTriviallyVectorizable(ID)) {
9756 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9757 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9758 SubKey = hash_combine(hash_value(I->getOpcode()),
9759 hash_value(Call->getCalledFunction()));
9760 } else {
9762 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9763 }
9764 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9765 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9766 hash_value(Op.Tag), SubKey);
9767 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9768 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9769 SubKey = hash_value(Gep->getPointerOperand());
9770 else
9771 SubKey = hash_value(Gep);
9772 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9773 !isa<ConstantInt>(I->getOperand(1))) {
9774 // Do not try to vectorize instructions with potentially high cost.
9775 SubKey = hash_value(I);
9776 } else {
9777 SubKey = hash_value(I->getOpcode());
9778 }
9779 Key = hash_combine(hash_value(I->getParent()), Key);
9780 }
9781 return std::make_pair(Key, SubKey);
9782}
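// Illustrative example (not from the original source): two simple loads of
// the same type share a key (a hash of the load type and Instruction::Load),
// while their subkeys come from LoadsSubkeyGenerator, typically derived from
// the pointer distance, so nearby loads sort next to each other. For an add
// and a sub in the same block with AllowAlternate set, both get the same key
// (a hash of 1 combined with the parent block) but different subkeys, which
// lets them be grouped as an alternate add/sub sequence.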
9783
9784/// Checks if the specified instruction \p I is a main operation for the given
9785/// \p MainOp and \p AltOp instructions.
9786static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9787 Instruction *AltOp, const TargetLibraryInfo &TLI);
9788
9789bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9790 ArrayRef<Value *> VL) const {
9791 Type *ScalarTy = S.getMainOp()->getType();
9792 unsigned Opcode0 = S.getOpcode();
9793 unsigned Opcode1 = S.getAltOpcode();
9794 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9795 // If this pattern is supported by the target then consider it profitable.
9796 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9797 Opcode1, OpcodeMask))
9798 return true;
9799 SmallVector<ValueList> Operands;
9800 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9801 Operands.emplace_back();
9802 // Prepare the operand vector.
9803 for (Value *V : VL) {
9804 if (isa<PoisonValue>(V)) {
9805 Operands.back().push_back(
9806 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9807 continue;
9808 }
9809 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9810 }
9811 }
9812 if (Operands.size() == 2) {
9813 // Try to find the best operand candidates.
9814 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9816 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9817 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9818 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9819 std::optional<int> Res = findBestRootPair(Candidates);
9820 switch (Res.value_or(0)) {
9821 case 0:
9822 break;
9823 case 1:
9824 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9825 break;
9826 case 2:
9827 std::swap(Operands[0][I], Operands[1][I]);
9828 break;
9829 default:
9830 llvm_unreachable("Unexpected index.");
9831 }
9832 }
9833 }
9834 DenseSet<unsigned> UniqueOpcodes;
9835 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9836 unsigned NonInstCnt = 0;
9837 // Estimate the number of instructions required for the vectorized node and
9838 // for the buildvector node.
9839 unsigned UndefCnt = 0;
9840 // Count the number of extra shuffles, required for vector nodes.
9841 unsigned ExtraShuffleInsts = 0;
9842 // Check that operands do not contain same values and create either perfect
9843 // diamond match or shuffled match.
9844 if (Operands.size() == 2) {
9845 // Do not count same operands twice.
9846 if (Operands.front() == Operands.back()) {
9847 Operands.erase(Operands.begin());
9848 } else if (!allConstant(Operands.front()) &&
9849 all_of(Operands.front(), [&](Value *V) {
9850 return is_contained(Operands.back(), V);
9851 })) {
9852 Operands.erase(Operands.begin());
9853 ++ExtraShuffleInsts;
9854 }
9855 }
9856 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9857 // Vectorize node, if:
9858 // 1. At least a single operand is constant or splat.
9859 // 2. Operands have many loop invariants (the instructions are not loop
9860 // invariants).
9861 // 3. At least a single unique operand is supposed to be vectorized.
9862 return none_of(Operands,
9863 [&](ArrayRef<Value *> Op) {
9864 if (allConstant(Op) ||
9865 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9866 getSameOpcode(Op, *TLI)))
9867 return false;
9868 DenseMap<Value *, unsigned> Uniques;
9869 for (Value *V : Op) {
9871 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9872 if (isa<UndefValue>(V))
9873 ++UndefCnt;
9874 continue;
9875 }
9876 auto Res = Uniques.try_emplace(V, 0);
9877 // Found first duplicate - need to add shuffle.
9878 if (!Res.second && Res.first->second == 1)
9879 ++ExtraShuffleInsts;
9880 ++Res.first->getSecond();
9881 if (auto *I = dyn_cast<Instruction>(V))
9882 UniqueOpcodes.insert(I->getOpcode());
9883 else if (Res.second)
9884 ++NonInstCnt;
9885 }
9886 return none_of(Uniques, [&](const auto &P) {
9887 return P.first->hasNUsesOrMore(P.second + 1) &&
9888 none_of(P.first->users(), [&](User *U) {
9889 return isVectorized(U) || Uniques.contains(U);
9890 });
9891 });
9892 }) ||
9893 // Do not vectorize node, if estimated number of vector instructions is
9894 // more than estimated number of buildvector instructions. Number of
9895 // vector operands is number of vector instructions + number of vector
9896 // instructions for operands (buildvectors). Number of buildvector
9897 // instructions is just number_of_operands * number_of_scalars.
9898 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9899 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9900 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9901}
9902
9903/// Builds the argument types vector for the given call instruction with the
9904/// given \p ID for the specified vector factor.
9905static SmallVector<Type *>
9906buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9907 const unsigned VF, unsigned MinBW,
9908 const TargetTransformInfo *TTI) {
9909 SmallVector<Type *> ArgTys;
9910 for (auto [Idx, Arg] : enumerate(CI->args())) {
9911 if (ID != Intrinsic::not_intrinsic) {
9912 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9913 ArgTys.push_back(Arg->getType());
9914 continue;
9915 }
9916 if (MinBW > 0) {
9917 ArgTys.push_back(
9918 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9919 continue;
9920 }
9921 }
9922 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9923 }
9924 return ArgTys;
9925}
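// Illustrative example (not from the original source), assuming the usual
// treatment of llvm.powi, whose exponent is a scalar operand: for
//   %r = call float @llvm.powi.f32.i32(float %x, i32 3)
// with VF = 4 and MinBW = 0, buildIntrinsicArgTypes() returns
//   { <4 x float>, i32 }
// i.e. vector-widened types for vector operands and the original scalar type
// for operands that must stay scalar in the vector intrinsic.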
9926
9927/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9928/// function (if possible) calls. Returns invalid cost for the corresponding
9929/// calls, if they cannot be vectorized/will be scalarized.
9930static std::pair<InstructionCost, InstructionCost>
9933 ArrayRef<Type *> ArgTys) {
9934 auto Shape = VFShape::get(CI->getFunctionType(),
9935 ElementCount::getFixed(VecTy->getNumElements()),
9936 false /*HasGlobalPred*/);
9937 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9938 auto LibCost = InstructionCost::getInvalid();
9939 if (!CI->isNoBuiltin() && VecFunc) {
9940 // Calculate the cost of the vector library call.
9941 // If the corresponding vector call is cheaper, return its cost.
9942 LibCost =
9943 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9944 }
9945 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9946
9947 // Calculate the cost of the vector intrinsic call.
9948 FastMathFlags FMF;
9949 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9950 FMF = FPCI->getFastMathFlags();
9951 const InstructionCost ScalarLimit = 10000;
9952 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9953 LibCost.isValid() ? LibCost : ScalarLimit);
9954 auto IntrinsicCost =
9955 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9956 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9957 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9958 IntrinsicCost = InstructionCost::getInvalid();
9959
9960 return {IntrinsicCost, LibCost};
9961}
9962
9963BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9964 const InstructionsState &S, ArrayRef<Value *> VL,
9965 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9966 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9967 assert(S.getMainOp() &&
9968 "Expected instructions with same/alternate opcodes only.");
9969
9970 unsigned ShuffleOrOp =
9971 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9972 Instruction *VL0 = S.getMainOp();
9973 switch (ShuffleOrOp) {
9974 case Instruction::PHI: {
9975 // Too many operands - gather, most probably won't be vectorized.
9976 if (VL0->getNumOperands() > MaxPHINumOperands)
9977 return TreeEntry::NeedToGather;
9978 // Check for terminator values (e.g. invoke).
9979 for (Value *V : VL) {
9980 auto *PHI = dyn_cast<PHINode>(V);
9981 if (!PHI)
9982 continue;
9983 for (Value *Incoming : PHI->incoming_values()) {
9984 Instruction *Term = dyn_cast<Instruction>(Incoming);
9985 if (Term && Term->isTerminator()) {
9986 LLVM_DEBUG(dbgs()
9987 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9988 return TreeEntry::NeedToGather;
9989 }
9990 }
9991 }
9992
9993 return TreeEntry::Vectorize;
9994 }
9995 case Instruction::ExtractElement:
9996 if (any_of(VL, [&](Value *V) {
9997 auto *EI = dyn_cast<ExtractElementInst>(V);
9998 if (!EI)
9999 return true;
10000 return isVectorized(EI->getOperand(0));
10001 }))
10002 return TreeEntry::NeedToGather;
10003 [[fallthrough]];
10004 case Instruction::ExtractValue: {
10005 bool Reuse = canReuseExtract(VL, CurrentOrder);
10006 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10007 // non-full registers).
10008 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10009 return TreeEntry::NeedToGather;
10010 if (Reuse || !CurrentOrder.empty())
10011 return TreeEntry::Vectorize;
10012 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10013 return TreeEntry::NeedToGather;
10014 }
10015 case Instruction::InsertElement: {
10016 // Check that we have a buildvector and not a shuffle of 2 or more
10017 // different vectors.
10018 ValueSet SourceVectors;
10019 for (Value *V : VL) {
10020 if (isa<PoisonValue>(V)) {
10021 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10022 return TreeEntry::NeedToGather;
10023 }
10024 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10025 assert(getElementIndex(V) != std::nullopt &&
10026 "Non-constant or undef index?");
10027 }
10028
10029 if (count_if(VL, [&SourceVectors](Value *V) {
10030 return !SourceVectors.contains(V);
10031 }) >= 2) {
10032 // Found 2nd source vector - cancel.
10033 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10034 "different source vectors.\n");
10035 return TreeEntry::NeedToGather;
10036 }
10037
10038 if (any_of(VL, [&SourceVectors](Value *V) {
10039 // The last InsertElement can have multiple uses.
10040 return SourceVectors.contains(V) && !V->hasOneUse();
10041 })) {
10042 assert(SLPReVec && "Only supported by REVEC.");
10043 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10044 "multiple uses.\n");
10045 return TreeEntry::NeedToGather;
10046 }
10047
10048 return TreeEntry::Vectorize;
10049 }
10050 case Instruction::Load: {
10051 // Check that a vectorized load would load the same memory as a scalar
10052 // load. For example, we don't want to vectorize loads that are smaller
10053 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10054 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10055 // from such a struct, we read/write packed bits disagreeing with the
10056 // unvectorized version.
10057 auto IsGatheredNode = [&]() {
10058 if (!GatheredLoadsEntriesFirst)
10059 return false;
10060 return all_of(VL, [&](Value *V) {
10061 if (isa<PoisonValue>(V))
10062 return true;
10063 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10064 return TE->Idx >= *GatheredLoadsEntriesFirst;
10065 });
10066 });
10067 };
10068 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10069 case LoadsState::Vectorize:
10070 return TreeEntry::Vectorize;
10071 case LoadsState::CompressVectorize:
10072 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10073 // Delay slow vectorized nodes for better vectorization attempts.
10074 LoadEntriesToVectorize.insert(VectorizableTree.size());
10075 return TreeEntry::NeedToGather;
10076 }
10077 return IsGatheredNode() ? TreeEntry::NeedToGather
10078 : TreeEntry::CompressVectorize;
10079 case LoadsState::ScatterVectorize:
10080 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10081 // Delay slow vectorized nodes for better vectorization attempts.
10082 LoadEntriesToVectorize.insert(VectorizableTree.size());
10083 return TreeEntry::NeedToGather;
10084 }
10085 return IsGatheredNode() ? TreeEntry::NeedToGather
10086 : TreeEntry::ScatterVectorize;
10087 case LoadsState::StridedVectorize:
10088 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10089 // Delay slow vectorized nodes for better vectorization attempts.
10090 LoadEntriesToVectorize.insert(VectorizableTree.size());
10091 return TreeEntry::NeedToGather;
10092 }
10093 return IsGatheredNode() ? TreeEntry::NeedToGather
10094 : TreeEntry::StridedVectorize;
10095 case LoadsState::Gather:
10096#ifndef NDEBUG
10097 Type *ScalarTy = VL0->getType();
10098 if (DL->getTypeSizeInBits(ScalarTy) !=
10099 DL->getTypeAllocSizeInBits(ScalarTy))
10100 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10101 else if (any_of(VL, [](Value *V) {
10102 auto *LI = dyn_cast<LoadInst>(V);
10103 return !LI || !LI->isSimple();
10104 }))
10105 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10106 else
10107 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10108#endif // NDEBUG
10109 registerNonVectorizableLoads(VL);
10110 return TreeEntry::NeedToGather;
10111 }
10112 llvm_unreachable("Unexpected state of loads");
10113 }
10114 case Instruction::ZExt:
10115 case Instruction::SExt:
10116 case Instruction::FPToUI:
10117 case Instruction::FPToSI:
10118 case Instruction::FPExt:
10119 case Instruction::PtrToInt:
10120 case Instruction::IntToPtr:
10121 case Instruction::SIToFP:
10122 case Instruction::UIToFP:
10123 case Instruction::Trunc:
10124 case Instruction::FPTrunc:
10125 case Instruction::BitCast: {
10126 Type *SrcTy = VL0->getOperand(0)->getType();
10127 for (Value *V : VL) {
10128 if (isa<PoisonValue>(V))
10129 continue;
10130 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10131 if (Ty != SrcTy || !isValidElementType(Ty)) {
10132 LLVM_DEBUG(
10133 dbgs() << "SLP: Gathering casts with different src types.\n");
10134 return TreeEntry::NeedToGather;
10135 }
10136 }
10137 return TreeEntry::Vectorize;
10138 }
10139 case Instruction::ICmp:
10140 case Instruction::FCmp: {
10141 // Check that all of the compares have the same predicate.
10142 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10143 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
10144 Type *ComparedTy = VL0->getOperand(0)->getType();
10145 for (Value *V : VL) {
10146 if (isa<PoisonValue>(V))
10147 continue;
10148 auto *Cmp = cast<CmpInst>(V);
10149 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10150 Cmp->getOperand(0)->getType() != ComparedTy) {
10151 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10152 return TreeEntry::NeedToGather;
10153 }
10154 }
10155 return TreeEntry::Vectorize;
10156 }
10157 case Instruction::Select:
10158 case Instruction::FNeg:
10159 case Instruction::Add:
10160 case Instruction::FAdd:
10161 case Instruction::Sub:
10162 case Instruction::FSub:
10163 case Instruction::Mul:
10164 case Instruction::FMul:
10165 case Instruction::UDiv:
10166 case Instruction::SDiv:
10167 case Instruction::FDiv:
10168 case Instruction::URem:
10169 case Instruction::SRem:
10170 case Instruction::FRem:
10171 case Instruction::Shl:
10172 case Instruction::LShr:
10173 case Instruction::AShr:
10174 case Instruction::And:
10175 case Instruction::Or:
10176 case Instruction::Xor:
10177 case Instruction::Freeze:
10178 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10179 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10180 auto *I = dyn_cast<Instruction>(V);
10181 return I && I->isBinaryOp() && !I->isFast();
10182 }))
10183 return TreeEntry::NeedToGather;
10184 return TreeEntry::Vectorize;
10185 case Instruction::GetElementPtr: {
10186 // We don't combine GEPs with complicated (nested) indexing.
10187 for (Value *V : VL) {
10188 auto *I = dyn_cast<GetElementPtrInst>(V);
10189 if (!I)
10190 continue;
10191 if (I->getNumOperands() != 2) {
10192 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10193 return TreeEntry::NeedToGather;
10194 }
10195 }
10196
10197 // We can't combine several GEPs into one vector if they operate on
10198 // different types.
10199 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10200 for (Value *V : VL) {
10201 auto *GEP = dyn_cast<GEPOperator>(V);
10202 if (!GEP)
10203 continue;
10204 Type *CurTy = GEP->getSourceElementType();
10205 if (Ty0 != CurTy) {
10206 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10207 return TreeEntry::NeedToGather;
10208 }
10209 }
10210
10211 // We don't combine GEPs with non-constant indexes.
10212 Type *Ty1 = VL0->getOperand(1)->getType();
10213 for (Value *V : VL) {
10214 auto *I = dyn_cast<GetElementPtrInst>(V);
10215 if (!I)
10216 continue;
10217 auto *Op = I->getOperand(1);
10218 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10219 (Op->getType() != Ty1 &&
10220 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10221 Op->getType()->getScalarSizeInBits() >
10222 DL->getIndexSizeInBits(
10223 V->getType()->getPointerAddressSpace())))) {
10224 LLVM_DEBUG(
10225 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10226 return TreeEntry::NeedToGather;
10227 }
10228 }
10229
10230 return TreeEntry::Vectorize;
10231 }
10232 case Instruction::Store: {
10233 // Check if the stores are consecutive or if we need to swizzle them.
10234 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10235 // Avoid types that are padded when being allocated as scalars, while
10236 // being packed together in a vector (such as i1).
10237 if (DL->getTypeSizeInBits(ScalarTy) !=
10238 DL->getTypeAllocSizeInBits(ScalarTy)) {
10239 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10240 return TreeEntry::NeedToGather;
10241 }
10242 // Make sure all stores in the bundle are simple - we can't vectorize
10243 // atomic or volatile stores.
10244 for (Value *V : VL) {
10245 auto *SI = cast<StoreInst>(V);
10246 if (!SI->isSimple()) {
10247 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10248 return TreeEntry::NeedToGather;
10249 }
10250 PointerOps.push_back(SI->getPointerOperand());
10251 }
10252
10253 // Check the order of pointer operands.
10254 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10255 Value *Ptr0;
10256 Value *PtrN;
10257 if (CurrentOrder.empty()) {
10258 Ptr0 = PointerOps.front();
10259 PtrN = PointerOps.back();
10260 } else {
10261 Ptr0 = PointerOps[CurrentOrder.front()];
10262 PtrN = PointerOps[CurrentOrder.back()];
10263 }
10264 std::optional<int64_t> Dist =
10265 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10266 // Check that the sorted pointer operands are consecutive.
10267 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10268 return TreeEntry::Vectorize;
10269 }
10270
10271 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10272 return TreeEntry::NeedToGather;
10273 }
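 // Illustrative example (not from the original source) for the store case
 // above: four i32 stores to %p, %p+1, %p+2, %p+3 (in any order) sort into a
 // CurrentOrder whose first and last pointers differ by VL.size() - 1 == 3
 // elements, so the bundle is marked Vectorize; a gap (e.g. %p, %p+1, %p+3,
 // %p+4) makes the distance 4 != 3 and the bundle is gathered instead.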
10274 case Instruction::Call: {
10275 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10276 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10277 auto *I = dyn_cast<Instruction>(V);
10278 return I && !I->isFast();
10279 }))
10280 return TreeEntry::NeedToGather;
10281 // Check if the calls are all to the same vectorizable intrinsic or
10282 // library function.
10283 CallInst *CI = cast<CallInst>(VL0);
10284 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10285
10286 VFShape Shape = VFShape::get(
10287 CI->getFunctionType(),
10288 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10289 false /*HasGlobalPred*/);
10290 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10291
10292 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10293 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10294 return TreeEntry::NeedToGather;
10295 }
10296 Function *F = CI->getCalledFunction();
10297 unsigned NumArgs = CI->arg_size();
10298 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10299 for (unsigned J = 0; J != NumArgs; ++J)
10300 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10301 ScalarArgs[J] = CI->getArgOperand(J);
10302 for (Value *V : VL) {
10303 CallInst *CI2 = dyn_cast<CallInst>(V);
10304 if (!CI2 || CI2->getCalledFunction() != F ||
10305 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10306 (VecFunc &&
10307 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10308 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
10309 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10310 << "\n");
10311 return TreeEntry::NeedToGather;
10312 }
10313 // Some intrinsics have scalar arguments and should be same in order for
10314 // them to be vectorized.
10315 for (unsigned J = 0; J != NumArgs; ++J) {
10316 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10317 Value *A1J = CI2->getArgOperand(J);
10318 if (ScalarArgs[J] != A1J) {
10319 LLVM_DEBUG(dbgs()
10320 << "SLP: mismatched arguments in call:" << *CI
10321 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10322 return TreeEntry::NeedToGather;
10323 }
10324 }
10325 }
10326 // Verify that the bundle operands are identical between the two calls.
10327 if (CI->hasOperandBundles() &&
10328 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10329 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10330 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10331 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10332 << "!=" << *V << '\n');
10333 return TreeEntry::NeedToGather;
10334 }
10335 }
10336 SmallVector<Type *> ArgTys =
10337 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10338 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10339 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10340 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10341 return TreeEntry::NeedToGather;
10342
10343 return TreeEntry::Vectorize;
10344 }
10345 case Instruction::ShuffleVector: {
10346 if (!S.isAltShuffle()) {
10347 // REVEC can support non alternate shuffle.
10349 return TreeEntry::Vectorize;
10350 // If this is not an alternate sequence of opcode like add-sub
10351 // then do not vectorize this instruction.
10352 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10353 return TreeEntry::NeedToGather;
10354 }
10355 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10356 LLVM_DEBUG(
10357 dbgs()
10358 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10359 "the whole alt sequence is not profitable.\n");
10360 return TreeEntry::NeedToGather;
10361 }
10362
10363 return TreeEntry::Vectorize;
10364 }
10365 default:
10366 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10367 return TreeEntry::NeedToGather;
10368 }
10369}
10370
10371namespace {
10372/// Allows to correctly handle operands of the phi nodes based on the \p Main
10373/// PHINode order of incoming basic blocks/values.
10374class PHIHandler {
10375 DominatorTree &DT;
10376 PHINode *Main = nullptr;
10377 ArrayRef<Value *> Phis;
10378 SmallVector<SmallVector<Value *>> Operands;
10379
10380public:
10381 PHIHandler() = delete;
10382 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10383 : DT(DT), Main(Main), Phis(Phis),
10384 Operands(Main->getNumIncomingValues(),
10385 SmallVector<Value *>(Phis.size(), nullptr)) {}
10386 void buildOperands() {
10387 constexpr unsigned FastLimit = 4;
10388 if (Main->getNumIncomingValues() <= FastLimit) {
10389 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10390 BasicBlock *InBB = Main->getIncomingBlock(I);
10391 if (!DT.isReachableFromEntry(InBB)) {
10392 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10393 continue;
10394 }
10395 // Prepare the operand vector.
10396 for (auto [Idx, V] : enumerate(Phis)) {
10397 auto *P = dyn_cast<PHINode>(V);
10398 if (!P) {
10399 assert(isa<PoisonValue>(V) &&
10400 "Expected isa instruction or poison value.");
10401 Operands[I][Idx] = V;
10402 continue;
10403 }
10404 if (P->getIncomingBlock(I) == InBB)
10405 Operands[I][Idx] = P->getIncomingValue(I);
10406 else
10407 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10408 }
10409 }
10410 return;
10411 }
10412 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10413 Blocks;
10414 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10415 BasicBlock *InBB = Main->getIncomingBlock(I);
10416 if (!DT.isReachableFromEntry(InBB)) {
10417 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10418 continue;
10419 }
10420 Blocks.try_emplace(InBB).first->second.push_back(I);
10421 }
10422 for (auto [Idx, V] : enumerate(Phis)) {
10423 if (isa<PoisonValue>(V)) {
10424 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10425 Operands[I][Idx] = V;
10426 continue;
10427 }
10428 auto *P = cast<PHINode>(V);
10429 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10430 BasicBlock *InBB = P->getIncomingBlock(I);
10431 if (InBB == Main->getIncomingBlock(I)) {
10432 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10433 continue;
10434 Operands[I][Idx] = P->getIncomingValue(I);
10435 continue;
10436 }
10437 auto *It = Blocks.find(InBB);
10438 if (It == Blocks.end())
10439 continue;
10440 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10441 }
10442 }
10443 for (const auto &P : Blocks) {
10444 ArrayRef<unsigned> IncomingValues = P.second;
10445 if (IncomingValues.size() <= 1)
10446 continue;
10447 unsigned BasicI = IncomingValues.consume_front();
10448 for (unsigned I : IncomingValues) {
10449 assert(all_of(enumerate(Operands[I]),
10450 [&](const auto &Data) {
10451 return !Data.value() ||
10452 Data.value() == Operands[BasicI][Data.index()];
10453 }) &&
10454 "Expected empty operands list.");
10455 Operands[I] = Operands[BasicI];
10456 }
10457 }
10458 }
10459 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10460};
10461} // namespace
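// Illustrative example (not from the original source) of what PHIHandler
// normalizes. Assuming two phis in the same block whose incoming blocks are
// listed in different orders:
//   %p0 = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
//   %p1 = phi i32 [ %d, %bb2 ], [ %c, %bb1 ]
// with %p0 as Main, buildOperands() produces operands per incoming block of
// Main: getOperands(0) == {%a, %c} (for %bb1) and getOperands(1) == {%b, %d}
// (for %bb2), so each operand bundle lines up by control-flow edge rather
// than by phi operand position.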
10462
10463/// Returns main/alternate instructions for the given \p VL. Unlike
10464/// getSameOpcode, it supports non-compatible instructions for better
10465/// SplitVectorize node support.
10466/// \returns the first main/alt instructions if the list contains only poisons
10467/// and instructions with exactly 2 opcodes; a pair of nullptrs otherwise.
10468static std::pair<Instruction *, Instruction *>
10470 Instruction *MainOp = nullptr;
10471 Instruction *AltOp = nullptr;
10472 for (Value *V : VL) {
10473 if (isa<PoisonValue>(V))
10474 continue;
10475 auto *I = dyn_cast<Instruction>(V);
10476 if (!I)
10477 return {};
10478 if (!MainOp) {
10479 MainOp = I;
10480 continue;
10481 }
10482 if (MainOp->getOpcode() == I->getOpcode()) {
10483 if (I->getParent() != MainOp->getParent())
10484 return {};
10485 continue;
10486 }
10487 if (!AltOp) {
10488 AltOp = I;
10489 continue;
10490 }
10491 if (AltOp->getOpcode() == I->getOpcode()) {
10492 if (I->getParent() != AltOp->getParent())
10493 return {};
10494 continue;
10495 }
10496 return {};
10497 }
10498 if (!AltOp)
10499 return {};
10500 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10501 "Expected different main and alt instructions.");
10502 return std::make_pair(MainOp, AltOp);
10503}
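// Illustrative example (not from the original source): for
// VL = {add, poison, sub, add} with all instructions in one block, the helper
// above returns {add, sub}; for a VL containing three distinct opcodes
// (e.g. add, sub, mul), or instructions with the same opcode from different
// blocks, it returns {nullptr, nullptr}.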
10504
10505/// Checks that every instruction appears once in the list and if not, packs
10506/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10507/// unique scalars is extended by poison values to the whole register size.
10508///
10509/// \returns false if \p VL could not be uniquified, in which case \p VL is
10510/// unchanged and \p ReuseShuffleIndices is empty.
10511static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10512 SmallVectorImpl<int> &ReuseShuffleIndices,
10513 const TargetTransformInfo &TTI,
10514 const TargetLibraryInfo &TLI,
10515 const InstructionsState &S,
10516 const BoUpSLP::EdgeInfo &UserTreeIdx,
10517 bool TryPad = false) {
10518 // Check that every instruction appears once in this bundle.
10519 SmallVector<Value *> UniqueValues;
10520 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10521 for (Value *V : VL) {
10522 if (isConstant(V)) {
10523 // Constants are always considered distinct, even if the same constant
10524 // appears multiple times in VL.
10525 ReuseShuffleIndices.emplace_back(
10526 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10527 UniqueValues.emplace_back(V);
10528 continue;
10529 }
10530 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10531 ReuseShuffleIndices.emplace_back(Res.first->second);
10532 if (Res.second)
10533 UniqueValues.emplace_back(V);
10534 }
10535
10536 // Easy case: VL has unique values and a "natural" size
10537 size_t NumUniqueScalarValues = UniqueValues.size();
10538 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10539 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10540 if (NumUniqueScalarValues == VL.size() &&
10541 (VectorizeNonPowerOf2 || IsFullVectors)) {
10542 ReuseShuffleIndices.clear();
10543 return true;
10544 }
10545
10546 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10547 if ((UserTreeIdx.UserTE &&
10548 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10550 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10551 "for nodes with padding.\n");
10552 ReuseShuffleIndices.clear();
10553 return false;
10554 }
10555
10556 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
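 // The unique scalars do not fill a whole register or are effectively a splat:
 // try to pad them with poison up to a full vector; if that is not possible,
 // report failure so the caller can gather the bundle instead.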
10557 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10558 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10559 return isa<UndefValue>(V) || !isConstant(V);
10560 }))) {
10561 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10562 S.getMainOp()->isSafeToRemove() &&
10563 (S.areInstructionsWithCopyableElements() ||
10564 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10565 // Find the number of elements which form full vectors.
10566 unsigned PWSz = getFullVectorNumberOfElements(
10567 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10568 PWSz = std::min<unsigned>(PWSz, VL.size());
10569 if (PWSz == VL.size()) {
10570 // We ended up with the same size after removing duplicates and
10571 // upgrading the resulting vector size to a "nice size". Just keep
10572 // the initial VL then.
10573 ReuseShuffleIndices.clear();
10574 } else {
10575 // Pad unique values with poison to grow the vector to a "nice" size
10576 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10577 UniqueValues.end());
10578 PaddedUniqueValues.append(
10579 PWSz - UniqueValues.size(),
10580 PoisonValue::get(UniqueValues.front()->getType()));
10581 // Check that the operations, extended with poisons/copyable elements, are
10582 // still valid for vectorization (div/rem are not allowed).
10583 if ((!S.areInstructionsWithCopyableElements() &&
10584 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10585 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10586 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10587 isa<CallInst>(S.getMainOp())))) {
10588 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10589 ReuseShuffleIndices.clear();
10590 return false;
10591 }
10592 VL = std::move(PaddedUniqueValues);
10593 }
10594 return true;
10595 }
10596 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10597 ReuseShuffleIndices.clear();
10598 return false;
10599 }
10600 VL = std::move(UniqueValues);
10601 return true;
10602}
10603
10604bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10605 const InstructionsState &LocalState,
10606 SmallVectorImpl<Value *> &Op1,
10607 SmallVectorImpl<Value *> &Op2,
10608 OrdersType &ReorderIndices) const {
10609 constexpr unsigned SmallNodeSize = 4;
10610 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10611 !SplitAlternateInstructions)
10612 return false;
10613
10614 // Check if this is a duplicate of another split entry.
10615 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10616 << ".\n");
10617 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10618 if (E->isSame(VL)) {
10619 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10620 << *LocalState.getMainOp() << ".\n");
10621 return false;
10622 }
10623 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10624 if (all_of(VL, [&](Value *V) {
10625 return isa<PoisonValue>(V) || Values.contains(V);
10626 })) {
10627 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10628 return false;
10629 }
10630 }
10631
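 // Partition VL into main-opcode scalars (Op1) and alternate-opcode scalars
 // (Op2), remembering which original positions belong to Op1.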
10632 ReorderIndices.assign(VL.size(), VL.size());
10633 SmallBitVector Op1Indices(VL.size());
10634 for (auto [Idx, V] : enumerate(VL)) {
10635 auto *I = dyn_cast<Instruction>(V);
10636 if (!I) {
10637 Op1.push_back(V);
10638 Op1Indices.set(Idx);
10639 continue;
10640 }
10641 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10642 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10643 *TLI)) ||
10644 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10645 !isAlternateInstruction(I, LocalState.getMainOp(),
10646 LocalState.getAltOp(), *TLI))) {
10647 Op1.push_back(V);
10648 Op1Indices.set(Idx);
10649 continue;
10650 }
10651 Op2.push_back(V);
10652 }
10653 Type *ScalarTy = getValueType(VL.front());
10654 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10655 unsigned Opcode0 = LocalState.getOpcode();
10656 unsigned Opcode1 = LocalState.getAltOpcode();
10657 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10658 // Enable the split node only if the scalars do not form a legal alternate
10659 // instruction (like X86 addsub).
10660 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10661 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10662 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10663 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10664 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10665 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10666 return false;
10667 // Enable split node, only if all nodes are power-of-2/full registers.
10668 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10669 for (unsigned Idx : seq<unsigned>(VL.size())) {
10670 if (Op1Indices.test(Idx)) {
10671 ReorderIndices[Op1Cnt] = Idx;
10672 ++Op1Cnt;
10673 } else {
10674 ReorderIndices[Op2Cnt] = Idx;
10675 ++Op2Cnt;
10676 }
10677 }
10678 if (isIdentityOrder(ReorderIndices))
10679 ReorderIndices.clear();
10680 SmallVector<int> Mask;
10681 if (!ReorderIndices.empty())
10682 inversePermutation(ReorderIndices, Mask);
10683 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10684 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10685 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10686 // Check for non-profitable single-register ops, which are better represented
10687 // as alternate ops.
10688 if (NumParts >= VL.size())
10689 return false;
10690 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10691 InstructionCost InsertCost = ::getShuffleCost(
10692 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10693 FixedVectorType *SubVecTy =
10694 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10695 InstructionCost NewShuffleCost =
10696 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10697 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10698 (Mask.empty() || InsertCost >= NewShuffleCost))
10699 return false;
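 // For binary/cast/unary alternate opcodes, compare the cost of a single wide
 // alternate node (two wide ops + a two-source shuffle) against the split form
 // (two narrower ops + a subvector insert) and bail out if splitting is not
 // cheaper.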
10700 if ((LocalState.getMainOp()->isBinaryOp() &&
10701 LocalState.getAltOp()->isBinaryOp() &&
10702 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10703 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10704 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10705 (LocalState.getMainOp()->isUnaryOp() &&
10706 LocalState.getAltOp()->isUnaryOp())) {
10707 InstructionCost OriginalVecOpsCost =
10708 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10709 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10710 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10711 for (unsigned Idx : seq<unsigned>(VL.size())) {
10712 if (isa<PoisonValue>(VL[Idx]))
10713 continue;
10714 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10715 }
10716 InstructionCost OriginalCost =
10717 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10718 VecTy, OriginalMask, Kind);
10719 InstructionCost NewVecOpsCost =
10720 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10721 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10722 InstructionCost NewCost =
10723 NewVecOpsCost + InsertCost +
10724 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10725 VectorizableTree.front()->getOpcode() == Instruction::Store
10726 ? NewShuffleCost
10727 : 0);
10728 // If not profitable to split - exit.
10729 if (NewCost >= OriginalCost)
10730 return false;
10731 }
10732 return true;
10733}
10734
10735namespace {
10736/// Class accepts an incoming list of values, checks if it is able to model
10737/// "copyable" values as compatible operations, and generates the list of values
10738/// for scheduling and the list of operands for the new nodes.
10739class InstructionsCompatibilityAnalysis {
10740 DominatorTree &DT;
10741 const DataLayout &DL;
10742 const TargetTransformInfo &TTI;
10743 const TargetLibraryInfo &TLI;
10744 unsigned MainOpcode = 0;
10745 Instruction *MainOp = nullptr;
10746
10747 /// Checks if the opcode is supported as the main opcode for copyable
10748 /// elements.
10749 static bool isSupportedOpcode(const unsigned Opcode) {
10750 return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10751 Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10752 Opcode == Instruction::UDiv || Opcode == Instruction::And ||
10753 Opcode == Instruction::Or || Opcode == Instruction::Xor;
10754 }
10755
10756 /// Identifies the best candidate value, which represents the main opcode
10757 /// operation.
10758 /// Currently the best candidate is an instruction with a supported opcode whose
10759 /// parent block has the highest DFS-in number (the block that dominates the others).
10760 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10761 BasicBlock *Parent = nullptr;
10762 // Checks if the instruction has supported opcode.
10763 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10764 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10765 return false;
10766 return I && isSupportedOpcode(I->getOpcode()) &&
10767 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10768 };
10769 // Exclude operand instructions immediately to improve compile time; they
10770 // cannot be scheduled anyway.
10771 SmallDenseSet<Value *, 8> Operands;
10772 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10773 bool AnyUndef = false;
10774 for (Value *V : VL) {
10775 auto *I = dyn_cast<Instruction>(V);
10776 if (!I) {
10777 AnyUndef |= isa<UndefValue>(V);
10778 continue;
10779 }
10780 if (!DT.isReachableFromEntry(I->getParent()))
10781 continue;
10782 if (Candidates.empty()) {
10783 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10784 Parent = I->getParent();
10785 Operands.insert(I->op_begin(), I->op_end());
10786 continue;
10787 }
10788 if (Parent == I->getParent()) {
10789 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10790 Operands.insert(I->op_begin(), I->op_end());
10791 continue;
10792 }
10793 auto *NodeA = DT.getNode(Parent);
10794 auto *NodeB = DT.getNode(I->getParent());
10795 assert(NodeA && "Should only process reachable instructions");
10796 assert(NodeB && "Should only process reachable instructions");
10797 assert((NodeA == NodeB) ==
10798 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10799 "Different nodes should have different DFS numbers");
10800 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10801 Candidates.clear();
10802 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10803 Parent = I->getParent();
10804 Operands.clear();
10805 Operands.insert(I->op_begin(), I->op_end());
10806 }
10807 }
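 // Among the collected candidates, pick the supported opcode with the most
 // instructions, skipping instructions that are operands of other candidates.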
10808 unsigned BestOpcodeNum = 0;
10809 MainOp = nullptr;
10810 for (const auto &P : Candidates) {
10811 if (P.second.size() < BestOpcodeNum)
10812 continue;
10813 for (Instruction *I : P.second) {
10814 if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
10815 MainOp = I;
10816 BestOpcodeNum = P.second.size();
10817 break;
10818 }
10819 }
10820 }
10821 if (MainOp) {
10822 // Do not match if any copyable is a terminator from the same block as
10823 // the main operation.
10824 if (any_of(VL, [&](Value *V) {
10825 auto *I = dyn_cast<Instruction>(V);
10826 return I && I->getParent() == MainOp->getParent() &&
10827 I->isTerminator();
10828 })) {
10829 MainOp = nullptr;
10830 return;
10831 }
10832 MainOpcode = MainOp->getOpcode();
10833 }
10834 }
10835
10836 /// Returns the idempotent value for the \p MainOp with the detected \p
10837 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10838 /// the operand itself, since V or V == V.
10839 Value *selectBestIdempotentValue() const {
10840 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10841 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10842 !MainOp->isCommutative());
10843 }
10844
10845 /// Returns the value and operands for \p V. If it is an original
10846 /// instruction, its actual operands are returned; if it is a copyable
10847 /// element, it is represented as an idempotent instruction.
10848 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10849 if (isa<PoisonValue>(V))
10850 return {V, V};
10851 if (!S.isCopyableElement(V))
10852 return convertTo(cast<Instruction>(V), S).second;
10853 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10854 return {V, selectBestIdempotentValue()};
10855 }
10856
10857 /// Builds operands for the original instructions.
10858 void
10859 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10860 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10861
10862 unsigned ShuffleOrOp =
10863 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10864 Instruction *VL0 = S.getMainOp();
10865
10866 switch (ShuffleOrOp) {
10867 case Instruction::PHI: {
10868 auto *PH = cast<PHINode>(VL0);
10869
10870 // Keeps the reordered operands to avoid code duplication.
10871 PHIHandler Handler(DT, PH, VL);
10872 Handler.buildOperands();
10873 Operands.assign(PH->getNumOperands(), {});
10874 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10875 Operands[I].assign(Handler.getOperands(I).begin(),
10876 Handler.getOperands(I).end());
10877 return;
10878 }
10879 case Instruction::ExtractValue:
10880 case Instruction::ExtractElement:
10881 // This is a special case, as it does not gather, but at the same time
10882 // we are not extending buildTreeRec() towards the operands.
10883 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10884 return;
10885 case Instruction::InsertElement:
10886 Operands.assign(2, {VL.size(), nullptr});
10887 for (auto [Idx, V] : enumerate(VL)) {
10888 auto *IE = cast<InsertElementInst>(V);
10889 for (auto [OpIdx, Ops] : enumerate(Operands))
10890 Ops[Idx] = IE->getOperand(OpIdx);
10891 }
10892 return;
10893 case Instruction::Load:
10894 Operands.assign(
10895 1, {VL.size(),
10896 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10897 for (auto [V, Op] : zip(VL, Operands.back())) {
10898 auto *LI = dyn_cast<LoadInst>(V);
10899 if (!LI)
10900 continue;
10901 Op = LI->getPointerOperand();
10902 }
10903 return;
10904 case Instruction::ZExt:
10905 case Instruction::SExt:
10906 case Instruction::FPToUI:
10907 case Instruction::FPToSI:
10908 case Instruction::FPExt:
10909 case Instruction::PtrToInt:
10910 case Instruction::IntToPtr:
10911 case Instruction::SIToFP:
10912 case Instruction::UIToFP:
10913 case Instruction::Trunc:
10914 case Instruction::FPTrunc:
10915 case Instruction::BitCast:
10916 case Instruction::ICmp:
10917 case Instruction::FCmp:
10918 case Instruction::Select:
10919 case Instruction::FNeg:
10920 case Instruction::Add:
10921 case Instruction::FAdd:
10922 case Instruction::Sub:
10923 case Instruction::FSub:
10924 case Instruction::Mul:
10925 case Instruction::FMul:
10926 case Instruction::UDiv:
10927 case Instruction::SDiv:
10928 case Instruction::FDiv:
10929 case Instruction::URem:
10930 case Instruction::SRem:
10931 case Instruction::FRem:
10932 case Instruction::Shl:
10933 case Instruction::LShr:
10934 case Instruction::AShr:
10935 case Instruction::And:
10936 case Instruction::Or:
10937 case Instruction::Xor:
10938 case Instruction::Freeze:
10939 case Instruction::Store:
10940 case Instruction::ShuffleVector:
10941 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10942 for (auto [Idx, V] : enumerate(VL)) {
10943 auto *I = dyn_cast<Instruction>(V);
10944 if (!I) {
10945 for (auto [OpIdx, Ops] : enumerate(Operands))
10946 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10947 continue;
10948 }
10949 auto [Op, ConvertedOps] = convertTo(I, S);
10950 for (auto [OpIdx, Ops] : enumerate(Operands))
10951 Ops[Idx] = ConvertedOps[OpIdx];
10952 }
10953 return;
10954 case Instruction::GetElementPtr: {
10955 Operands.assign(2, {VL.size(), nullptr});
10956 // Need to cast all indices to the same type before vectorization to
10957 // avoid crash.
10958 // Required to be able to find correct matches between different gather
10959 // nodes and reuse the vectorized values rather than trying to gather them
10960 // again.
10961 const unsigned IndexIdx = 1;
10962 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10963 Type *Ty =
10964 all_of(VL,
10965 [&](Value *V) {
10966 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10967 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10968 })
10969 ? VL0Ty
10970 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10971 ->getPointerOperandType()
10972 ->getScalarType());
10973 for (auto [Idx, V] : enumerate(VL)) {
10974 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10975 if (!GEP) {
10976 Operands[0][Idx] = V;
10977 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10978 continue;
10979 }
10980 Operands[0][Idx] = GEP->getPointerOperand();
10981 auto *Op = GEP->getOperand(IndexIdx);
10982 auto *CI = dyn_cast<ConstantInt>(Op);
10983 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10984 CI, Ty, CI->getValue().isSignBitSet(), DL)
10985 : Op;
10986 }
10987 return;
10988 }
10989 case Instruction::Call: {
10990 auto *CI = cast<CallInst>(VL0);
10991 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
10992 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10993 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
10994 continue;
10995 auto &Ops = Operands.emplace_back();
10996 for (Value *V : VL) {
10997 auto *I = dyn_cast<Instruction>(V);
10998 Ops.push_back(I ? I->getOperand(Idx)
10999 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
11000 }
11001 }
11002 return;
11003 }
11004 default:
11005 break;
11006 }
11007 llvm_unreachable("Unexpected vectorization of the instructions.");
11008 }
11009
11010public:
11011 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11012 const TargetTransformInfo &TTI,
11013 const TargetLibraryInfo &TLI)
11014 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11015
11016 InstructionsState
11017 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11018 bool TryCopyableElementsVectorization,
11019 bool WithProfitabilityCheck = false,
11020 bool SkipSameCodeCheck = false) {
11021 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11022 ? InstructionsState::invalid()
11023 : getSameOpcode(VL, TLI);
11024 if (S)
11025 return S;
11026 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11027 return S;
11028 findAndSetMainInstruction(VL, R);
11029 if (!MainOp)
11030 return InstructionsState::invalid();
11031 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11032 if (!WithProfitabilityCheck)
11033 return S;
11034 // Check if it is profitable to vectorize the instruction.
11035 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11036 auto BuildCandidates =
11037 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11038 Value *V2) {
11039 if (V1 != V2 && isa<PHINode>(V1))
11040 return;
11041 auto *I1 = dyn_cast<Instruction>(V1);
11042 auto *I2 = dyn_cast<Instruction>(V2);
11043 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11044 I1->getParent() != I2->getParent())
11045 return;
11046 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11047 };
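 // For a 2-element bundle, accept the copyable match only if both operand
 // pairs look vectorizable (directly or after commuting for commutative ops)
 // and the vector form of the main opcode is not more expensive than scalar.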
11048 if (VL.size() == 2) {
11049 // Check if the operands allow better vectorization.
11050 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11051 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11052 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11053 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11054 R.findBestRootPair(Candidates1) &&
11055 R.findBestRootPair(Candidates2);
11056 if (!Res && isCommutative(MainOp)) {
11057 Candidates1.clear();
11058 Candidates2.clear();
11059 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11060 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11061 Res = !Candidates1.empty() && !Candidates2.empty() &&
11062 R.findBestRootPair(Candidates1) &&
11063 R.findBestRootPair(Candidates2);
11064 }
11065 if (!Res)
11066 return InstructionsState::invalid();
11067 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11068 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11069 InstructionCost VectorCost;
11070 FixedVectorType *VecTy =
11071 getWidenedType(S.getMainOp()->getType(), VL.size());
11072 switch (MainOpcode) {
11073 case Instruction::Add:
11074 case Instruction::LShr:
11075 case Instruction::Shl:
11076 case Instruction::SDiv:
11077 case Instruction::UDiv:
11078 case Instruction::And:
11079 case Instruction::Or:
11080 case Instruction::Xor:
11081 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11082 break;
11083 default:
11084 llvm_unreachable("Unexpected instruction.");
11085 }
11086 if (VectorCost > ScalarCost)
11087 return InstructionsState::invalid();
11088 return S;
11089 }
11090 assert(Operands.size() == 2 && "Unexpected number of operands!");
11091 unsigned CopyableNum =
11092 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11093 if (CopyableNum < VL.size() / 2)
11094 return S;
11095 // Too many phi copyables - exit.
11096 const unsigned Limit = VL.size() / 24;
11097 if ((CopyableNum >= VL.size() - Limit ||
11098 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11099 CopyableNum >= MaxPHINumOperands) &&
11100 all_of(VL, [&](Value *V) {
11101 return isa<PHINode>(V) || !S.isCopyableElement(V);
11102 }))
11103 return InstructionsState::invalid();
11104 // Check profitability if number of copyables > VL.size() / 2.
11105 // 1. Reorder operands for better matching.
11106 if (isCommutative(MainOp)) {
11107 for (auto &Ops : Operands) {
11108 // Make instructions the first operands.
11109 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11110 std::swap(Ops.front(), Ops.back());
11111 continue;
11112 }
11113 // Make constants the second operands.
11114 if (isa<Constant>(Ops.front())) {
11115 std::swap(Ops.front(), Ops.back());
11116 continue;
11117 }
11118 }
11119 }
11120 // 2. Check if operands can be vectorized.
11121 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11122 return InstructionsState::invalid();
11123 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11124 if (allConstant(Ops) || isSplat(Ops))
11125 return true;
11126 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11127 // single one is different.
11128 constexpr unsigned Limit = 4;
11129 if (Operands.front().size() >= Limit) {
11130 SmallDenseMap<const Value *, unsigned> Counters;
11131 for (Value *V : Ops) {
11132 if (isa<UndefValue>(V))
11133 continue;
11134 ++Counters[V];
11135 }
11136 if (Counters.size() == 2 &&
11137 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11138 return C.second == 1;
11139 }))
11140 return true;
11141 }
11142 // First operand not a constant or splat? Last attempt - check for
11143 // potential vectorization.
11144 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11145 InstructionsState OpS = Analysis.buildInstructionsState(
11146 Ops, R, /*TryCopyableElementsVectorization=*/true);
11147 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11148 return false;
11149 unsigned CopyableNum =
11150 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11151 return CopyableNum <= VL.size() / 2;
11152 };
11153 if (!CheckOperand(Operands.front()))
11154 return InstructionsState::invalid();
11155
11156 return S;
11157 }
11158
11159 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11160 ArrayRef<Value *> VL) {
11161 assert(S && "Invalid state!");
11162 SmallVector<BoUpSLP::ValueList> Operands;
11163 if (S.areInstructionsWithCopyableElements()) {
11164 MainOp = S.getMainOp();
11165 MainOpcode = S.getOpcode();
11166 Operands.assign(MainOp->getNumOperands(),
11167 BoUpSLP::ValueList(VL.size(), nullptr));
11168 for (auto [Idx, V] : enumerate(VL)) {
11169 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11170 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11171 Operands[OperandIdx][Idx] = Operand;
11172 }
11173 } else {
11174 buildOriginalOperands(S, VL, Operands);
11175 }
11176 return Operands;
11177 }
11178};
11179} // namespace
11180
11181BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11182 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11183 bool TryCopyableElementsVectorization) const {
11184 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11185
11186 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11187 InstructionsState S = Analysis.buildInstructionsState(
11188 VL, *this, TryCopyableElementsVectorization,
11189 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11190
11191 // Don't go into catchswitch blocks, which can happen with PHIs.
11192 // Such blocks can only have PHIs and the catchswitch. There is no
11193 // place to insert a shuffle if we need to, so just avoid that issue.
11194 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11195 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11196 // Do not try to pack to avoid extra instructions here.
11197 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11198 /*TryToFindDuplicates=*/false);
11199 }
11200
11201 // Check if this is a duplicate of another entry.
11202 if (S) {
11203 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11204 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11205 if (E->isSame(VL)) {
11206 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11207 << ".\n");
11208 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11209 }
11210 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11211 if (all_of(VL, [&](Value *V) {
11212 return isa<PoisonValue>(V) || Values.contains(V) ||
11213 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11214 LI->getLoopFor(S.getMainOp()->getParent()) &&
11215 isVectorized(V));
11216 })) {
11217 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11218 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11219 }
11220 }
11221 }
11222
11223 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11224 // a load), in which case peek through to include it in the tree, without
11225 // ballooning over-budget.
11226 if (Depth >= RecursionMaxDepth &&
11227 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11228 (match(S.getMainOp(), m_Load(m_Value())) ||
11229 all_of(VL, [&S](const Value *I) {
11230 return match(I,
11231 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11232 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11233 })))) {
11234 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11235 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11236 }
11237
11238 // Don't handle scalable vectors
11239 if (S && S.getOpcode() == Instruction::ExtractElement &&
11240 isa<ScalableVectorType>(
11241 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11242 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11243 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11244 }
11245
11246 // Don't handle vectors.
11247 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11248 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11249 // Do not try to pack to avoid extra instructions here.
11250 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11251 /*TryToFindDuplicates=*/false);
11252 }
11253
11254 // If all of the operands are identical or constant we have a simple solution.
11255 // If we deal with insert/extract instructions, they all must have constant
11256 // indices, otherwise we should gather them, not try to vectorize.
11257 // If alternate op node with 2 elements with gathered operands - do not
11258 // vectorize.
11259 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11260 if (!S || !S.isAltShuffle() || VL.size() > 2)
11261 return false;
11262 if (VectorizableTree.size() < MinTreeSize)
11263 return false;
11264 if (Depth >= RecursionMaxDepth - 1)
11265 return true;
11266 // Check if all operands are extracts, part of vector node or can build a
11267 // regular vectorize node.
11268 SmallVector<unsigned, 8> InstsCount;
11269 for (Value *V : VL) {
11270 auto *I = cast<Instruction>(V);
11271 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11272 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11273 }));
11274 }
11275 bool IsCommutative =
11276 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11277 if ((IsCommutative &&
11278 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11279 (!IsCommutative &&
11280 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11281 return true;
11282 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11283 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11284 auto *I1 = cast<Instruction>(VL.front());
11285 auto *I2 = cast<Instruction>(VL.back());
11286 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11287 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11288 I2->getOperand(Op));
11289 if (static_cast<unsigned>(count_if(
11290 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11292 })) >= S.getMainOp()->getNumOperands() / 2)
11293 return false;
11294 if (S.getMainOp()->getNumOperands() > 2)
11295 return true;
11296 if (IsCommutative) {
11297 // Check permuted operands.
11298 Candidates.clear();
11299 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11300 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11301 I2->getOperand((Op + 1) % E));
11302 if (any_of(
11303 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11305 }))
11306 return false;
11307 }
11308 return true;
11309 };
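 // A bundle is treated as a same-block bundle either if all scalars form a
 // valid instruction state, or if it is a pointer bundle for a ScatterVectorize
 // user whose GEPs all live in one block and can be sorted by address.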
11310 SmallVector<unsigned> SortedIndices;
11311 BasicBlock *BB = nullptr;
11312 bool IsScatterVectorizeUserTE =
11313 UserTreeIdx.UserTE &&
11314 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11315 bool AreAllSameBlock = S.valid();
11316 bool AreScatterAllGEPSameBlock =
11317 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11318 VL.size() > 2 &&
11319 all_of(VL,
11320 [&BB](Value *V) {
11321 auto *I = dyn_cast<GetElementPtrInst>(V);
11322 if (!I)
11323 return doesNotNeedToBeScheduled(V);
11324 if (!BB)
11325 BB = I->getParent();
11326 return BB == I->getParent() && I->getNumOperands() == 2;
11327 }) &&
11328 BB &&
11329 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11330 SortedIndices));
11331 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11332 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11333 (S &&
11334 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11335 S.getMainOp()) &&
11336 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11337 NotProfitableForVectorization(VL)) {
11338 if (!S) {
11339 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11340 "C,S,B,O, small shuffle. \n";
11341 dbgs() << "[";
11342 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11343 dbgs() << "]\n");
11344 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11345 /*TryToFindDuplicates=*/true,
11346 /*TrySplitVectorize=*/true);
11347 }
11348 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11349 dbgs() << "[";
11350 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11351 dbgs() << "]\n");
11352 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11353 }
11354
11355 // Don't vectorize ephemeral values.
11356 if (S && !EphValues.empty()) {
11357 for (Value *V : VL) {
11358 if (EphValues.count(V)) {
11359 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11360 << ") is ephemeral.\n");
11361 // Do not try to pack to avoid extra instructions here.
11362 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11363 /*TryToFindDuplicates=*/false);
11364 }
11365 }
11366 }
11367
11368 // We now know that this is a vector of instructions of the same type from
11369 // the same block.
11370
11371 // Check that none of the instructions in the bundle are already in the tree
11372 // and whether the node may be unprofitable to vectorize as a small
11373 // alternate node.
11374 if (S && S.isAltShuffle()) {
11375 auto GetNumVectorizedExtracted = [&]() {
11376 APInt Extracted = APInt::getZero(VL.size());
11377 APInt Vectorized = APInt::getAllOnes(VL.size());
11378 for (auto [Idx, V] : enumerate(VL)) {
11379 auto *I = dyn_cast<Instruction>(V);
11380 if (!I || doesNotNeedToBeScheduled(I) ||
11381 all_of(I->operands(), [&](const Use &U) {
11382 return isa<ExtractElementInst>(U.get());
11383 }))
11384 continue;
11385 if (isVectorized(I))
11386 Vectorized.clearBit(Idx);
11387 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11388 Extracted.setBit(Idx);
11389 }
11390 return std::make_pair(Vectorized, Extracted);
11391 };
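 // Estimate whether building this small alternate node pays off, given how
 // many of its scalars are already vectorized elsewhere and would have to be
 // extracted.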
11392 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11393 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11394 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11395 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11396 // Rough cost estimation, if the vector code (+ potential extracts) is
11397 // more profitable than the scalar + buildvector.
11398 Type *ScalarTy = VL.front()->getType();
11399 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11400 InstructionCost VectorizeCostEstimate =
11401 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11402 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11403 /*Insert=*/false, /*Extract=*/true, Kind);
11404 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11405 *TTI, ScalarTy, VecTy, Vectorized,
11406 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11407 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11408 }
11409 if (PreferScalarize) {
11410 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11411 "node is not profitable.\n");
11412 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11413 }
11414 }
11415
11416 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11417 if (UserIgnoreList && !UserIgnoreList->empty()) {
11418 for (Value *V : VL) {
11419 if (UserIgnoreList->contains(V)) {
11420 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11421 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11422 }
11423 }
11424 }
11425
11426 // Special processing for sorted pointers for ScatterVectorize node with
11427 // constant indices only.
11428 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11429 assert(VL.front()->getType()->isPointerTy() &&
11430 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
11431 "Expected pointers only.");
11432 // Reset S to make it GetElementPtr kind of node.
11433 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11434 assert(It != VL.end() && "Expected at least one GEP.");
11435 S = getSameOpcode(*It, *TLI);
11436 }
11437
11438 // Check that all of the users of the scalars that we want to vectorize are
11439 // schedulable.
11440 Instruction *VL0 = S.getMainOp();
11441 BB = VL0->getParent();
11442
11443 if (S &&
11444 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11445 !DT->isReachableFromEntry(BB))) {
11446 // Don't go into unreachable blocks. They may contain instructions with
11447 // dependency cycles which confuse the final scheduling.
11448 // Do not vectorize EH and non-returning blocks, not profitable in most
11449 // cases.
11450 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11451 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11452 }
11453 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11454}
11455
11456void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11457 const EdgeInfo &UserTreeIdx,
11458 unsigned InterleaveFactor) {
11459 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11460
11461 SmallVector<int> ReuseShuffleIndices;
11462 SmallVector<Value *> VL(VLRef);
11463
11464 // Tries to build split node.
11465 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11466 SmallVector<Value *> Op1, Op2;
11467 OrdersType ReorderIndices;
11468 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11469 return false;
11470
11471 auto Invalid = ScheduleBundle::invalid();
11472 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11473 UserTreeIdx, {}, ReorderIndices);
11474 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
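 // Build the two operand sub-nodes: loads and sequences that already match an
 // existing tree entry become gather nodes, everything else is built
 // recursively.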
11475 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11476 InstructionsState S = getSameOpcode(Op, *TLI);
11477 if (S && (isa<LoadInst>(S.getMainOp()) ||
11478 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11479 // Build gather node for loads, they will be gathered later.
11480 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11481 Idx == 0 ? 0 : Op1.size());
11482 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11483 } else {
11484 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11485 Idx == 0 ? 0 : Op1.size());
11486 buildTreeRec(Op, Depth, {TE, Idx});
11487 }
11488 };
11489 AddNode(Op1, 0);
11490 AddNode(Op2, 1);
11491 return true;
11492 };
11493
11494 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11495 bool AreConsts = false;
11496 for (Value *V : VL) {
11497 if (isa<PoisonValue>(V))
11498 continue;
11499 if (isa<Constant>(V)) {
11500 AreConsts = true;
11501 continue;
11502 }
11503 if (!isa<PHINode>(V))
11504 return false;
11505 }
11506 return AreConsts;
11507 };
11508 if (AreOnlyConstsWithPHIs(VL)) {
11509 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11510 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11511 return;
11512 }
11513
11514 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11515 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11516 InstructionsState S = Legality.getInstructionsState();
11517 if (!Legality.isLegal()) {
11518 if (Legality.trySplitVectorize()) {
11519 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11520 // Last chance to try to vectorize alternate node.
11521 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11522 return;
11523 }
11524 if (!S)
11525 Legality = getScalarsVectorizationLegality(
11526 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11527 if (!Legality.isLegal()) {
11528 if (Legality.tryToFindDuplicates())
11529 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11530 UserTreeIdx);
11531
11532 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11533 return;
11534 }
11535 S = Legality.getInstructionsState();
11536 }
11537
11538 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11539 if (S.isAltShuffle() && TrySplitNode(S))
11540 return;
11541
11542 // Check that every instruction appears once in this bundle.
11543 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11544 /*TryPad=*/true)) {
11545 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11546 return;
11547 }
11548
11549 // Perform specific checks for each particular instruction kind.
11550 bool IsScatterVectorizeUserTE =
11551 UserTreeIdx.UserTE &&
11552 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11553 OrdersType CurrentOrder;
11554 SmallVector<Value *> PointerOps;
11555 StridedPtrInfo SPtrInfo;
11556 TreeEntry::EntryState State = getScalarsVectorizationState(
11557 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11558 if (State == TreeEntry::NeedToGather) {
11559 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11560 return;
11561 }
11562
11563 Instruction *VL0 = S.getMainOp();
11564 BasicBlock *BB = VL0->getParent();
11565 auto &BSRef = BlocksSchedules[BB];
11566 if (!BSRef)
11567 BSRef = std::make_unique<BlockScheduling>(BB);
11568
11569 BlockScheduling &BS = *BSRef;
11570
11571 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11572 std::optional<ScheduleBundle *> BundlePtr =
11573 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11574#ifdef EXPENSIVE_CHECKS
11575 // Make sure we didn't break any internal invariants
11576 BS.verify();
11577#endif
11578 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11579 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11580 // Last chance to try to vectorize alternate node.
11581 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11582 return;
11583 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11584 NonScheduledFirst.insert(VL.front());
11585 if (S.getOpcode() == Instruction::Load &&
11586 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11587 registerNonVectorizableLoads(ArrayRef(VL));
11588 return;
11589 }
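 // The bundle can be scheduled; build the operand lists and create the tree
 // entry that matches the instruction kind below.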
11590 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11591 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11592 ScheduleBundle Empty;
11593 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11594 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11595
11596 unsigned ShuffleOrOp =
11597 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11598 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11599 // Postpone PHI nodes creation
11600 SmallVector<unsigned> PHIOps;
11601 for (unsigned I : seq<unsigned>(Operands.size())) {
11602 ArrayRef<Value *> Op = Operands[I];
11603 if (Op.empty())
11604 continue;
11605 InstructionsState S = getSameOpcode(Op, *TLI);
11606 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11607 buildTreeRec(Op, Depth + 1, {TE, I});
11608 else
11609 PHIOps.push_back(I);
11610 }
11611 for (unsigned I : PHIOps)
11612 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11613 };
11614 switch (ShuffleOrOp) {
11615 case Instruction::PHI: {
11616 TreeEntry *TE =
11617 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11618 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11619 TE->dump());
11620
11621 TE->setOperands(Operands);
11622 CreateOperandNodes(TE, Operands);
11623 return;
11624 }
11625 case Instruction::ExtractValue:
11626 case Instruction::ExtractElement: {
11627 if (CurrentOrder.empty()) {
11628 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11629 } else {
11630 LLVM_DEBUG({
11631 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11632 "with order";
11633 for (unsigned Idx : CurrentOrder)
11634 dbgs() << " " << Idx;
11635 dbgs() << "\n";
11636 });
11637 fixupOrderingIndices(CurrentOrder);
11638 }
11639 // Insert new order with initial value 0, if it does not exist,
11640 // otherwise return the iterator to the existing one.
11641 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11642 ReuseShuffleIndices, CurrentOrder);
11643 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11644 "(ExtractValueInst/ExtractElementInst).\n";
11645 TE->dump());
11646 // This is a special case, as it does not gather, but at the same time
11647 // we are not extending buildTreeRec() towards the operands.
11648 TE->setOperands(Operands);
11649 return;
11650 }
11651 case Instruction::InsertElement: {
11652 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11653
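 // Sort the inserts by their insertion index to form CurrentOrder; if the
 // indices are already in increasing order, keep the identity order.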
11654 auto OrdCompare = [](const std::pair<int, int> &P1,
11655 const std::pair<int, int> &P2) {
11656 return P1.first > P2.first;
11657 };
11658 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11659 decltype(OrdCompare)>
11660 Indices(OrdCompare);
11661 for (int I = 0, E = VL.size(); I < E; ++I) {
11662 unsigned Idx = *getElementIndex(VL[I]);
11663 Indices.emplace(Idx, I);
11664 }
11665 OrdersType CurrentOrder(VL.size(), VL.size());
11666 bool IsIdentity = true;
11667 for (int I = 0, E = VL.size(); I < E; ++I) {
11668 CurrentOrder[Indices.top().second] = I;
11669 IsIdentity &= Indices.top().second == I;
11670 Indices.pop();
11671 }
11672 if (IsIdentity)
11673 CurrentOrder.clear();
11674 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11675 {}, CurrentOrder);
11676 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11677 TE->dump());
11678
11679 TE->setOperands(Operands);
11680 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11681 return;
11682 }
11683 case Instruction::Load: {
11684 // Check that a vectorized load would load the same memory as a scalar
11685 // load. For example, we don't want to vectorize loads that are smaller
11686 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
11687 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11688 // from such a struct, we read/write packed bits disagreeing with the
11689 // unvectorized version.
11690 TreeEntry *TE = nullptr;
11691 fixupOrderingIndices(CurrentOrder);
11692 switch (State) {
11693 case TreeEntry::Vectorize:
11694 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11695 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11696 if (CurrentOrder.empty())
11697 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11698 TE->dump());
11699 else
11700 LLVM_DEBUG(dbgs()
11701 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11702 TE->dump());
11703 break;
11704 case TreeEntry::CompressVectorize:
11705 // Vectorizing non-consecutive loads with (masked)load + compress.
11706 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11707 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11708 LLVM_DEBUG(
11709 dbgs()
11710 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11711 TE->dump());
11712 break;
11713 case TreeEntry::StridedVectorize:
11714 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11715 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11716 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11717 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11718 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11719 TE->dump());
11720 break;
11721 case TreeEntry::ScatterVectorize:
11722 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11723 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11724 UserTreeIdx, ReuseShuffleIndices);
11725 LLVM_DEBUG(
11726 dbgs()
11727 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11728 TE->dump());
11729 break;
11730 case TreeEntry::CombinedVectorize:
11731 case TreeEntry::SplitVectorize:
11732 case TreeEntry::NeedToGather:
11733 llvm_unreachable("Unexpected loads state.");
11734 }
11735 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11736 assert(Operands.size() == 1 && "Expected a single operand only");
11737 SmallVector<int> Mask;
11738 inversePermutation(CurrentOrder, Mask);
11739 reorderScalars(Operands.front(), Mask);
11740 }
11741 TE->setOperands(Operands);
11742 if (State == TreeEntry::ScatterVectorize)
11743 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11744 return;
11745 }
11746 case Instruction::ZExt:
11747 case Instruction::SExt:
11748 case Instruction::FPToUI:
11749 case Instruction::FPToSI:
11750 case Instruction::FPExt:
11751 case Instruction::PtrToInt:
11752 case Instruction::IntToPtr:
11753 case Instruction::SIToFP:
11754 case Instruction::UIToFP:
11755 case Instruction::Trunc:
11756 case Instruction::FPTrunc:
11757 case Instruction::BitCast: {
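 // Track the widest and narrowest integer widths seen across cast nodes:
 // zext/sext contribute their destination/source widths, trunc the reverse.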
11758 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11759 std::make_pair(std::numeric_limits<unsigned>::min(),
11760 std::numeric_limits<unsigned>::max()));
11761 if (ShuffleOrOp == Instruction::ZExt ||
11762 ShuffleOrOp == Instruction::SExt) {
11763 CastMaxMinBWSizes = std::make_pair(
11764 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11765 PrevMaxBW),
11766 std::min<unsigned>(
11767 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11768 PrevMinBW));
11769 } else if (ShuffleOrOp == Instruction::Trunc) {
11770 CastMaxMinBWSizes = std::make_pair(
11771 std::max<unsigned>(
11772 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11773 PrevMaxBW),
11774 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11775 PrevMinBW));
11776 }
11777 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11778 ReuseShuffleIndices);
11779 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11780 TE->dump());
11781
11782 TE->setOperands(Operands);
11783 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11784 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11785 if (ShuffleOrOp == Instruction::Trunc) {
11786 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11787 } else if (ShuffleOrOp == Instruction::SIToFP ||
11788 ShuffleOrOp == Instruction::UIToFP) {
11789 unsigned NumSignBits =
11790 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11791 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11792 APInt Mask = DB->getDemandedBits(OpI);
11793 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11794 }
11795 if (NumSignBits * 2 >=
11796 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11797 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11798 }
11799 return;
11800 }
11801 case Instruction::ICmp:
11802 case Instruction::FCmp: {
11803 // Check that all of the compares have the same predicate.
11804 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11805 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11806 ReuseShuffleIndices);
11807 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11808 TE->dump());
11809
11810 VLOperands Ops(VL, Operands, S, *this);
11811 if (cast<CmpInst>(VL0)->isCommutative()) {
11812 // Commutative predicate - collect + sort operands of the instructions
11813 // so that each side is more likely to have the same opcode.
11814 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
11815 "Commutative Predicate mismatch");
11816 Ops.reorder();
11817 Operands.front() = Ops.getVL(0);
11818 Operands.back() = Ops.getVL(1);
11819 } else {
11820 // Collect operands - commute if it uses the swapped predicate.
11821 for (auto [Idx, V] : enumerate(VL)) {
11822 if (isa<PoisonValue>(V))
11823 continue;
11824 auto *Cmp = cast<CmpInst>(V);
11825 if (Cmp->getPredicate() != P0)
11826 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11827 }
11828 }
11829 TE->setOperands(Operands);
11830 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11831 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
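 // For integer compares, operands with at least half of their bits known to be
 // sign bits are recorded in ExtraBitWidthNodes for later bit-width shrinking.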
11832 if (ShuffleOrOp == Instruction::ICmp) {
11833 unsigned NumSignBits0 =
11834 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11835 if (NumSignBits0 * 2 >=
11836 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11837 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11838 unsigned NumSignBits1 =
11839 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11840 if (NumSignBits1 * 2 >=
11841 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11842 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11843 }
11844 return;
11845 }
11846 case Instruction::Select:
11847 case Instruction::FNeg:
11848 case Instruction::Add:
11849 case Instruction::FAdd:
11850 case Instruction::Sub:
11851 case Instruction::FSub:
11852 case Instruction::Mul:
11853 case Instruction::FMul:
11854 case Instruction::UDiv:
11855 case Instruction::SDiv:
11856 case Instruction::FDiv:
11857 case Instruction::URem:
11858 case Instruction::SRem:
11859 case Instruction::FRem:
11860 case Instruction::Shl:
11861 case Instruction::LShr:
11862 case Instruction::AShr:
11863 case Instruction::And:
11864 case Instruction::Or:
11865 case Instruction::Xor:
11866 case Instruction::Freeze: {
11867 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11868 ReuseShuffleIndices);
11869 LLVM_DEBUG(
11870 dbgs() << "SLP: added a new TreeEntry "
11871 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11872 TE->dump());
11873
11874 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11875 VLOperands Ops(VL, Operands, S, *this);
11876 Ops.reorder();
11877 Operands[0] = Ops.getVL(0);
11878 Operands[1] = Ops.getVL(1);
11879 }
11880 TE->setOperands(Operands);
11881 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11882 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11883 return;
11884 }
11885 case Instruction::GetElementPtr: {
11886 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11887 ReuseShuffleIndices);
11888 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11889 TE->dump());
11890 TE->setOperands(Operands);
11891
11892 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11893 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11894 return;
11895 }
11896 case Instruction::Store: {
11897 bool Consecutive = CurrentOrder.empty();
11898 if (!Consecutive)
11899 fixupOrderingIndices(CurrentOrder);
11900 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11901 ReuseShuffleIndices, CurrentOrder);
11902 if (Consecutive)
11903 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11904 TE->dump());
11905 else
11906 LLVM_DEBUG(
11907 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11908 TE->dump());
11909 TE->setOperands(Operands);
11910 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11911 return;
11912 }
11913 case Instruction::Call: {
11914 // Check if the calls are all to the same vectorizable intrinsic or
11915 // library function.
11916 CallInst *CI = cast<CallInst>(VL0);
11917 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11918
11919 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11920 ReuseShuffleIndices);
11921 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11922 TE->dump());
11923 if (isCommutative(VL0)) {
11924 VLOperands Ops(VL, Operands, S, *this);
11925 Ops.reorder();
11926 Operands[0] = Ops.getVL(0);
11927 Operands[1] = Ops.getVL(1);
11928 }
11929 TE->setOperands(Operands);
11930 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11931 // For scalar operands there is no need to create an entry since they do
11932 // not need to be vectorized.
11933 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
11934 continue;
11935 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11936 }
11937 return;
11938 }
11939 case Instruction::ShuffleVector: {
11940 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11941 ReuseShuffleIndices);
11942 if (S.isAltShuffle()) {
11943 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11944 TE->dump());
11945 } else {
11946 assert(SLPReVec && "Only supported by REVEC.");
11947 LLVM_DEBUG(
11948 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11949 TE->dump());
11950 }
11951
11952 // Reorder operands if reordering would enable vectorization.
11953 auto *CI = dyn_cast<CmpInst>(VL0);
11954 if (CI && any_of(VL, [](Value *V) {
11955 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11956 })) {
11957 auto *MainCI = cast<CmpInst>(S.getMainOp());
11958 auto *AltCI = cast<CmpInst>(S.getAltOp());
11959 CmpInst::Predicate MainP = MainCI->getPredicate();
11960 CmpInst::Predicate AltP = AltCI->getPredicate();
11961 assert(MainP != AltP &&
11962 "Expected different main/alternate predicates.");
11963 // Collect operands - commute if it uses the swapped predicate or
11964 // alternate operation.
11965 for (auto [Idx, V] : enumerate(VL)) {
11966 if (isa<PoisonValue>(V))
11967 continue;
11968 auto *Cmp = cast<CmpInst>(V);
11969
11970 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11971 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11972 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11973 } else {
11974 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11975 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11976 }
11977 }
11978 TE->setOperands(Operands);
11979 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11980 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11981 return;
11982 }
11983
11984 if (isa<BinaryOperator>(VL0) || CI) {
11985 VLOperands Ops(VL, Operands, S, *this);
11986 Ops.reorder();
11987 Operands[0] = Ops.getVL(0);
11988 Operands[1] = Ops.getVL(1);
11989 }
11990 TE->setOperands(Operands);
11991 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11992 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11993 return;
11994 }
11995 default:
11996 break;
11997 }
11998 llvm_unreachable("Unexpected vectorization of the instructions.");
11999}
12000
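/// Returns the number of scalar elements that the homogeneous aggregate or
/// vector type \p T maps to for vectorization, or 0 if it cannot be mapped
/// (non-homogeneous struct, invalid element type, or a total size that does
/// not fit a vector register).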
12001unsigned BoUpSLP::canMapToVector(Type *T) const {
12002 unsigned N = 1;
12003 Type *EltTy = T;
12004
12005 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12006 if (EltTy->isEmptyTy())
12007 return 0;
12008 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12009 // Check that struct is homogeneous.
12010 for (const auto *Ty : ST->elements())
12011 if (Ty != *ST->element_begin())
12012 return 0;
12013 N *= ST->getNumElements();
12014 EltTy = *ST->element_begin();
12015 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12016 N *= AT->getNumElements();
12017 EltTy = AT->getElementType();
12018 } else {
12019 auto *VT = cast<FixedVectorType>(EltTy);
12020 N *= VT->getNumElements();
12021 EltTy = VT->getElementType();
12022 }
12023 }
12024
12025 if (!isValidElementType(EltTy))
12026 return 0;
12027 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12028 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12029 VTSize != DL->getTypeStoreSizeInBits(T))
12030 return 0;
12031 return N;
12032}
12033
12034bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12035 SmallVectorImpl<unsigned> &CurrentOrder,
12036 bool ResizeAllowed) const {
12038 assert(It != VL.end() && "Expected at least one extract instruction.");
12039 auto *E0 = cast<Instruction>(*It);
12040 assert(
12042 "Invalid opcode");
12043 // Check if all of the extracts come from the same vector and from the
12044 // correct offset.
12045 Value *Vec = E0->getOperand(0);
12046
12047 CurrentOrder.clear();
12048
12049 // We have to extract from a vector/aggregate with the same number of elements.
12050 unsigned NElts;
12051 if (E0->getOpcode() == Instruction::ExtractValue) {
12052 NElts = canMapToVector(Vec->getType());
12053 if (!NElts)
12054 return false;
12055 // Check if load can be rewritten as load of vector.
12056 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12057 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12058 return false;
12059 } else {
12060 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12061 }
12062
12063 unsigned E = VL.size();
12064 if (!ResizeAllowed && NElts != E)
12065 return false;
12067 unsigned MinIdx = NElts, MaxIdx = 0;
12068 for (auto [I, V] : enumerate(VL)) {
12069 auto *Inst = dyn_cast<Instruction>(V);
12070 if (!Inst)
12071 continue;
12072 if (Inst->getOperand(0) != Vec)
12073 return false;
12074 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12075 if (isa<UndefValue>(EE->getIndexOperand()))
12076 continue;
12077 std::optional<unsigned> Idx = getExtractIndex(Inst);
12078 if (!Idx)
12079 return false;
12080 const unsigned ExtIdx = *Idx;
12081 if (ExtIdx >= NElts)
12082 continue;
12083 Indices[I] = ExtIdx;
12084 if (MinIdx > ExtIdx)
12085 MinIdx = ExtIdx;
12086 if (MaxIdx < ExtIdx)
12087 MaxIdx = ExtIdx;
12088 }
12089 if (MaxIdx - MinIdx + 1 > E)
12090 return false;
12091 if (MaxIdx + 1 <= E)
12092 MinIdx = 0;
12093
12094 // Check that all of the indices extract from the correct offset.
12095 bool ShouldKeepOrder = true;
12096 // Assign to all items the initial value E so we can check if the extract
12097 // instruction index was used already.
12098 // Also, later we can check that all the indices are used and we have a
12099 // consecutive access in the extract instructions, by checking that no
12100 // element of CurrentOrder still has value E.
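// E.g., for 4 extracts with indices <1, 0, 3, 2> (MinIdx == 0), CurrentOrder
// becomes <1, 0, 3, 2>; the original extract order is not kept, so the
// permutation is reported to the caller via CurrentOrder.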
12101 CurrentOrder.assign(E, E);
12102 for (unsigned I = 0; I < E; ++I) {
12103 if (Indices[I] == PoisonMaskElem)
12104 continue;
12105 const unsigned ExtIdx = Indices[I] - MinIdx;
12106 if (CurrentOrder[ExtIdx] != E) {
12107 CurrentOrder.clear();
12108 return false;
12109 }
12110 ShouldKeepOrder &= ExtIdx == I;
12111 CurrentOrder[ExtIdx] = I;
12112 }
12113 if (ShouldKeepOrder)
12114 CurrentOrder.clear();
12115
12116 return ShouldKeepOrder;
12117}
12118
12119bool BoUpSLP::areAllUsersVectorized(
12120 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12121 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12122 all_of(I->users(), [this](User *U) {
12123 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12124 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12125 });
12126}
12127
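// Builds the blend mask between the main-op and the alternate-op vectors:
// lane I selects Idx from the main vector or Sz + Idx from the alternate one.
// E.g., for Scalars = {add, sub, add, sub} (Sz == 4) with IsAltOp matching the
// subs and no reordering, the resulting Mask is <0, 4 + 1, 2, 4 + 3>.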
12128void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12129 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12130 SmallVectorImpl<Value *> *OpScalars,
12131 SmallVectorImpl<Value *> *AltScalars) const {
12132 unsigned Sz = Scalars.size();
12133 Mask.assign(Sz, PoisonMaskElem);
12134 SmallVector<int> OrderMask;
12135 if (!ReorderIndices.empty())
12136 inversePermutation(ReorderIndices, OrderMask);
12137 for (unsigned I = 0; I < Sz; ++I) {
12138 unsigned Idx = I;
12139 if (!ReorderIndices.empty())
12140 Idx = OrderMask[I];
12141 if (isa<PoisonValue>(Scalars[Idx]))
12142 continue;
12143 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12144 if (IsAltOp(OpInst)) {
12145 Mask[I] = Sz + Idx;
12146 if (AltScalars)
12147 AltScalars->push_back(OpInst);
12148 } else {
12149 Mask[I] = Idx;
12150 if (OpScalars)
12151 OpScalars->push_back(OpInst);
12152 }
12153 }
12154 if (!ReuseShuffleIndices.empty()) {
12155 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12156 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12157 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12158 });
12159 Mask.swap(NewMask);
12160 }
12161}
12162
12164 Instruction *AltOp,
12165 const TargetLibraryInfo &TLI) {
12166 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12167}
12168
12170 Instruction *AltOp,
12171 const TargetLibraryInfo &TLI) {
12172 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12173 auto *AltCI = cast<CmpInst>(AltOp);
12174 CmpInst::Predicate MainP = MainCI->getPredicate();
12175 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12176 assert(MainP != AltP && "Expected different main/alternate predicates.");
12177 auto *CI = cast<CmpInst>(I);
12178 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12179 return false;
12180 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12181 return true;
12182 CmpInst::Predicate P = CI->getPredicate();
12184
12185 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12186 "CmpInst expected to match either main or alternate predicate or "
12187 "their swap.");
12188 return MainP != P && MainP != SwappedP;
12189 }
12190 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12191}
12192
12193TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12194 assert(!Ops.empty());
12195 const auto *Op0 = Ops.front();
12196
12197 const bool IsConstant = all_of(Ops, [](Value *V) {
12198 // TODO: We should allow undef elements here
12199 return isConstant(V) && !isa<UndefValue>(V);
12200 });
12201 const bool IsUniform = all_of(Ops, [=](Value *V) {
12202 // TODO: We should allow undef elements here
12203 return V == Op0;
12204 });
12205 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12206 // TODO: We should allow undef elements here
12207 if (auto *CI = dyn_cast<ConstantInt>(V))
12208 return CI->getValue().isPowerOf2();
12209 return false;
12210 });
12211 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12212 // TODO: We should allow undef elements here
12213 if (auto *CI = dyn_cast<ConstantInt>(V))
12214 return CI->getValue().isNegatedPowerOf2();
12215 return false;
12216 });
12217
12219 if (IsConstant && IsUniform)
12221 else if (IsConstant)
12223 else if (IsUniform)
12225
12227 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12228 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12229
12230 return {VK, VP};
12231}
12232
12233namespace {
12234/// The base class for shuffle instruction emission and shuffle cost estimation.
12235class BaseShuffleAnalysis {
12236protected:
12237 Type *ScalarTy = nullptr;
12238
12239 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12240
12241 /// V is expected to be a vectorized value.
12242 /// When REVEC is disabled, there is no difference between VF and
12243 /// VNumElements.
12244 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12245 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12246 /// of 8.
12247 unsigned getVF(Value *V) const {
12248 assert(V && "V cannot be nullptr");
12249 assert(isa<FixedVectorType>(V->getType()) &&
12250 "V does not have FixedVectorType");
12251 assert(ScalarTy && "ScalarTy cannot be nullptr");
12252 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12253 unsigned VNumElements =
12254 cast<FixedVectorType>(V->getType())->getNumElements();
12255 assert(VNumElements > ScalarTyNumElements &&
12256 "the number of elements of V is not large enough");
12257 assert(VNumElements % ScalarTyNumElements == 0 &&
12258 "the number of elements of V is not a vectorized value");
12259 return VNumElements / ScalarTyNumElements;
12260 }
12261
12262 /// Checks if the mask is an identity mask.
12263 /// \param IsStrict if true, the function returns false if the mask size does
12264 /// not match the vector size.
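/// E.g., mask <0, 1, 2, 3> on a 4-element vector is an identity mask; with
/// \p IsStrict == false, mask <0, 1> (an extract of the low subvector) is
/// also accepted.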
12265 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12266 bool IsStrict) {
12267 int Limit = Mask.size();
12268 int VF = VecTy->getNumElements();
12269 int Index = -1;
12270 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12271 return true;
12272 if (!IsStrict) {
12273 // Consider extract subvector starting from index 0.
12274 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12275 Index == 0)
12276 return true;
12277 // All VF-size submasks are identity (e.g.
12278 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12279 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12280 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12281 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12283 }))
12284 return true;
12285 }
12286 return false;
12287 }
12288
12289 /// Tries to combine 2 different masks into a single one.
12290 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12291 /// change the size of the vector, \p LocalVF is the original size of the
12292 /// shuffled vector.
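/// E.g., with \p LocalVF == 4, Mask == <2, 3, 0, 1> and ExtMask ==
/// <1, 0, poison, 2>, the combined mask stored back into \p Mask is
/// <3, 2, poison, 0>.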
12293 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12294 ArrayRef<int> ExtMask) {
12295 unsigned VF = Mask.size();
12296 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12297 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12298 if (ExtMask[I] == PoisonMaskElem)
12299 continue;
12300 int MaskedIdx = Mask[ExtMask[I] % VF];
12301 NewMask[I] =
12302 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12303 }
12304 Mask.swap(NewMask);
12305 }
12306
12307 /// Looks through shuffles trying to reduce the final number of shuffles in the
12308 /// code. The function looks through the previously emitted shuffle
12309 /// instructions and properly marks indices in the mask as undef.
12310 /// For example, given the code
12311 /// \code
12312 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12313 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12314 /// \endcode
12315 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12316 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12317 /// <0, 1, 2, 3> for the shuffle.
12318 /// If 2 operands are of different size, the smallest one will be resized and
12319 /// the mask recalculated properly.
12320 /// For example, given the code
12321 /// \code
12322 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12323 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12324 /// \endcode
12325 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12326 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12327 /// <0, 1, 2, 3> for the shuffle.
12328 /// So, it tries to transform permutations to simple vector merge, if
12329 /// possible.
12330 /// \param V The input vector which must be shuffled using the given \p Mask.
12331 /// If the better candidate is found, \p V is set to this best candidate
12332 /// vector.
12333 /// \param Mask The input mask for the shuffle. If the best candidate is found
12334 /// during looking-through-shuffles attempt, it is updated accordingly.
12335 /// \param SinglePermute true if the shuffle operation is originally a
12336 /// single-value-permutation. In this case the look-through-shuffles procedure
12337 /// may look for resizing shuffles as the best candidates.
12338 /// \return true if the shuffle results in the non-resizing identity shuffle
12339 /// (and thus can be ignored), false otherwise.
12340 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12341 bool SinglePermute) {
12342 Value *Op = V;
12343 ShuffleVectorInst *IdentityOp = nullptr;
12344 SmallVector<int> IdentityMask;
12345 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12346 // Exit if this is not a fixed vector type or a size-changing shuffle.
12347 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12348 if (!SVTy)
12349 break;
12350 // Remember the identity or broadcast mask, if it is not a resizing
12351 // shuffle. If no better candidates are found, this Op and Mask will be
12352 // used in the final shuffle.
12353 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12354 if (!IdentityOp || !SinglePermute ||
12355 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12357 IdentityMask.size()))) {
12358 IdentityOp = SV;
12359 // Store the current mask in IdentityMask so that we do not lose this
12360 // info later, if IdentityOp is selected as the best candidate for the
12361 // permutation.
12362 IdentityMask.assign(Mask);
12363 }
12364 }
12365 // Remember the broadcast mask. If no better candidates are found, this Op
12366 // and Mask will be used in the final shuffle.
12367 // Zero splat can be used as identity too, since it might be used with
12368 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12369 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12370 // expensive, and the analysis finds out that the source vector is just a
12371 // broadcast, this original mask can be transformed to the identity mask
12372 // <0, 1, 2, 3>.
12373 // \code
12374 // %0 = shuffle %v, poison, zeroinitializer
12375 // %res = shuffle %0, poison, <3, 1, 2, 0>
12376 // \endcode
12377 // may be transformed to
12378 // \code
12379 // %0 = shuffle %v, poison, zeroinitializer
12380 // %res = shuffle %0, poison, <0, 1, 2, 3>
12381 // \endcode
12382 if (SV->isZeroEltSplat()) {
12383 IdentityOp = SV;
12384 IdentityMask.assign(Mask);
12385 }
12386 int LocalVF = Mask.size();
12387 if (auto *SVOpTy =
12388 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12389 LocalVF = SVOpTy->getNumElements();
12390 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12391 for (auto [Idx, I] : enumerate(Mask)) {
12392 if (I == PoisonMaskElem ||
12393 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12394 continue;
12395 ExtMask[Idx] = SV->getMaskValue(I);
12396 }
12397 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12398 SV->getOperand(0),
12399 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12400 .all();
12401 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12402 SV->getOperand(1),
12403 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12404 .all();
12405 if (!IsOp1Undef && !IsOp2Undef) {
12406 // Update mask and mark undef elems.
12407 for (int &I : Mask) {
12408 if (I == PoisonMaskElem)
12409 continue;
12410 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12412 I = PoisonMaskElem;
12413 }
12414 break;
12415 }
12416 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12417 combineMasks(LocalVF, ShuffleMask, Mask);
12418 Mask.swap(ShuffleMask);
12419 if (IsOp2Undef)
12420 Op = SV->getOperand(0);
12421 else
12422 Op = SV->getOperand(1);
12423 }
12424 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12425 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12427 if (IdentityOp) {
12428 V = IdentityOp;
12429 assert(Mask.size() == IdentityMask.size() &&
12430 "Expected masks of same sizes.");
12431 // Clear known poison elements.
12432 for (auto [I, Idx] : enumerate(Mask))
12433 if (Idx == PoisonMaskElem)
12434 IdentityMask[I] = PoisonMaskElem;
12435 Mask.swap(IdentityMask);
12436 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12437 return SinglePermute &&
12438 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12439 /*IsStrict=*/true) ||
12440 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12441 Shuffle->isZeroEltSplat() &&
12443 all_of(enumerate(Mask), [&](const auto &P) {
12444 return P.value() == PoisonMaskElem ||
12445 Shuffle->getShuffleMask()[P.index()] == 0;
12446 })));
12447 }
12448 V = Op;
12449 return false;
12450 }
12451 V = Op;
12452 return true;
12453 }
12454
12455 /// Smart shuffle instruction emission, walks through shuffles trees and
12456 /// tries to find the best matching vector for the actual shuffle
12457 /// instruction.
12458 template <typename T, typename ShuffleBuilderTy>
12459 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12460 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12461 assert(V1 && "Expected at least one vector value.");
12462 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12463 SmallVector<int> NewMask(Mask);
12464 if (ScalarTyNumElements != 1) {
12465 assert(SLPReVec && "FixedVectorType is not expected.");
12466 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12467 Mask = NewMask;
12468 }
12469 if (V2)
12470 Builder.resizeToMatch(V1, V2);
12471 int VF = Mask.size();
12472 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12473 VF = FTy->getNumElements();
12475 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12476 .all()) {
12477 // Peek through shuffles.
12478 Value *Op1 = V1;
12479 Value *Op2 = V2;
12480 int VF =
12481 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12482 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12483 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12484 for (int I = 0, E = Mask.size(); I < E; ++I) {
12485 if (Mask[I] < VF)
12486 CombinedMask1[I] = Mask[I];
12487 else
12488 CombinedMask2[I] = Mask[I] - VF;
12489 }
12490 Value *PrevOp1;
12491 Value *PrevOp2;
12492 do {
12493 PrevOp1 = Op1;
12494 PrevOp2 = Op2;
12495 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12496 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12497 // Check if we have 2 resizing shuffles - need to peek through operands
12498 // again.
12499 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12500 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12501 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12502 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12503 if (I == PoisonMaskElem)
12504 continue;
12505 ExtMask1[Idx] = SV1->getMaskValue(I);
12506 }
12507 SmallBitVector UseMask1 = buildUseMask(
12508 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12509 ->getNumElements(),
12510 ExtMask1, UseMask::SecondArg);
12511 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12512 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12513 if (I == PoisonMaskElem)
12514 continue;
12515 ExtMask2[Idx] = SV2->getMaskValue(I);
12516 }
12517 SmallBitVector UseMask2 = buildUseMask(
12518 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12519 ->getNumElements(),
12520 ExtMask2, UseMask::SecondArg);
12521 if (SV1->getOperand(0)->getType() ==
12522 SV2->getOperand(0)->getType() &&
12523 SV1->getOperand(0)->getType() != SV1->getType() &&
12524 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12525 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12526 Op1 = SV1->getOperand(0);
12527 Op2 = SV2->getOperand(0);
12528 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12529 int LocalVF = ShuffleMask1.size();
12530 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12531 LocalVF = FTy->getNumElements();
12532 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12533 CombinedMask1.swap(ShuffleMask1);
12534 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12535 LocalVF = ShuffleMask2.size();
12536 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12537 LocalVF = FTy->getNumElements();
12538 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12539 CombinedMask2.swap(ShuffleMask2);
12540 }
12541 }
12542 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12543 Builder.resizeToMatch(Op1, Op2);
12544 VF = std::max(cast<VectorType>(Op1->getType())
12545 ->getElementCount()
12546 .getKnownMinValue(),
12548 ->getElementCount()
12549 .getKnownMinValue());
12550 for (int I = 0, E = Mask.size(); I < E; ++I) {
12551 if (CombinedMask2[I] != PoisonMaskElem) {
12552 assert(CombinedMask1[I] == PoisonMaskElem &&
12553 "Expected undefined mask element");
12554 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12555 }
12556 }
12557 if (Op1 == Op2 &&
12558 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12559 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12561 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12562 ArrayRef(CombinedMask1))))
12563 return Builder.createIdentity(Op1);
12564 return Builder.createShuffleVector(
12565 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12566 CombinedMask1);
12567 }
12568 if (isa<PoisonValue>(V1))
12569 return Builder.createPoison(
12570 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12571 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12572 assert(V1 && "Expected non-null value after looking through shuffles.");
12573
12574 if (!IsIdentity)
12575 return Builder.createShuffleVector(V1, NewMask);
12576 return Builder.createIdentity(V1);
12577 }
12578
12579 /// Transforms mask \p CommonMask per the given \p Mask to produce the proper
12580 /// mask set after shuffle emission.
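/// E.g., with CommonMask == <5, 1, poison, 7> and Mask == <0, poison, 2, 3>,
/// lanes 0, 2 and 3 were produced by the emitted shuffle, so CommonMask
/// becomes <0, 1, 2, 3>.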
12581 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12582 ArrayRef<int> Mask) {
12583 for (unsigned I : seq<unsigned>(CommonMask.size()))
12584 if (Mask[I] != PoisonMaskElem)
12585 CommonMask[I] = I;
12586 }
12587};
12588} // namespace
12589
12590 /// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
12591static std::pair<InstructionCost, InstructionCost>
12593 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12594 Type *ScalarTy, VectorType *VecTy) {
12595 InstructionCost ScalarCost = 0;
12596 InstructionCost VecCost = 0;
12597 // Here we differentiate two cases: (1) when Ptrs represent a regular
12598 // vectorization tree node (as they are pointer arguments of scattered
12599 // loads) or (2) when Ptrs are the arguments of loads or stores being
12600 // vectorized as a plain wide unit-stride load/store, since all the
12601 // loads/stores are known to be from/to adjacent locations.
12602 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12603 // Case 2: estimate the pointer-related costs when vectorizing to
12604 // a wide load/store.
12605 // Scalar cost is estimated as a set of pointers with known relationship
12606 // between them.
12607 // For vector code we will use BasePtr as argument for the wide load/store
12608 // but we also need to account for all the instructions which are going to
12609 // stay in vectorized code due to uses outside of these scalar
12610 // loads/stores.
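// E.g., for 4 consecutive scalar loads sharing a common base, the scalar cost
// covers the 4 related address computations, while the vector cost accounts
// only for BasePtr plus any pointers that must stay in the vectorized code
// (non-GEPs or GEPs with extra uses).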
12611 ScalarCost = TTI.getPointersChainCost(
12612 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12613 CostKind);
12614
12615 SmallVector<const Value *> PtrsRetainedInVecCode;
12616 for (Value *V : Ptrs) {
12617 if (V == BasePtr) {
12618 PtrsRetainedInVecCode.push_back(V);
12619 continue;
12620 }
12622 // For simplicity, assume Ptr stays in vectorized code if it's not a
12623 // GEP instruction. We don't care, since its cost is considered free.
12624 // TODO: We should check for any uses outside of vectorizable tree
12625 // rather than just single use.
12626 if (!Ptr || !Ptr->hasOneUse())
12627 PtrsRetainedInVecCode.push_back(V);
12628 }
12629
12630 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12631 // If all pointers stay in vectorized code then we don't have
12632 // any savings on that.
12633 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12634 }
12635 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12636 TTI::PointersChainInfo::getKnownStride(),
12637 VecTy, CostKind);
12638 } else {
12639 // Case 1: Ptrs are the arguments of loads that we are going to transform
12640 // into a masked gather load intrinsic.
12641 // All the scalar GEPs will be removed as a result of vectorization.
12642 // For any external uses of some lanes, extractelement instructions will
12643 // be generated (whose cost is estimated separately).
12644 TTI::PointersChainInfo PtrsInfo =
12645 all_of(Ptrs,
12646 [](const Value *V) {
12648 return Ptr && !Ptr->hasAllConstantIndices();
12649 })
12650 ? TTI::PointersChainInfo::getUnknownStride()
12651 : TTI::PointersChainInfo::getKnownStride();
12652
12653 ScalarCost =
12654 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12655 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12656 if (!BaseGEP) {
12657 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12658 if (It != Ptrs.end())
12659 BaseGEP = cast<GEPOperator>(*It);
12660 }
12661 if (BaseGEP) {
12662 SmallVector<const Value *> Indices(BaseGEP->indices());
12663 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12664 BaseGEP->getPointerOperand(), Indices, VecTy,
12665 CostKind);
12666 }
12667 }
12668
12669 return std::make_pair(ScalarCost, VecCost);
12670}
12671
12672void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12673 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12674 "Expected gather node without reordering.");
12675 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12676 SmallSet<size_t, 2> LoadKeyUsed;
12677
12678 // Do not reorder the node if it is small (just 2 elements), all-constant, or
12679 // if all its instructions already have the same opcode.
12680 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12681 all_of(TE.Scalars, isConstant))
12682 return;
12683
12684 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12685 return VectorizableTree[Idx]->isSame(TE.Scalars);
12686 }))
12687 return;
12688
12689 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12690 Key = hash_combine(hash_value(LI->getParent()), Key);
12691 Value *Ptr =
12692 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12693 if (LoadKeyUsed.contains(Key)) {
12694 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12695 if (LIt != LoadsMap.end()) {
12696 for (LoadInst *RLI : LIt->second) {
12697 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12698 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12699 /*StrictCheck=*/true))
12700 return hash_value(RLI->getPointerOperand());
12701 }
12702 for (LoadInst *RLI : LIt->second) {
12704 LI->getPointerOperand(), *TLI)) {
12705 hash_code SubKey = hash_value(RLI->getPointerOperand());
12706 return SubKey;
12707 }
12708 }
12709 if (LIt->second.size() > 2) {
12710 hash_code SubKey =
12711 hash_value(LIt->second.back()->getPointerOperand());
12712 return SubKey;
12713 }
12714 }
12715 }
12716 LoadKeyUsed.insert(Key);
12717 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12718 return hash_value(LI->getPointerOperand());
12719 };
12720 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12721 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12722 bool IsOrdered = true;
12723 unsigned NumInstructions = 0;
12724 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12725 // nodes.
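// E.g., a gather of <load A, add X, load B, add Y> may be reordered to
// <load A, load B, add X, add Y>, so that the loads and the adds can form
// separate vectorizable sub-nodes (recorded in SubVectors below).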
12726 for (auto [I, V] : enumerate(TE.Scalars)) {
12727 size_t Key = 1, Idx = 1;
12728 if (auto *Inst = dyn_cast<Instruction>(V);
12730 !isDeleted(Inst) && !isVectorized(V)) {
12731 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12732 /*AllowAlternate=*/false);
12733 ++NumInstructions;
12734 }
12735 auto &Container = SortedValues[Key];
12736 if (IsOrdered && !KeyToIndex.contains(V) &&
12739 ((Container.contains(Idx) &&
12740 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12741 (!Container.empty() && !Container.contains(Idx) &&
12742 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12743 IsOrdered = false;
12744 auto &KTI = KeyToIndex[V];
12745 if (KTI.empty())
12746 Container[Idx].push_back(V);
12747 KTI.push_back(I);
12748 }
12750 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12751 if (!IsOrdered && NumInstructions > 1) {
12752 unsigned Cnt = 0;
12753 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12754 for (const auto &D : SortedValues) {
12755 for (const auto &P : D.second) {
12756 unsigned Sz = 0;
12757 for (Value *V : P.second) {
12758 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12759 for (auto [K, Idx] : enumerate(Indices)) {
12760 TE.ReorderIndices[Cnt + K] = Idx;
12761 TE.Scalars[Cnt + K] = V;
12762 }
12763 Sz += Indices.size();
12764 Cnt += Indices.size();
12765 }
12766 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12767 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12768 *TTI, TE.Scalars.front()->getType(), Sz);
12769 SubVectors.emplace_back(Cnt - Sz, SubVF);
12770 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12771 DemandedElts.clearBit(I);
12772 } else if (!P.second.empty() && isConstant(P.second.front())) {
12773 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12774 DemandedElts.clearBit(I);
12775 }
12776 }
12777 }
12778 }
12779 // Reuses always require shuffles, so consider it as profitable.
12780 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12781 return;
12782 // Do simple cost estimation.
12785 auto *ScalarTy = TE.Scalars.front()->getType();
12786 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12787 for (auto [Idx, Sz] : SubVectors) {
12789 Idx, getWidenedType(ScalarTy, Sz));
12790 }
12791 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12792 /*Insert=*/true,
12793 /*Extract=*/false, CostKind);
12794 int Sz = TE.Scalars.size();
12795 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12796 TE.ReorderIndices.end());
12797 for (unsigned I : seq<unsigned>(Sz)) {
12798 Value *V = TE.getOrdered(I);
12799 if (isa<PoisonValue>(V)) {
12800 ReorderMask[I] = PoisonMaskElem;
12801 } else if (isConstant(V) || DemandedElts[I]) {
12802 ReorderMask[I] = I + TE.ReorderIndices.size();
12803 }
12804 }
12805 Cost += ::getShuffleCost(*TTI,
12806 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12809 VecTy, ReorderMask);
12810 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12811 ReorderMask.assign(Sz, PoisonMaskElem);
12812 for (unsigned I : seq<unsigned>(Sz)) {
12813 Value *V = TE.getOrdered(I);
12814 if (isConstant(V)) {
12815 DemandedElts.clearBit(I);
12816 if (!isa<PoisonValue>(V))
12817 ReorderMask[I] = I;
12818 } else {
12819 ReorderMask[I] = I + Sz;
12820 }
12821 }
12822 InstructionCost BVCost =
12823 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12824 /*Insert=*/true, /*Extract=*/false, CostKind);
12825 if (!DemandedElts.isAllOnes())
12826 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12827 if (Cost >= BVCost) {
12828 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12829 reorderScalars(TE.Scalars, Mask);
12830 TE.ReorderIndices.clear();
12831 }
12832}
12833
12834 /// Check if we can convert a fadd/fsub sequence to FMA.
12835 /// \returns the cost of the FMA, if conversion is possible, invalid cost otherwise.
12837 const InstructionsState &S,
12838 DominatorTree &DT, const DataLayout &DL,
12840 const TargetLibraryInfo &TLI) {
12841 assert(all_of(VL,
12842 [](Value *V) {
12843 return V->getType()->getScalarType()->isFloatingPointTy();
12844 }) &&
12845 "Can only convert to FMA for floating point types");
12846 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12847
12848 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12849 FastMathFlags FMF;
12850 FMF.set();
12851 for (Value *V : VL) {
12852 auto *I = dyn_cast<Instruction>(V);
12853 if (!I)
12854 continue;
12855 if (S.isCopyableElement(I))
12856 continue;
12857 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12858 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12859 continue;
12860 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12861 FMF &= FPCI->getFastMathFlags();
12862 }
12863 return FMF.allowContract();
12864 };
12865 if (!CheckForContractable(VL))
12867 // The fmul operand should also be contractable.
12868 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12869 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12870
12871 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12872 if (!OpS.valid())
12874
12875 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12877 if (!CheckForContractable(Operands.front()))
12879 // Compare the costs.
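// E.g., for scalars of the form 'fadd contract (fmul contract %a, %b), %c' the
// cost of the fmul + fadd pair is compared against a single llvm.fmuladd
// call; the FMA cost is returned only if it is strictly cheaper.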
12880 InstructionCost FMulPlusFAddCost = 0;
12881 InstructionCost FMACost = 0;
12883 FastMathFlags FMF;
12884 FMF.set();
12885 for (Value *V : VL) {
12886 auto *I = dyn_cast<Instruction>(V);
12887 if (!I)
12888 continue;
12889 if (!S.isCopyableElement(I))
12890 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12891 FMF &= FPCI->getFastMathFlags();
12892 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12893 }
12894 unsigned NumOps = 0;
12895 for (auto [V, Op] : zip(VL, Operands.front())) {
12896 if (S.isCopyableElement(V))
12897 continue;
12898 auto *I = dyn_cast<Instruction>(Op);
12899 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12900 if (auto *OpI = dyn_cast<Instruction>(V))
12901 FMACost += TTI.getInstructionCost(OpI, CostKind);
12902 if (I)
12903 FMACost += TTI.getInstructionCost(I, CostKind);
12904 continue;
12905 }
12906 ++NumOps;
12907 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12908 FMF &= FPCI->getFastMathFlags();
12909 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12910 }
12911 Type *Ty = VL.front()->getType();
12912 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12913 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12914 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12915}
12916
12919 BaseGraphSize = VectorizableTree.size();
12920 // Turn graph transforming mode on, and turn it off when done.
12921 class GraphTransformModeRAAI {
12922 bool &SavedIsGraphTransformMode;
12923
12924 public:
12925 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12926 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12927 IsGraphTransformMode = true;
12928 }
12929 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12930 } TransformContext(IsGraphTransformMode);
12931 // Operands are profitable if they are:
12932 // 1. At least one constant
12933 // or
12934 // 2. Splats
12935 // or
12936 // 3. Results in good vectorization opportunity, i.e. may generate vector
12937 // nodes and reduce cost of the graph.
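// E.g., for 'add %x, 1' and 'add %y, 1' the second operands are both
// constants, so that operand pair trivially satisfies the check; the first
// operands (%x, %y) still need to form a good vectorization opportunity.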
12938 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12939 const InstructionsState &S) {
12941 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12942 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12943 I2->getOperand(Op));
12944 return all_of(
12945 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12946 return all_of(Cand,
12947 [](const std::pair<Value *, Value *> &P) {
12948 return isa<Constant>(P.first) ||
12949 isa<Constant>(P.second) || P.first == P.second;
12950 }) ||
12952 });
12953 };
12954
12955 // Try to reorder gather nodes for better vectorization opportunities.
12956 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12957 TreeEntry &E = *VectorizableTree[Idx];
12958 if (E.isGather())
12959 reorderGatherNode(E);
12960 }
12961
12962 // Better to use the full gathered-loads analysis, if there are only 2 gathered
12963 // load nodes, each having fewer than 16 elements.
12964 constexpr unsigned VFLimit = 16;
12965 bool ForceLoadGather =
12966 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12967 return TE->isGather() && TE->hasState() &&
12968 TE->getOpcode() == Instruction::Load &&
12969 TE->getVectorFactor() < VFLimit;
12970 }) == 2;
12971
12972 // Checks if the scalars are used in another node.
12973 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12974 function_ref<bool(Value *)> CheckContainer) {
12975 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12976 if (isa<PoisonValue>(V))
12977 return true;
12978 auto *I = dyn_cast<Instruction>(V);
12979 if (!I)
12980 return false;
12981 return is_contained(TE->Scalars, I) || CheckContainer(I);
12982 });
12983 };
12984 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12985 if (E.hasState()) {
12986 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12987 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12988 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12989 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12990 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12991 return is_contained(TEs, TE);
12992 });
12993 });
12994 }))
12995 return true;
12997 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12998 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12999 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13000 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13001 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13002 return is_contained(TEs, TE);
13003 });
13004 });
13005 }))
13006 return true;
13007 } else {
13008 // Check if the gather node is a full copy of a split node.
13009 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13010 if (It != E.Scalars.end()) {
13011 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13012 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13013 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13014 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13015 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13016 return is_contained(TEs, TE);
13017 });
13018 });
13019 }))
13020 return true;
13021 }
13022 }
13023 return false;
13024 };
13025 // The tree may grow here, so iterate over the nodes built before.
13026 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13027 TreeEntry &E = *VectorizableTree[Idx];
13028 if (E.isGather()) {
13029 ArrayRef<Value *> VL = E.Scalars;
13030 const unsigned Sz = getVectorElementSize(VL.front());
13031 unsigned MinVF = getMinVF(2 * Sz);
13032 // Do not try partial vectorization for small nodes (<= 2 elements), nodes with
13033 // the same opcode and same parent block, or all-constant nodes.
13034 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13035 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13036 // We use allSameOpcode instead of isAltShuffle because we don't
13037 // want to use interchangeable instructions here.
13038 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13039 allConstant(VL) || isSplat(VL))
13040 continue;
13041 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13042 continue;
13043 // Check if the node is a copy of other vector nodes.
13044 if (CheckForSameVectorNodes(E))
13045 continue;
13046 // Try to find vectorizable sequences and transform them into a series of
13047 // insertvector instructions.
13048 unsigned StartIdx = 0;
13049 unsigned End = VL.size();
13050 for (unsigned VF = getFloorFullVectorNumberOfElements(
13051 *TTI, VL.front()->getType(), VL.size() - 1);
13052 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13053 *TTI, VL.front()->getType(), VF - 1)) {
13054 if (StartIdx + VF > End)
13055 continue;
13057 bool AllStrided = true;
13058 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13059 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13060 // If any instruction is vectorized already - do not try again.
13061 // Reuse the existing node, if it fully matches the slice.
13062 if (isVectorized(Slice.front()) &&
13063 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13064 continue;
13065 // Constants are already handled effectively - skip.
13066 if (allConstant(Slice))
13067 continue;
13068 // Do not try to vectorize small splats (smaller than a vector register and
13069 // with only a single non-undef element).
13070 bool IsSplat = isSplat(Slice);
13071 bool IsTwoRegisterSplat = true;
13072 if (IsSplat && VF == 2) {
13073 unsigned NumRegs2VF = ::getNumberOfParts(
13074 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13075 IsTwoRegisterSplat = NumRegs2VF == 2;
13076 }
13077 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13078 count(Slice, Slice.front()) ==
13079 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13080 : 1)) {
13081 if (IsSplat)
13082 continue;
13083 InstructionsState S = getSameOpcode(Slice, *TLI);
13084 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13085 (S.getOpcode() == Instruction::Load &&
13087 (S.getOpcode() != Instruction::Load &&
13088 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13089 continue;
13090 if (VF == 2) {
13091 // Try to vectorize reduced values or if all users are vectorized.
13092 // For expensive instructions extra extracts might be profitable.
13093 if ((!UserIgnoreList || E.Idx != 0) &&
13094 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13096 !all_of(Slice, [&](Value *V) {
13097 if (isa<PoisonValue>(V))
13098 return true;
13099 return areAllUsersVectorized(cast<Instruction>(V),
13100 UserIgnoreList);
13101 }))
13102 continue;
13103 if (S.getOpcode() == Instruction::Load) {
13104 OrdersType Order;
13105 SmallVector<Value *> PointerOps;
13106 StridedPtrInfo SPtrInfo;
13107 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13108 PointerOps, SPtrInfo);
13109 AllStrided &= Res == LoadsState::StridedVectorize ||
13111 Res == LoadsState::Gather;
13112 // Do not vectorize gathers.
13113 if (Res == LoadsState::ScatterVectorize ||
13114 Res == LoadsState::Gather) {
13115 if (Res == LoadsState::Gather) {
13117 // If we are vectorizing a reduction and the scalars come from the
13118 // root node, mark them as non-vectorizable reduction values.
13119 if (UserIgnoreList && E.Idx == 0)
13120 analyzedReductionVals(Slice);
13121 }
13122 continue;
13123 }
13124 } else if (S.getOpcode() == Instruction::ExtractElement ||
13125 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13127 !CheckOperandsProfitability(
13128 S.getMainOp(),
13131 S))) {
13132 // Do not vectorize extractelements (handled effectively
13133 // already). Do not vectorize non-profitable instructions (with
13134 // low cost and non-vectorizable operands).
13135 continue;
13136 }
13137 }
13138 }
13139 Slices.emplace_back(Cnt, Slice.size());
13140 }
13141 // Do not try to vectorize if all slices are strided or gathered with
13142 // vector factor 2 and there are more than 2 slices. Better to handle them
13143 // in the gathered-loads analysis, which may result in better vectorization.
13144 if (VF == 2 && AllStrided && Slices.size() > 2)
13145 continue;
13146 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13147 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13148 if (StartIdx == Cnt)
13149 StartIdx = Cnt + Sz;
13150 if (End == Cnt + Sz)
13151 End = Cnt;
13152 };
13153 for (auto [Cnt, Sz] : Slices) {
13154 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13155 const TreeEntry *SameTE = nullptr;
13156 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13157 It != Slice.end()) {
13158 // If any instruction is vectorized already - do not try again.
13159 SameTE = getSameValuesTreeEntry(*It, Slice);
13160 }
13161 unsigned PrevSize = VectorizableTree.size();
13162 [[maybe_unused]] unsigned PrevEntriesSize =
13163 LoadEntriesToVectorize.size();
13164 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13165 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13166 VectorizableTree[PrevSize]->isGather() &&
13167 VectorizableTree[PrevSize]->hasState() &&
13168 VectorizableTree[PrevSize]->getOpcode() !=
13169 Instruction::ExtractElement &&
13170 !isSplat(Slice)) {
13171 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13172 analyzedReductionVals(Slice);
13173 VectorizableTree.pop_back();
13174 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13175 "LoadEntriesToVectorize expected to remain the same");
13176 continue;
13177 }
13178 AddCombinedNode(PrevSize, Cnt, Sz);
13179 }
13180 }
13181 // Restore ordering, if no extra vectorization happened.
13182 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13183 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13184 reorderScalars(E.Scalars, Mask);
13185 E.ReorderIndices.clear();
13186 }
13187 }
13188 if (!E.hasState())
13189 continue;
13190 switch (E.getOpcode()) {
13191 case Instruction::Load: {
13192 // No need to reorder masked gather loads, just reorder the scalar
13193 // operands.
13194 if (E.State != TreeEntry::Vectorize)
13195 break;
13196 Type *ScalarTy = E.getMainOp()->getType();
13197 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13198 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13199 // Check if it is profitable to represent consecutive load + reverse as a
13200 // strided load with stride -1.
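// E.g., a group of consecutive loads used in reverse order (p[3], p[2],
// p[1], p[0]) may be emitted as a single strided load instead of a wide load
// followed by a reverse shuffle, when the target supports it and the cost
// model prefers it.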
13201 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13202 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13203 SmallVector<int> Mask;
13204 inversePermutation(E.ReorderIndices, Mask);
13205 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13206 InstructionCost OriginalVecCost =
13207 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13208 BaseLI->getPointerAddressSpace(), CostKind,
13210 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13211 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13212 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13213 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13214 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13215 // Strided load is more profitable than consecutive load + reverse -
13216 // transform the node to strided load.
13217 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13218 ->getPointerOperand()
13219 ->getType());
13220 StridedPtrInfo SPtrInfo;
13221 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13222 SPtrInfo.Ty = VecTy;
13223 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13224 E.State = TreeEntry::StridedVectorize;
13225 }
13226 }
13227 break;
13228 }
13229 case Instruction::Store: {
13230 Type *ScalarTy =
13231 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13232 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13233 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13234 // Check if it is profitable to represent reverse + consecutive store as a
13235 // strided store with stride -1.
13236 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13237 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13238 SmallVector<int> Mask;
13239 inversePermutation(E.ReorderIndices, Mask);
13240 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13241 InstructionCost OriginalVecCost =
13242 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13243 BaseSI->getPointerAddressSpace(), CostKind,
13245 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13246 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13247 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13248 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13249 if (StridedCost < OriginalVecCost)
13250 // Strided store is more profitable than reverse + consecutive store -
13251 // transform the node to strided store.
13252 E.State = TreeEntry::StridedVectorize;
13253 } else if (!E.ReorderIndices.empty()) {
13254 // Check for interleaved stores.
13255 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13256 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13257 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13258 if (Mask.size() < 4)
13259 return 0u;
13260 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13262 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13263 TTI.isLegalInterleavedAccessType(
13264 VecTy, Factor, BaseSI->getAlign(),
13265 BaseSI->getPointerAddressSpace()))
13266 return Factor;
13267 }
13268
13269 return 0u;
13270 };
13271 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13272 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13273 if (InterleaveFactor != 0)
13274 E.setInterleave(InterleaveFactor);
13275 }
13276 break;
13277 }
13278 case Instruction::Select: {
13279 if (E.State != TreeEntry::Vectorize)
13280 break;
13281 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13282 if (MinMaxID == Intrinsic::not_intrinsic)
13283 break;
13284 // This node is a minmax node.
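// E.g., scalars of the form 'select (icmp slt %a, %b), %a, %b' map to an
// smin pattern.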
13285 E.CombinedOp = TreeEntry::MinMax;
13286 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13287 if (SelectOnly && CondEntry->UserTreeIndex &&
13288 CondEntry->State == TreeEntry::Vectorize) {
13289 // The condition node is part of the combined minmax node.
13290 CondEntry->State = TreeEntry::CombinedVectorize;
13291 }
13292 break;
13293 }
13294 case Instruction::FSub:
13295 case Instruction::FAdd: {
13296 // Check if possible to convert (a*b)+c to fma.
13297 if (E.State != TreeEntry::Vectorize ||
13298 !E.getOperations().isAddSubLikeOp())
13299 break;
13300 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13301 .isValid())
13302 break;
13303 // This node is a fmuladd node.
13304 E.CombinedOp = TreeEntry::FMulAdd;
13305 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13306 if (FMulEntry->UserTreeIndex &&
13307 FMulEntry->State == TreeEntry::Vectorize) {
13308 // The FMul node is part of the combined fmuladd node.
13309 FMulEntry->State = TreeEntry::CombinedVectorize;
13310 }
13311 break;
13312 }
13313 default:
13314 break;
13315 }
13316 }
13317
13318 if (LoadEntriesToVectorize.empty()) {
13319 // Single load node - exit.
13320 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13321 VectorizableTree.front()->getOpcode() == Instruction::Load)
13322 return;
13323 // Small graph with small VF - exit.
13324 constexpr unsigned SmallTree = 3;
13325 constexpr unsigned SmallVF = 2;
13326 if ((VectorizableTree.size() <= SmallTree &&
13327 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13328 (VectorizableTree.size() <= 2 && UserIgnoreList))
13329 return;
13330
13331 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13332 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13333 getCanonicalGraphSize() <= SmallTree &&
13334 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13335 [](const std::unique_ptr<TreeEntry> &TE) {
13336 return TE->isGather() && TE->hasState() &&
13337 TE->getOpcode() == Instruction::Load &&
13338 !allSameBlock(TE->Scalars);
13339 }) == 1)
13340 return;
13341 }
13342
13343 // A list of loads to be gathered during the vectorization process. We can
13344 // try to vectorize them at the end, if profitable.
13345 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13347 GatheredLoads;
13348
13349 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13350 TreeEntry &E = *TE;
13351 if (E.isGather() &&
13352 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13353 (!E.hasState() && any_of(E.Scalars,
13354 [&](Value *V) {
13355 return isa<LoadInst>(V) &&
13356 !isVectorized(V) &&
13357 !isDeleted(cast<Instruction>(V));
13358 }))) &&
13359 !isSplat(E.Scalars)) {
13360 for (Value *V : E.Scalars) {
13361 auto *LI = dyn_cast<LoadInst>(V);
13362 if (!LI)
13363 continue;
13364 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13365 continue;
13367 *this, V, *DL, *SE, *TTI,
13368 GatheredLoads[std::make_tuple(
13369 LI->getParent(),
13370 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13371 LI->getType())]);
13372 }
13373 }
13374 }
13375 // Try to vectorize gathered loads if this is not just a gather of loads.
13376 if (!GatheredLoads.empty())
13377 tryToVectorizeGatheredLoads(GatheredLoads);
13378}
13379
13380 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13381 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
13382 /// emission: the actual shuffle instruction is generated only if it is really
13383 /// required. Otherwise, the shuffle instruction emission is delayed till the
13384 /// end of the process, to reduce the number of emitted instructions and to
13385 /// simplify further analysis/transformations.
13386class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13387 bool IsFinalized = false;
13388 SmallVector<int> CommonMask;
13390 const TargetTransformInfo &TTI;
13391 InstructionCost Cost = 0;
13392 SmallDenseSet<Value *> VectorizedVals;
13393 BoUpSLP &R;
13394 SmallPtrSetImpl<Value *> &CheckedExtracts;
13395 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13396 /// While set, we are still trying to estimate the cost for the same nodes and
13397 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13398 /// This may help to better estimate the cost if the same nodes must be permuted,
13399 /// and allows moving most of the long shuffle cost estimation to TTI.
13400 bool SameNodesEstimated = true;
13401
13402 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13403 if (Ty->getScalarType()->isPointerTy()) {
13406 IntegerType::get(Ty->getContext(),
13407 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13408 Ty->getScalarType());
13409 if (auto *VTy = dyn_cast<VectorType>(Ty))
13410 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13411 return Res;
13412 }
13413 return Constant::getAllOnesValue(Ty);
13414 }
13415
13416 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13417 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13418 return TTI::TCC_Free;
13419 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13420 InstructionCost GatherCost = 0;
13421 SmallVector<Value *> Gathers(VL);
13422 if (!Root && isSplat(VL)) {
13423 // Found a broadcast of a single scalar, calculate the cost as
13424 // the broadcast.
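// E.g., <%x, undef, undef, undef> costs just one insertelement, while
// <%x, %x, %x, %x> costs an insertelement plus a broadcast shuffle.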
13425 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13426 assert(It != VL.end() && "Expected at least one non-undef value.");
13427 // Add broadcast for non-identity shuffle only.
13428 bool NeedShuffle =
13429 count(VL, *It) > 1 &&
13430 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13431 if (!NeedShuffle) {
13432 if (isa<FixedVectorType>(ScalarTy)) {
13433 assert(SLPReVec && "FixedVectorType is not expected.");
13434 return TTI.getShuffleCost(
13435 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13436 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13437 cast<FixedVectorType>(ScalarTy));
13438 }
13439 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13440 CostKind, std::distance(VL.begin(), It),
13441 PoisonValue::get(VecTy), *It);
13442 }
13443
13444 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13445 transform(VL, ShuffleMask.begin(), [](Value *V) {
13446 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13447 });
13448 InstructionCost InsertCost =
13449 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13450 PoisonValue::get(VecTy), *It);
13451 return InsertCost + ::getShuffleCost(TTI,
13453 VecTy, ShuffleMask, CostKind,
13454 /*Index=*/0, /*SubTp=*/nullptr,
13455 /*Args=*/*It);
13456 }
13457 return GatherCost +
13458 (all_of(Gathers, IsaPred<UndefValue>)
13460 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13461 ScalarTy));
13462 };
13463
13464 /// Compute the cost of creating a vector containing the extracted values from
13465 /// \p VL.
13467 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13468 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13469 unsigned NumParts) {
13470 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13471 unsigned NumElts =
13472 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13473 auto *EE = dyn_cast<ExtractElementInst>(V);
13474 if (!EE)
13475 return Sz;
13476 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13477 if (!VecTy)
13478 return Sz;
13479 return std::max(Sz, VecTy->getNumElements());
13480 });
13481 // FIXME: this must be moved to TTI for better estimation.
13482 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13483 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13485 SmallVectorImpl<unsigned> &SubVecSizes)
13486 -> std::optional<TTI::ShuffleKind> {
13487 if (NumElts <= EltsPerVector)
13488 return std::nullopt;
13489 int OffsetReg0 =
13490 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13491 [](int S, int I) {
13492 if (I == PoisonMaskElem)
13493 return S;
13494 return std::min(S, I);
13495 }),
13496 EltsPerVector);
13497 int OffsetReg1 = OffsetReg0;
13498 DenseSet<int> RegIndices;
13499 // Check if we are trying to permute the same single or 2 input vectors.
13501 int FirstRegId = -1;
13502 Indices.assign(1, OffsetReg0);
13503 for (auto [Pos, I] : enumerate(Mask)) {
13504 if (I == PoisonMaskElem)
13505 continue;
13506 int Idx = I - OffsetReg0;
13507 int RegId =
13508 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13509 if (FirstRegId < 0)
13510 FirstRegId = RegId;
13511 RegIndices.insert(RegId);
13512 if (RegIndices.size() > 2)
13513 return std::nullopt;
13514 if (RegIndices.size() == 2) {
13515 ShuffleKind = TTI::SK_PermuteTwoSrc;
13516 if (Indices.size() == 1) {
13517 OffsetReg1 = alignDown(
13518 std::accumulate(
13519 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13520 [&](int S, int I) {
13521 if (I == PoisonMaskElem)
13522 return S;
13523 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13524 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13525 if (RegId == FirstRegId)
13526 return S;
13527 return std::min(S, I);
13528 }),
13529 EltsPerVector);
13530 unsigned Index = OffsetReg1 % NumElts;
13531 Indices.push_back(Index);
13532 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13533 }
13534 Idx = I - OffsetReg1;
13535 }
13536 I = (Idx % NumElts) % EltsPerVector +
13537 (RegId == FirstRegId ? 0 : EltsPerVector);
13538 }
13539 return ShuffleKind;
13540 };
13541 InstructionCost Cost = 0;
13542
13543 // Process extracts in blocks of EltsPerVector to check if the source vector
13544 // operand can be re-used directly. If not, add the cost of creating a
13545 // shuffle to extract the values into a vector register.
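// After handling each slice, the slice is also priced as one full-width
// permutation, and that estimate is kept if it is cheaper than everything
// accumulated so far (see the OriginalCost check at the end of the loop).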
13546 for (unsigned Part : seq<unsigned>(NumParts)) {
13547 if (!ShuffleKinds[Part])
13548 continue;
13549 ArrayRef<int> MaskSlice = Mask.slice(
13550 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13551 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13552 copy(MaskSlice, SubMask.begin());
13553 SmallVector<unsigned, 2> Indices;
13554 SmallVector<unsigned, 2> SubVecSizes;
13555 std::optional<TTI::ShuffleKind> RegShuffleKind =
13556 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13557 if (!RegShuffleKind) {
13558 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13559 !ShuffleVectorInst::isIdentityMask(
13560 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13561 Cost +=
13562 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13563 getWidenedType(ScalarTy, NumElts), MaskSlice);
13564 continue;
13565 }
13566 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13567 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13568 Cost +=
13569 ::getShuffleCost(TTI, *RegShuffleKind,
13570 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13571 }
13572 const unsigned BaseVF = getFullVectorNumberOfElements(
13573 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13574 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13575 assert((Idx + SubVecSize) <= BaseVF &&
13576 "SK_ExtractSubvector index out of range");
13578 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13579 Idx, getWidenedType(ScalarTy, SubVecSize));
13580 }
13581 // Second attempt: check whether a single permutation is estimated to be
13582 // cheaper than the subvector extracts.
13583 SubMask.assign(NumElts, PoisonMaskElem);
13584 copy(MaskSlice, SubMask.begin());
13585 InstructionCost OriginalCost = ::getShuffleCost(
13586 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13587 if (OriginalCost < Cost)
13588 Cost = OriginalCost;
13589 }
13590 return Cost;
13591 }
13592 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
13593 /// given mask \p Mask and register number \p Part, which includes
13594 /// \p SliceSize elements.
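/// While the same pair of nodes keeps being reshuffled, their sub-masks are
/// merged into CommonMask and costed only once; the first non-matching pair
/// flushes the accumulated mask through createShuffle().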
13595 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13596 ArrayRef<int> Mask, unsigned Part,
13597 unsigned SliceSize) {
13598 if (SameNodesEstimated) {
13599 // Delay the cost estimation if the same nodes are being reshuffled.
13600 // If we have already requested the cost of reshuffling E1 and E2, there
13601 // is no need to estimate another cost with the sub-Mask; instead, include
13602 // this sub-Mask into the CommonMask so it is estimated later, avoiding a
13603 // double cost estimation.
13604 if ((InVectors.size() == 2 &&
13605 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13606 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13607 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13608 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13609 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13610 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13611 "Expected all poisoned elements.");
13612 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13613 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13614 return;
13615 }
13616 // Found non-matching nodes - need to estimate the cost for the matched
13617 // nodes and to transform the mask.
13618 Cost += createShuffle(InVectors.front(),
13619 InVectors.size() == 1 ? nullptr : InVectors.back(),
13620 CommonMask);
13621 transformMaskAfterShuffle(CommonMask, CommonMask);
13622 } else if (InVectors.size() == 2) {
13623 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13624 transformMaskAfterShuffle(CommonMask, CommonMask);
13625 }
13626 SameNodesEstimated = false;
13627 if (!E2 && InVectors.size() == 1) {
13628 unsigned VF = E1.getVectorFactor();
13629 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13630 VF = std::max(VF, getVF(V1));
13631 } else {
13632 const auto *E = cast<const TreeEntry *>(InVectors.front());
13633 VF = std::max(VF, E->getVectorFactor());
13634 }
13635 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13636 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13637 CommonMask[Idx] = Mask[Idx] + VF;
13638 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13639 transformMaskAfterShuffle(CommonMask, CommonMask);
13640 } else {
13641 auto P = InVectors.front();
13642 Cost += createShuffle(&E1, E2, Mask);
13643 unsigned VF = Mask.size();
13644 if (Value *V1 = dyn_cast<Value *>(P)) {
13645 VF = std::max(VF,
13646 getNumElements(V1->getType()));
13647 } else {
13648 const auto *E = cast<const TreeEntry *>(P);
13649 VF = std::max(VF, E->getVectorFactor());
13650 }
13651 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13652 if (Mask[Idx] != PoisonMaskElem)
13653 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13654 Cost += createShuffle(P, InVectors.front(), CommonMask);
13655 transformMaskAfterShuffle(CommonMask, CommonMask);
13656 }
13657 }
13658
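/// Minimal shuffle "builder" used when only costs are required: it is handed
/// to BaseShuffleAnalysis::createShuffle() and prices each emitted shuffle
/// via TTI, treating empty or identity masks, identity values and poison
/// vectors as free.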
13659 class ShuffleCostBuilder {
13660 const TargetTransformInfo &TTI;
13661
13662 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13663 int Index = -1;
13664 return Mask.empty() ||
13665 (VF == Mask.size() &&
13666 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13667 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13668 Index == 0);
13669 }
13670
13671 public:
13672 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13673 ~ShuffleCostBuilder() = default;
13674 InstructionCost createShuffleVector(Value *V1, Value *,
13675 ArrayRef<int> Mask) const {
13676 // Empty mask or identity mask are free.
13677 unsigned VF =
13678 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13679 if (isEmptyOrIdentity(Mask, VF))
13680 return TTI::TCC_Free;
13681 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13682 cast<VectorType>(V1->getType()), Mask);
13683 }
13684 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13685 // Empty mask or identity mask are free.
13686 unsigned VF =
13687 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13688 if (isEmptyOrIdentity(Mask, VF))
13689 return TTI::TCC_Free;
13690 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13691 cast<VectorType>(V1->getType()), Mask);
13692 }
13693 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13694 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13695 return TTI::TCC_Free;
13696 }
13697 void resizeToMatch(Value *&, Value *&) const {}
13698 };
13699
13700 /// Smart shuffle instruction emission, walks through shuffles trees and
13701 /// tries to find the best matching vector for the actual shuffle
13702 /// instruction.
13703 InstructionCost
13704 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13705 const PointerUnion<Value *, const TreeEntry *> &P2,
13706 ArrayRef<int> Mask) {
13707 ShuffleCostBuilder Builder(TTI);
13708 SmallVector<int> CommonMask(Mask);
13709 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13710 unsigned CommonVF = Mask.size();
13711 InstructionCost ExtraCost = 0;
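// If either source was narrowed by the MinBWs analysis, its elements have to
// be cast to the common scalar type before shuffling; the two lambdas below
// compute that trunc/sext/zext cost, which the callers accumulate into
// ExtraCost.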
13712 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13713 unsigned VF) -> InstructionCost {
13714 if (E.isGather() && allConstant(E.Scalars))
13715 return TTI::TCC_Free;
13716 Type *EScalarTy = E.Scalars.front()->getType();
13717 bool IsSigned = true;
13718 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13719 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13720 IsSigned = It->second.second;
13721 }
13722 if (EScalarTy != ScalarTy) {
13723 unsigned CastOpcode = Instruction::Trunc;
13724 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13725 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13726 if (DstSz > SrcSz)
13727 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13728 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13729 getWidenedType(EScalarTy, VF),
13730 TTI::CastContextHint::None, CostKind);
13731 }
13732 return TTI::TCC_Free;
13733 };
13734 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13735 if (isa<Constant>(V))
13736 return TTI::TCC_Free;
13737 auto *VecTy = cast<VectorType>(V->getType());
13738 Type *EScalarTy = VecTy->getElementType();
13739 if (EScalarTy != ScalarTy) {
13740 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13741 unsigned CastOpcode = Instruction::Trunc;
13742 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13743 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13744 if (DstSz > SrcSz)
13745 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13746 return TTI.getCastInstrCost(
13747 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13748 VecTy, TTI::CastContextHint::None, CostKind);
13749 }
13750 return TTI::TCC_Free;
13751 };
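// The cases below only need to know the common vector width and whether the
// two sources are distinct, so the actual operands are replaced by
// placeholder null / all-ones constant vectors of CommonVF elements before
// the final shuffle cost is computed.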
13752 if (!V1 && !V2 && !P2.isNull()) {
13753 // Shuffle 2 entry nodes.
13754 const TreeEntry *E = cast<const TreeEntry *>(P1);
13755 unsigned VF = E->getVectorFactor();
13756 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13757 CommonVF = std::max(VF, E2->getVectorFactor());
13758 assert(all_of(Mask,
13759 [=](int Idx) {
13760 return Idx < 2 * static_cast<int>(CommonVF);
13761 }) &&
13762 "All elements in mask must be less than 2 * CommonVF.");
13763 if (E->Scalars.size() == E2->Scalars.size()) {
13764 SmallVector<int> EMask = E->getCommonMask();
13765 SmallVector<int> E2Mask = E2->getCommonMask();
13766 if (!EMask.empty() || !E2Mask.empty()) {
13767 for (int &Idx : CommonMask) {
13768 if (Idx == PoisonMaskElem)
13769 continue;
13770 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13771 Idx = EMask[Idx];
13772 else if (Idx >= static_cast<int>(CommonVF))
13773 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13774 E->Scalars.size();
13775 }
13776 }
13777 CommonVF = E->Scalars.size();
13778 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13779 GetNodeMinBWAffectedCost(*E2, CommonVF);
13780 } else {
13781 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13782 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13783 }
13784 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13785 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13786 } else if (!V1 && P2.isNull()) {
13787 // Shuffle single entry node.
13788 const TreeEntry *E = cast<const TreeEntry *>(P1);
13789 unsigned VF = E->getVectorFactor();
13790 CommonVF = VF;
13791 assert(
13792 all_of(Mask,
13793 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13794 "All elements in mask must be less than CommonVF.");
13795 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13796 SmallVector<int> EMask = E->getCommonMask();
13797 assert(!EMask.empty() && "Expected non-empty common mask.");
13798 for (int &Idx : CommonMask) {
13799 if (Idx != PoisonMaskElem)
13800 Idx = EMask[Idx];
13801 }
13802 CommonVF = E->Scalars.size();
13803 } else if (unsigned Factor = E->getInterleaveFactor();
13804 Factor > 0 && E->Scalars.size() != Mask.size() &&
13805 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13806 Factor)) {
13807 // Deinterleaved nodes are free.
13808 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13809 }
13810 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13811 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13812 // Not identity/broadcast? Try to see if the original vector is better.
13813 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13814 CommonVF == CommonMask.size() &&
13815 any_of(enumerate(CommonMask),
13816 [](const auto &&P) {
13817 return P.value() != PoisonMaskElem &&
13818 static_cast<unsigned>(P.value()) != P.index();
13819 }) &&
13820 any_of(CommonMask,
13821 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13822 SmallVector<int> ReorderMask;
13823 inversePermutation(E->ReorderIndices, ReorderMask);
13824 ::addMask(CommonMask, ReorderMask);
13825 }
13826 } else if (V1 && P2.isNull()) {
13827 // Shuffle single vector.
13828 ExtraCost += GetValueMinBWAffectedCost(V1);
13829 CommonVF = getVF(V1);
13830 assert(
13831 all_of(Mask,
13832 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13833 "All elements in mask must be less than CommonVF.");
13834 } else if (V1 && !V2) {
13835 // Shuffle vector and tree node.
13836 unsigned VF = getVF(V1);
13837 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13838 CommonVF = std::max(VF, E2->getVectorFactor());
13839 assert(all_of(Mask,
13840 [=](int Idx) {
13841 return Idx < 2 * static_cast<int>(CommonVF);
13842 }) &&
13843 "All elements in mask must be less than 2 * CommonVF.");
13844 if (E2->Scalars.size() == VF && VF != CommonVF) {
13845 SmallVector<int> E2Mask = E2->getCommonMask();
13846 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13847 for (int &Idx : CommonMask) {
13848 if (Idx == PoisonMaskElem)
13849 continue;
13850 if (Idx >= static_cast<int>(CommonVF))
13851 Idx = E2Mask[Idx - CommonVF] + VF;
13852 }
13853 CommonVF = VF;
13854 }
13855 ExtraCost += GetValueMinBWAffectedCost(V1);
13856 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13857 ExtraCost += GetNodeMinBWAffectedCost(
13858 *E2, std::min(CommonVF, E2->getVectorFactor()));
13859 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13860 } else if (!V1 && V2) {
13861 // Shuffle vector and tree node.
13862 unsigned VF = getVF(V2);
13863 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13864 CommonVF = std::max(VF, E1->getVectorFactor());
13865 assert(all_of(Mask,
13866 [=](int Idx) {
13867 return Idx < 2 * static_cast<int>(CommonVF);
13868 }) &&
13869 "All elements in mask must be less than 2 * CommonVF.");
13870 if (E1->Scalars.size() == VF && VF != CommonVF) {
13871 SmallVector<int> E1Mask = E1->getCommonMask();
13872 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13873 for (int &Idx : CommonMask) {
13874 if (Idx == PoisonMaskElem)
13875 continue;
13876 if (Idx >= static_cast<int>(CommonVF))
13877 Idx = E1Mask[Idx - CommonVF] + VF;
13878 else
13879 Idx = E1Mask[Idx];
13880 }
13881 CommonVF = VF;
13882 }
13883 ExtraCost += GetNodeMinBWAffectedCost(
13884 *E1, std::min(CommonVF, E1->getVectorFactor()));
13885 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13886 ExtraCost += GetValueMinBWAffectedCost(V2);
13887 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13888 } else {
13889 assert(V1 && V2 && "Expected both vectors.");
13890 unsigned VF = getVF(V1);
13891 CommonVF = std::max(VF, getVF(V2));
13892 assert(all_of(Mask,
13893 [=](int Idx) {
13894 return Idx < 2 * static_cast<int>(CommonVF);
13895 }) &&
13896 "All elements in mask must be less than 2 * CommonVF.");
13897 ExtraCost +=
13898 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13899 if (V1->getType() != V2->getType()) {
13900 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13901 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13902 } else {
13903 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13904 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13905 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13906 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13907 }
13908 }
13909 InVectors.front() =
13910 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13911 if (InVectors.size() == 2)
13912 InVectors.pop_back();
13913 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13914 V1, V2, CommonMask, Builder, ScalarTy);
13915 }
13916
13917public:
13918 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13919 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13920 SmallPtrSetImpl<Value *> &CheckedExtracts)
13921 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13922 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13923 CheckedExtracts(CheckedExtracts) {}
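/// Computes the cost adjustment for gathering a node made of
/// extractelements: extracts that become dead after vectorization are
/// credited back, and the function reports whether the source vectors can be
/// used directly as shuffle inputs (UseVecBaseAsInput).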
13924 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13925 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13926 unsigned NumParts, bool &UseVecBaseAsInput) {
13927 UseVecBaseAsInput = false;
13928 if (Mask.empty())
13929 return nullptr;
13930 Value *VecBase = nullptr;
13931 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13932 if (!E->ReorderIndices.empty()) {
13933 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13934 E->ReorderIndices.end());
13935 reorderScalars(VL, ReorderMask);
13936 }
13937 // Check if the extracts can be considered reused, i.e. if the same
13938 // extractelements were already vectorized.
13939 bool PrevNodeFound = any_of(
13940 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13941 [&](const std::unique_ptr<TreeEntry> &TE) {
13942 return ((TE->hasState() && !TE->isAltShuffle() &&
13943 TE->getOpcode() == Instruction::ExtractElement) ||
13944 TE->isGather()) &&
13945 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13946 return VL.size() > Data.index() &&
13947 (Mask[Data.index()] == PoisonMaskElem ||
13948 isa<UndefValue>(VL[Data.index()]) ||
13949 Data.value() == VL[Data.index()]);
13950 });
13951 });
13952 SmallPtrSet<Value *, 4> UniqueBases;
13953 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13954 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13955 for (unsigned Part : seq<unsigned>(NumParts)) {
13956 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13957 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13958 for (auto [I, V] :
13959 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13960 // Ignore non-extractelement scalars.
13961 if (isa<UndefValue>(V) ||
13962 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13963 continue;
13964 // If all users of instruction are going to be vectorized and this
13965 // instruction itself is not going to be vectorized, consider this
13966 // instruction as dead and remove its cost from the final cost of the
13967 // vectorized tree.
13968 // Also, avoid adjusting the cost for extractelements with multiple uses
13969 // in different graph entries.
13970 auto *EE = cast<ExtractElementInst>(V);
13971 VecBase = EE->getVectorOperand();
13972 UniqueBases.insert(VecBase);
13973 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13974 if (!CheckedExtracts.insert(V).second ||
13975 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13976 any_of(EE->users(),
13977 [&](User *U) {
13978 return isa<GetElementPtrInst>(U) &&
13979 !R.areAllUsersVectorized(cast<Instruction>(U),
13980 &VectorizedVals);
13981 }) ||
13982 (!VEs.empty() && !is_contained(VEs, E)))
13983 continue;
13984 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13985 if (!EEIdx)
13986 continue;
13987 unsigned Idx = *EEIdx;
13988 // Take credit for instruction that will become dead.
13989 if (EE->hasOneUse() || !PrevNodeFound) {
13990 Instruction *Ext = EE->user_back();
13991 if (isa<SExtInst, ZExtInst>(Ext) &&
13992 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13993 // Use getExtractWithExtendCost() to calculate the cost of
13994 // extractelement/ext pair.
13995 Cost -= TTI.getExtractWithExtendCost(
13996 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13997 Idx, CostKind);
13998 // Add back the cost of s|zext which is subtracted separately.
13999 Cost += TTI.getCastInstrCost(
14000 Ext->getOpcode(), Ext->getType(), EE->getType(),
14001 TTI::CastContextHint::None, CostKind);
14002 continue;
14003 }
14004 }
14005 APInt &DemandedElts =
14006 VectorOpsToExtracts
14007 .try_emplace(VecBase,
14008 APInt::getZero(getNumElements(VecBase->getType())))
14009 .first->getSecond();
14010 DemandedElts.setBit(Idx);
14011 }
14012 }
14013 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14014 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
14015 DemandedElts, /*Insert=*/false,
14016 /*Extract=*/true, CostKind);
14017 // Check that the gather of extractelements can be represented as just a
14018 // shuffle of one or two vectors the scalars are extracted from, i.e. the
14019 // bunch of extractelement instructions that must be gathered into a
14020 // vector can be modeled as a permutation of the elements of a single
14021 // input vector or of two input vectors.
14022 // Skipped if the same extractelements were already vectorized.
14023 if (!PrevNodeFound)
14024 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14025 InVectors.assign(1, E);
14026 CommonMask.assign(Mask.begin(), Mask.end());
14027 transformMaskAfterShuffle(CommonMask, CommonMask);
14028 SameNodesEstimated = false;
14029 if (NumParts != 1 && UniqueBases.size() != 1) {
14030 UseVecBaseAsInput = true;
14031 VecBase =
14032 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14033 }
14034 return VecBase;
14035 }
14036 /// Checks if the specified entry \p E needs to be delayed because of its
14037 /// dependency nodes.
14038 std::optional<InstructionCost>
14039 needToDelay(const TreeEntry *,
14040 ArrayRef<SmallVector<const TreeEntry *>>) const {
14041 // No need to delay the cost estimation during analysis.
14042 return std::nullopt;
14043 }
14044 /// Reset the builder to handle perfect diamond match.
14045 void resetForSameNode() {
14046 IsFinalized = false;
14047 CommonMask.clear();
14048 InVectors.clear();
14049 Cost = 0;
14050 VectorizedVals.clear();
14051 SameNodesEstimated = true;
14052 }
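// The add() overloads below accumulate shuffle sources: masks are merged
// into CommonMask where possible and flushed through createShuffle() once
// the sources no longer match or finalize() is reached.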
14053 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14054 if (&E1 == &E2) {
14055 assert(all_of(Mask,
14056 [&](int Idx) {
14057 return Idx < static_cast<int>(E1.getVectorFactor());
14058 }) &&
14059 "Expected single vector shuffle mask.");
14060 add(E1, Mask);
14061 return;
14062 }
14063 if (InVectors.empty()) {
14064 CommonMask.assign(Mask.begin(), Mask.end());
14065 InVectors.assign({&E1, &E2});
14066 return;
14067 }
14068 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14069 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14070 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14071 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14072 const auto *It =
14073 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14074 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14075 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14076 }
14077 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14078 if (InVectors.empty()) {
14079 CommonMask.assign(Mask.begin(), Mask.end());
14080 InVectors.assign(1, &E1);
14081 return;
14082 }
14083 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14084 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14085 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14086 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14087 const auto *It =
14088 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14089 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14090 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14091 if (!SameNodesEstimated && InVectors.size() == 1)
14092 InVectors.emplace_back(&E1);
14093 }
14094 /// Adds 2 input vectors and the mask for their shuffling.
14095 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14096 // Can only be reached when shuffling 2 vectors of extractelements, which
14097 // is already handled in adjustExtracts.
14098 assert(InVectors.size() == 1 &&
14099 all_of(enumerate(CommonMask),
14100 [&](auto P) {
14101 if (P.value() == PoisonMaskElem)
14102 return Mask[P.index()] == PoisonMaskElem;
14103 auto *EI = cast<ExtractElementInst>(
14104 cast<const TreeEntry *>(InVectors.front())
14105 ->getOrdered(P.index()));
14106 return EI->getVectorOperand() == V1 ||
14107 EI->getVectorOperand() == V2;
14108 }) &&
14109 "Expected extractelement vectors.");
14110 }
14111 /// Adds another one input vector and the mask for the shuffling.
14112 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14113 if (InVectors.empty()) {
14114 assert(CommonMask.empty() && !ForExtracts &&
14115 "Expected empty input mask/vectors.");
14116 CommonMask.assign(Mask.begin(), Mask.end());
14117 InVectors.assign(1, V1);
14118 return;
14119 }
14120 if (ForExtracts) {
14121 // No need to add vectors here, already handled them in adjustExtracts.
14122 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14123 !CommonMask.empty() &&
14124 all_of(enumerate(CommonMask),
14125 [&](auto P) {
14126 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14127 ->getOrdered(P.index());
14128 if (P.value() == PoisonMaskElem)
14129 return P.value() == Mask[P.index()] ||
14130 isa<UndefValue>(Scalar);
14131 if (isa<Constant>(V1))
14132 return true;
14133 auto *EI = cast<ExtractElementInst>(Scalar);
14134 return EI->getVectorOperand() == V1;
14135 }) &&
14136 "Expected only tree entry for extractelement vectors.");
14137 return;
14138 }
14139 assert(!InVectors.empty() && !CommonMask.empty() &&
14140 "Expected only tree entries from extracts/reused buildvectors.");
14141 unsigned VF = getVF(V1);
14142 if (InVectors.size() == 2) {
14143 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14144 transformMaskAfterShuffle(CommonMask, CommonMask);
14145 VF = std::max<unsigned>(VF, CommonMask.size());
14146 } else if (const auto *InTE =
14147 InVectors.front().dyn_cast<const TreeEntry *>()) {
14148 VF = std::max(VF, InTE->getVectorFactor());
14149 } else {
14150 VF = std::max(
14151 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14152 ->getNumElements());
14153 }
14154 InVectors.push_back(V1);
14155 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14156 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14157 CommonMask[Idx] = Mask[Idx] + VF;
14158 }
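// Prices a buildvector of the given scalars; since only the cost matters,
// the returned "vector" is a placeholder constant built from null, poison
// and undef elements of the scalar type.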
14159 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14160 Value *Root = nullptr) {
14161 Cost += getBuildVectorCost(VL, Root);
14162 if (!Root) {
14163 // FIXME: Need to find a way to avoid use of getNullValue here.
14165 unsigned VF = VL.size();
14166 if (MaskVF != 0)
14167 VF = std::min(VF, MaskVF);
14168 Type *VLScalarTy = VL.front()->getType();
14169 for (Value *V : VL.take_front(VF)) {
14170 Type *ScalarTy = VLScalarTy->getScalarType();
14171 if (isa<PoisonValue>(V)) {
14172 Vals.push_back(PoisonValue::get(ScalarTy));
14173 continue;
14174 }
14175 if (isa<UndefValue>(V)) {
14176 Vals.push_back(UndefValue::get(ScalarTy));
14177 continue;
14178 }
14179 Vals.push_back(Constant::getNullValue(ScalarTy));
14180 }
14181 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14182 assert(SLPReVec && "FixedVectorType is not expected.");
14183 // When REVEC is enabled, we need to expand vector types into scalar
14184 // types.
14185 Vals = replicateMask(Vals, VecTy->getNumElements());
14186 }
14187 return ConstantVector::get(Vals);
14188 }
14189 return ConstantVector::getSplat(
14190 ElementCount::getFixed(
14191 cast<FixedVectorType>(Root->getType())->getNumElements()),
14192 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14193 }
14194 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
14195 /// Finalize emission of the shuffles.
14196 InstructionCost finalize(
14197 ArrayRef<int> ExtMask,
14198 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14199 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14200 function_ref<void(Value *&, SmallVectorImpl<int> &,
14201 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14202 Action = {}) {
14203 IsFinalized = true;
14204 if (Action) {
14205 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14206 if (InVectors.size() == 2)
14207 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14208 else
14209 Cost += createShuffle(Vec, nullptr, CommonMask);
14210 transformMaskAfterShuffle(CommonMask, CommonMask);
14211 assert(VF > 0 &&
14212 "Expected vector length for the final value before action.");
14213 Value *V = cast<Value *>(Vec);
14214 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14215 Cost += createShuffle(V1, V2, Mask);
14216 return V1;
14217 });
14218 InVectors.front() = V;
14219 }
14220 if (!SubVectors.empty()) {
14221 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14222 if (InVectors.size() == 2)
14223 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14224 else
14225 Cost += createShuffle(Vec, nullptr, CommonMask);
14226 transformMaskAfterShuffle(CommonMask, CommonMask);
14227 // Add subvectors permutation cost.
14228 if (!SubVectorsMask.empty()) {
14229 assert(SubVectorsMask.size() <= CommonMask.size() &&
14230 "Expected same size of masks for subvectors and common mask.");
14231 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14232 copy(SubVectorsMask, SVMask.begin());
14233 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14234 if (I2 != PoisonMaskElem) {
14235 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14236 I1 = I2 + CommonMask.size();
14237 }
14238 }
14239 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14240 getWidenedType(ScalarTy, CommonMask.size()),
14241 SVMask, CostKind);
14242 }
14243 for (auto [E, Idx] : SubVectors) {
14244 Type *EScalarTy = E->Scalars.front()->getType();
14245 bool IsSigned = true;
14246 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14247 EScalarTy =
14248 IntegerType::get(EScalarTy->getContext(), It->second.first);
14249 IsSigned = It->second.second;
14250 }
14251 if (ScalarTy != EScalarTy) {
14252 unsigned CastOpcode = Instruction::Trunc;
14253 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14254 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14255 if (DstSz > SrcSz)
14256 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14257 Cost += TTI.getCastInstrCost(
14258 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14259 getWidenedType(EScalarTy, E->getVectorFactor()),
14260 TTI::CastContextHint::None, CostKind);
14261 }
14262 Cost += ::getShuffleCost(
14263 TTI, TTI::SK_InsertSubvector,
14264 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14265 getWidenedType(ScalarTy, E->getVectorFactor()));
14266 if (!CommonMask.empty()) {
14267 std::iota(std::next(CommonMask.begin(), Idx),
14268 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14269 Idx);
14270 }
14271 }
14272 }
14273
14274 if (!ExtMask.empty()) {
14275 if (CommonMask.empty()) {
14276 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14277 } else {
14278 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14279 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14280 if (ExtMask[I] == PoisonMaskElem)
14281 continue;
14282 NewMask[I] = CommonMask[ExtMask[I]];
14283 }
14284 CommonMask.swap(NewMask);
14285 }
14286 }
14287 if (CommonMask.empty()) {
14288 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14289 return Cost;
14290 }
14291 return Cost +
14292 createShuffle(InVectors.front(),
14293 InVectors.size() == 2 ? InVectors.back() : nullptr,
14294 CommonMask);
14295 }
14296
14297 ~ShuffleCostEstimator() {
14298 assert((IsFinalized || CommonMask.empty()) &&
14299 "Shuffle construction must be finalized.");
14300 }
14301};
14302
14303const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14304 unsigned Idx) const {
14305 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14306 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14307 return Op;
14308}
14309
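// The cast context hint mirrors the kind of memory access the vectorized
// node will use (gather/scatter, masked, reversed or plain), so dependent
// cast costs are queried in the right context.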
14310TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14311 if (TE.State == TreeEntry::ScatterVectorize ||
14312 TE.State == TreeEntry::StridedVectorize)
14313 return TTI::CastContextHint::GatherScatter;
14314 if (TE.State == TreeEntry::CompressVectorize)
14315 return TTI::CastContextHint::Masked;
14316 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14317 !TE.isAltShuffle()) {
14318 if (TE.ReorderIndices.empty())
14319 return TTI::CastContextHint::Normal;
14320 SmallVector<int> Mask;
14321 inversePermutation(TE.ReorderIndices, Mask);
14322 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14323 return TTI::CastContextHint::Reversed;
14324 }
14325 return TTI::CastContextHint::None;
14326}
14327
14328 InstructionCost
14329 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14330 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14331 ArrayRef<Value *> VL = E->Scalars;
14332
14333 Type *ScalarTy = getValueType(VL[0]);
14334 if (!isValidElementType(ScalarTy))
14335 return InstructionCost::getInvalid();
14337
14338 // If we have computed a smaller type for the expression, update VecTy so
14339 // that the costs will be accurate.
14340 auto It = MinBWs.find(E);
14341 Type *OrigScalarTy = ScalarTy;
14342 if (It != MinBWs.end()) {
14343 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14344 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14345 if (VecTy)
14346 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14347 }
14348 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14349 unsigned EntryVF = E->getVectorFactor();
14350 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14351
14352 if (E->isGather()) {
14353 if (allConstant(VL))
14354 return 0;
14355 if (isa<InsertElementInst>(VL[0]))
14356 return InstructionCost::getInvalid();
14357 if (isa<CmpInst>(VL.front()))
14358 ScalarTy = VL.front()->getType();
14359 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14360 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14361 }
14362 if (E->State == TreeEntry::SplitVectorize) {
14363 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14364 "Expected exactly 2 combined entries.");
14365 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14366 InstructionCost VectorCost = 0;
14367 if (E->ReorderIndices.empty()) {
14368 VectorCost = ::getShuffleCost(
14369 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14370 E->CombinedEntriesWithIndices.back().second,
14371 getWidenedType(
14372 ScalarTy,
14373 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14374 ->getVectorFactor()));
14375 } else {
14376 unsigned CommonVF =
14377 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14378 ->getVectorFactor(),
14379 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14380 ->getVectorFactor());
14381 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14382 getWidenedType(ScalarTy, CommonVF),
14383 E->getSplitMask(), CostKind);
14384 }
14385 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14386 return VectorCost;
14387 }
14388 InstructionCost CommonCost = 0;
14389 SmallVector<int> Mask;
14390 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14391 (E->State != TreeEntry::StridedVectorize ||
14392 !isReverseOrder(E->ReorderIndices))) {
14393 SmallVector<int> NewMask;
14394 if (E->getOpcode() == Instruction::Store) {
14395 // For stores the order is actually a mask.
14396 NewMask.resize(E->ReorderIndices.size());
14397 copy(E->ReorderIndices, NewMask.begin());
14398 } else {
14399 inversePermutation(E->ReorderIndices, NewMask);
14400 }
14401 ::addMask(Mask, NewMask);
14402 }
14403 if (!E->ReuseShuffleIndices.empty())
14404 ::addMask(Mask, E->ReuseShuffleIndices);
14405 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14406 CommonCost =
14407 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14408 assert((E->State == TreeEntry::Vectorize ||
14409 E->State == TreeEntry::ScatterVectorize ||
14410 E->State == TreeEntry::StridedVectorize ||
14411 E->State == TreeEntry::CompressVectorize) &&
14412 "Unhandled state");
14413 assert(E->getOpcode() &&
14414 ((allSameType(VL) && allSameBlock(VL)) ||
14415 (E->getOpcode() == Instruction::GetElementPtr &&
14416 E->getMainOp()->getType()->isPointerTy()) ||
14417 E->hasCopyableElements()) &&
14418 "Invalid VL");
14419 Instruction *VL0 = E->getMainOp();
14420 unsigned ShuffleOrOp =
14421 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14422 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14423 ShuffleOrOp = E->CombinedOp;
14424 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14425 const unsigned Sz = UniqueValues.size();
14426 SmallBitVector UsedScalars(Sz, false);
14427 for (unsigned I = 0; I < Sz; ++I) {
14428 if (isa<Instruction>(UniqueValues[I]) &&
14429 !E->isCopyableElement(UniqueValues[I]) &&
14430 getTreeEntries(UniqueValues[I]).front() == E)
14431 continue;
14432 UsedScalars.set(I);
14433 }
14434 auto GetCastContextHint = [&](Value *V) {
14435 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14436 return getCastContextHint(*OpTEs.front());
14437 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14438 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14439 !SrcState.isAltShuffle())
14440 return TTI::CastContextHint::GatherScatter;
14441 return TTI::CastContextHint::None;
14442 };
14443 auto GetCostDiff =
14444 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14445 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14446 // Calculate the cost of this instruction.
14447 InstructionCost ScalarCost = 0;
14448 if (isa<CastInst, CallInst>(VL0)) {
14449 // For some instructions there is no need to calculate the cost for
14450 // each particular instance; we can use the cost of a single
14451 // instruction times the total number of scalar instructions.
14452 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14453 } else {
14454 for (unsigned I = 0; I < Sz; ++I) {
14455 if (UsedScalars.test(I))
14456 continue;
14457 ScalarCost += ScalarEltCost(I);
14458 }
14459 }
14460
14461 InstructionCost VecCost = VectorCost(CommonCost);
14462 // Check if the current node must be resized, if the parent node is not
14463 // resized.
14464 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14465 E->Idx != 0 &&
14466 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14467 const EdgeInfo &EI = E->UserTreeIndex;
14468 if (!EI.UserTE->hasState() ||
14469 EI.UserTE->getOpcode() != Instruction::Select ||
14470 EI.EdgeIdx != 0) {
14471 auto UserBWIt = MinBWs.find(EI.UserTE);
14472 Type *UserScalarTy =
14473 (EI.UserTE->isGather() ||
14474 EI.UserTE->State == TreeEntry::SplitVectorize)
14475 ? EI.UserTE->Scalars.front()->getType()
14476 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14477 if (UserBWIt != MinBWs.end())
14478 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14479 UserBWIt->second.first);
14480 if (ScalarTy != UserScalarTy) {
14481 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14482 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14483 unsigned VecOpcode;
14484 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14485 if (BWSz > SrcBWSz)
14486 VecOpcode = Instruction::Trunc;
14487 else
14488 VecOpcode =
14489 It->second.second ? Instruction::SExt : Instruction::ZExt;
14490 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14491 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14492 CostKind);
14493 }
14494 }
14495 }
14496 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14497 ScalarCost, "Calculated costs for Tree"));
14498 return VecCost - ScalarCost;
14499 };
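// From here on, each opcode is costed via GetCostDiff: the vector cost
// (which includes the common reorder/reuse shuffle cost) minus the scalar
// cost of the instructions that become dead, so a negative result means the
// vectorized form is cheaper.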
14500 // Calculate cost difference from vectorizing set of GEPs.
14501 // Negative value means vectorizing is profitable.
14502 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14503 assert((E->State == TreeEntry::Vectorize ||
14504 E->State == TreeEntry::StridedVectorize ||
14505 E->State == TreeEntry::CompressVectorize) &&
14506 "Entry state expected to be Vectorize, StridedVectorize or "
14507 "MaskedLoadCompressVectorize here.");
14508 InstructionCost ScalarCost = 0;
14509 InstructionCost VecCost = 0;
14510 std::tie(ScalarCost, VecCost) = getGEPCosts(
14511 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14512 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14513 "Calculated GEPs cost for Tree"));
14514
14515 return VecCost - ScalarCost;
14516 };
14517
14518 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14519 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14520 if (MinMaxID == Intrinsic::not_intrinsic)
14521 return InstructionCost::getInvalid();
14522 Type *CanonicalType = Ty;
14523 if (CanonicalType->isPtrOrPtrVectorTy())
14524 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14525 CanonicalType->getContext(),
14526 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14527
14528 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14529 {CanonicalType, CanonicalType});
14530 InstructionCost IntrinsicCost =
14531 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14532 // If the selects are the only uses of the compares, they will be
14533 // dead and we can adjust the cost by removing their cost.
14534 if (VI && SelectOnly) {
14535 assert((!Ty->isVectorTy() || SLPReVec) &&
14536 "Expected only for scalar type.");
14537 auto *CI = cast<CmpInst>(VI->getOperand(0));
14538 IntrinsicCost -= TTI->getCmpSelInstrCost(
14539 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14540 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14541 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14542 }
14543 return IntrinsicCost;
14544 };
14545 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14546 Instruction *VI) {
14547 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14548 return Cost;
14549 };
14550 switch (ShuffleOrOp) {
14551 case Instruction::PHI: {
14552 // Count reused scalars.
14553 InstructionCost ScalarCost = 0;
14554 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14555 for (Value *V : UniqueValues) {
14556 auto *PHI = dyn_cast<PHINode>(V);
14557 if (!PHI)
14558 continue;
14559
14560 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14561 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14562 Value *Op = PHI->getIncomingValue(I);
14563 Operands[I] = Op;
14564 }
14565 if (const TreeEntry *OpTE =
14566 getSameValuesTreeEntry(Operands.front(), Operands))
14567 if (CountedOps.insert(OpTE).second &&
14568 !OpTE->ReuseShuffleIndices.empty())
14569 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14570 OpTE->Scalars.size());
14571 }
14572
14573 return CommonCost - ScalarCost;
14574 }
14575 case Instruction::ExtractValue:
14576 case Instruction::ExtractElement: {
14577 APInt DemandedElts;
14578 VectorType *SrcVecTy = nullptr;
14579 auto GetScalarCost = [&](unsigned Idx) {
14580 if (isa<PoisonValue>(UniqueValues[Idx]))
14581 return InstructionCost(TTI::TCC_Free);
14582
14583 auto *I = cast<Instruction>(UniqueValues[Idx]);
14584 if (!SrcVecTy) {
14585 if (ShuffleOrOp == Instruction::ExtractElement) {
14586 auto *EE = cast<ExtractElementInst>(I);
14587 SrcVecTy = EE->getVectorOperandType();
14588 } else {
14589 auto *EV = cast<ExtractValueInst>(I);
14590 Type *AggregateTy = EV->getAggregateOperand()->getType();
14591 unsigned NumElts;
14592 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14593 NumElts = ATy->getNumElements();
14594 else
14595 NumElts = AggregateTy->getStructNumElements();
14596 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14597 }
14598 }
14599 if (I->hasOneUse()) {
14600 Instruction *Ext = I->user_back();
14601 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14602 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14603 // Use getExtractWithExtendCost() to calculate the cost of
14604 // extractelement/ext pair.
14605 InstructionCost Cost = TTI->getExtractWithExtendCost(
14606 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14607 CostKind);
14608 // Subtract the cost of s|zext which is subtracted separately.
14609 Cost -= TTI->getCastInstrCost(
14610 Ext->getOpcode(), Ext->getType(), I->getType(),
14611 TTI::CastContextHint::None, CostKind);
14612 return Cost;
14613 }
14614 }
14615 if (DemandedElts.isZero())
14616 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14617 DemandedElts.setBit(*getExtractIndex(I));
14618 return InstructionCost(TTI::TCC_Free);
14619 };
14620 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14621 return CommonCost - (DemandedElts.isZero()
14622 ? TTI::TCC_Free
14623 : TTI.getScalarizationOverhead(
14624 SrcVecTy, DemandedElts, /*Insert=*/false,
14625 /*Extract=*/true, CostKind));
14626 };
14627 return GetCostDiff(GetScalarCost, GetVectorCost);
14628 }
14629 case Instruction::InsertElement: {
14630 assert(E->ReuseShuffleIndices.empty() &&
14631 "Unique insertelements only are expected.");
14632 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14633 unsigned const NumElts = SrcVecTy->getNumElements();
14634 unsigned const NumScalars = VL.size();
14635
14636 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14637
14638 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14639 unsigned OffsetBeg = *getElementIndex(VL.front());
14640 unsigned OffsetEnd = OffsetBeg;
14641 InsertMask[OffsetBeg] = 0;
14642 for (auto [I, V] : enumerate(VL.drop_front())) {
14643 unsigned Idx = *getElementIndex(V);
14644 if (OffsetBeg > Idx)
14645 OffsetBeg = Idx;
14646 else if (OffsetEnd < Idx)
14647 OffsetEnd = Idx;
14648 InsertMask[Idx] = I + 1;
14649 }
14650 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14651 if (NumOfParts > 0 && NumOfParts < NumElts)
14652 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14653 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14654 VecScalarsSz;
14655 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14656 unsigned InsertVecSz = std::min<unsigned>(
14657 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14658 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14659 bool IsWholeSubvector =
14660 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
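// For example, assuming an 8-wide destination held in one register with 4
// scalars inserted into lanes 2..5: OffsetBeg = 2, OffsetEnd = 5,
// VecScalarsSz = VecSz = 8 and InsertVecSz = 4, so (when the original
// vector is still live) the build is priced as a 4-element subvector insert
// at offset 2.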
14661 // Check if we can safely insert a subvector. If it is not possible, just
14662 // generate a whole-sized vector and shuffle the source vector and the new
14663 // subvector.
14664 if (OffsetBeg + InsertVecSz > VecSz) {
14665 // Align OffsetBeg to generate correct mask.
14666 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14667 InsertVecSz = VecSz;
14668 }
14669
14670 APInt DemandedElts = APInt::getZero(NumElts);
14671 // TODO: Add support for Instruction::InsertValue.
14672 SmallVector<int> Mask;
14673 if (!E->ReorderIndices.empty()) {
14674 inversePermutation(E->ReorderIndices, Mask);
14675 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14676 } else {
14677 Mask.assign(VecSz, PoisonMaskElem);
14678 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14679 }
14680 bool IsIdentity = true;
14681 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14682 Mask.swap(PrevMask);
14683 for (unsigned I = 0; I < NumScalars; ++I) {
14684 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14685 DemandedElts.setBit(InsertIdx);
14686 IsIdentity &= InsertIdx - OffsetBeg == I;
14687 Mask[InsertIdx - OffsetBeg] = I;
14688 }
14689 assert(Offset < NumElts && "Failed to find vector index offset");
14690
14692 Cost -=
14693 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14694 /*Insert*/ true, /*Extract*/ false, CostKind);
14695
14696 // First cost - resize to actual vector size if not identity shuffle or
14697 // need to shift the vector.
14698 // Do not calculate the cost if the actual size is the register size and
14699 // we can merge this shuffle with the following SK_Select.
14700 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14701 if (!IsIdentity)
14702 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14703 InsertVecTy, Mask);
14704 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14705 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14706 }));
14707 // Second cost - permutation with subvector, if some elements are from the
14708 // initial vector or inserting a subvector.
14709 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14710 // subvector of ActualVecTy.
14711 SmallBitVector InMask =
14712 isUndefVector(FirstInsert->getOperand(0),
14713 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14714 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14715 if (InsertVecSz != VecSz) {
14716 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14717 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14718 CostKind, OffsetBeg - Offset, InsertVecTy);
14719 } else {
14720 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14721 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14722 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14723 I <= End; ++I)
14724 if (Mask[I] != PoisonMaskElem)
14725 Mask[I] = I + VecSz;
14726 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14727 Mask[I] =
14728 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14729 Cost +=
14730 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14731 }
14732 }
14733 return Cost;
14734 }
14735 case Instruction::ZExt:
14736 case Instruction::SExt:
14737 case Instruction::FPToUI:
14738 case Instruction::FPToSI:
14739 case Instruction::FPExt:
14740 case Instruction::PtrToInt:
14741 case Instruction::IntToPtr:
14742 case Instruction::SIToFP:
14743 case Instruction::UIToFP:
14744 case Instruction::Trunc:
14745 case Instruction::FPTrunc:
14746 case Instruction::BitCast: {
14747 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14748 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14749 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14750 unsigned Opcode = ShuffleOrOp;
14751 unsigned VecOpcode = Opcode;
14752 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14753 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14754 // Check if the values are candidates to demote.
14755 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14756 if (SrcIt != MinBWs.end()) {
14757 SrcBWSz = SrcIt->second.first;
14758 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14759 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14760 SrcVecTy =
14761 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14762 }
14763 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14764 if (BWSz == SrcBWSz) {
14765 VecOpcode = Instruction::BitCast;
14766 } else if (BWSz < SrcBWSz) {
14767 VecOpcode = Instruction::Trunc;
14768 } else if (It != MinBWs.end()) {
14769 assert(BWSz > SrcBWSz && "Invalid cast!");
14770 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14771 } else if (SrcIt != MinBWs.end()) {
14772 assert(BWSz > SrcBWSz && "Invalid cast!");
14773 VecOpcode =
14774 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14775 }
14776 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14777 !SrcIt->second.second) {
14778 VecOpcode = Instruction::UIToFP;
14779 }
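// E.g. if both sides of a zext were demoted to the same narrow bit width,
// the cast degenerates into a bitcast and is treated as free in
// GetVectorCost below; different demoted widths turn into a trunc or a
// sign/zero-extend at the demoted types.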
14780 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14781 assert(Idx == 0 && "Expected 0 index only");
14782 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14783 VL0->getOperand(0)->getType(),
14784 TTI::CastContextHint::None, CostKind);
14785 };
14786 auto GetVectorCost = [=](InstructionCost CommonCost) {
14787 // Do not count cost here if minimum bitwidth is in effect and it is just
14788 // a bitcast (here it is just a noop).
14789 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14790 return CommonCost;
14791 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14792 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14793
14794 bool IsArithmeticExtendedReduction =
14795 E->Idx == 0 && UserIgnoreList &&
14796 all_of(*UserIgnoreList, [](Value *V) {
14797 auto *I = cast<Instruction>(V);
14798 return is_contained({Instruction::Add, Instruction::FAdd,
14799 Instruction::Mul, Instruction::FMul,
14800 Instruction::And, Instruction::Or,
14801 Instruction::Xor},
14802 I->getOpcode());
14803 });
14804 if (IsArithmeticExtendedReduction &&
14805 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14806 return CommonCost;
14807 return CommonCost +
14808 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14809 VecOpcode == Opcode ? VI : nullptr);
14810 };
14811 return GetCostDiff(GetScalarCost, GetVectorCost);
14812 }
14813 case Instruction::FCmp:
14814 case Instruction::ICmp:
14815 case Instruction::Select: {
14816 CmpPredicate VecPred, SwappedVecPred;
14817 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14818 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14819 match(VL0, MatchCmp))
14820 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14821 else
14822 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14823 ? CmpInst::BAD_FCMP_PREDICATE
14824 : CmpInst::BAD_ICMP_PREDICATE;
14825 auto GetScalarCost = [&](unsigned Idx) {
14826 if (isa<PoisonValue>(UniqueValues[Idx]))
14827 return InstructionCost(TTI::TCC_Free);
14828
14829 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14830 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14831 ? CmpInst::BAD_FCMP_PREDICATE
14832 : CmpInst::BAD_ICMP_PREDICATE;
14833 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14834 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14835 !match(VI, MatchCmp)) ||
14836 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14837 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14838 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14839 ? CmpInst::BAD_FCMP_PREDICATE
14840 : CmpInst::BAD_ICMP_PREDICATE;
14841
14842 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14843 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14844 CostKind, getOperandInfo(VI->getOperand(0)),
14845 getOperandInfo(VI->getOperand(1)), VI);
14846 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14847 if (IntrinsicCost.isValid())
14848 ScalarCost = IntrinsicCost;
14849
14850 return ScalarCost;
14851 };
14852 auto GetVectorCost = [&](InstructionCost CommonCost) {
14853 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14854
14855 InstructionCost VecCost =
14856 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14857 CostKind, getOperandInfo(E->getOperand(0)),
14858 getOperandInfo(E->getOperand(1)), VL0);
14859 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14860 auto *CondType =
14861 getWidenedType(SI->getCondition()->getType(), VL.size());
14862 unsigned CondNumElements = CondType->getNumElements();
14863 unsigned VecTyNumElements = getNumElements(VecTy);
14864 assert(VecTyNumElements >= CondNumElements &&
14865 VecTyNumElements % CondNumElements == 0 &&
14866 "Cannot vectorize Instruction::Select");
14867 if (CondNumElements != VecTyNumElements) {
14868 // When each select has an i1 condition but the selected values are fixed
14869 // vectors, the condition mask is narrower and must be replicated to match.
14870 VecCost += ::getShuffleCost(
14871 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14872 createReplicatedMask(VecTyNumElements / CondNumElements,
14873 CondNumElements));
14874 }
14875 }
14876 return VecCost + CommonCost;
14877 };
14878 return GetCostDiff(GetScalarCost, GetVectorCost);
14879 }
14880 case TreeEntry::MinMax: {
14881 auto GetScalarCost = [&](unsigned Idx) {
14882 return GetMinMaxCost(OrigScalarTy);
14883 };
14884 auto GetVectorCost = [&](InstructionCost CommonCost) {
14885 InstructionCost VecCost = GetMinMaxCost(VecTy);
14886 return VecCost + CommonCost;
14887 };
14888 return GetCostDiff(GetScalarCost, GetVectorCost);
14889 }
14890 case TreeEntry::FMulAdd: {
14891 auto GetScalarCost = [&](unsigned Idx) {
14892 if (isa<PoisonValue>(UniqueValues[Idx]))
14893 return InstructionCost(TTI::TCC_Free);
14894 return GetFMulAddCost(E->getOperations(),
14895 cast<Instruction>(UniqueValues[Idx]));
14896 };
14897 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14898 FastMathFlags FMF;
14899 FMF.set();
14900 for (Value *V : E->Scalars) {
14901 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14902 FMF &= FPCI->getFastMathFlags();
14903 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14904 FMF &= FPCIOp->getFastMathFlags();
14905 }
14906 }
14907 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14908 {VecTy, VecTy, VecTy}, FMF);
14909 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14910 return VecCost + CommonCost;
14911 };
14912 return GetCostDiff(GetScalarCost, GetVectorCost);
14913 }
14914 case Instruction::FNeg:
14915 case Instruction::Add:
14916 case Instruction::FAdd:
14917 case Instruction::Sub:
14918 case Instruction::FSub:
14919 case Instruction::Mul:
14920 case Instruction::FMul:
14921 case Instruction::UDiv:
14922 case Instruction::SDiv:
14923 case Instruction::FDiv:
14924 case Instruction::URem:
14925 case Instruction::SRem:
14926 case Instruction::FRem:
14927 case Instruction::Shl:
14928 case Instruction::LShr:
14929 case Instruction::AShr:
14930 case Instruction::And:
14931 case Instruction::Or:
14932 case Instruction::Xor: {
14933 auto GetScalarCost = [&](unsigned Idx) {
14934 if (isa<PoisonValue>(UniqueValues[Idx]))
14935 return InstructionCost(TTI::TCC_Free);
14936
14937 // We cannot retrieve the operand from UniqueValues[Idx] because an
14938 // interchangeable instruction may be used. The order and the actual
14939 // operand might differ from what is retrieved from UniqueValues[Idx].
14940 Value *Op1 = E->getOperand(0)[Idx];
14941 Value *Op2;
14942 SmallVector<const Value *, 2> Operands(1, Op1);
14943 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14944 Op2 = Op1;
14945 } else {
14946 Op2 = E->getOperand(1)[Idx];
14947 Operands.push_back(Op2);
14948 }
14949 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14950 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14951 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14952 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14953 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14954 I && (ShuffleOrOp == Instruction::FAdd ||
14955 ShuffleOrOp == Instruction::FSub)) {
14956 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14957 if (IntrinsicCost.isValid())
14958 ScalarCost = IntrinsicCost;
14959 }
14960 return ScalarCost;
14961 };
14962 auto GetVectorCost = [=](InstructionCost CommonCost) {
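// An 'and' with constants whose low bits (up to the minimized bitwidth) are
// all ones is a no-op on the truncated type, so only the common cost remains.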
14963 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14964 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14965 ArrayRef<Value *> Ops = E->getOperand(I);
14966 if (all_of(Ops, [&](Value *Op) {
14967 auto *CI = dyn_cast<ConstantInt>(Op);
14968 return CI && CI->getValue().countr_one() >= It->second.first;
14969 }))
14970 return CommonCost;
14971 }
14972 }
14973 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14974 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14975 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14976 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14977 Op2Info, {}, nullptr, TLI) +
14978 CommonCost;
14979 };
14980 return GetCostDiff(GetScalarCost, GetVectorCost);
14981 }
14982 case Instruction::GetElementPtr: {
14983 return CommonCost + GetGEPCostDiff(VL, VL0);
14984 }
14985 case Instruction::Load: {
14986 auto GetScalarCost = [&](unsigned Idx) {
14987 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14988 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14989 VI->getAlign(), VI->getPointerAddressSpace(),
14990 CostKind, TTI::OperandValueInfo(), VI);
14991 };
14992 auto *LI0 = cast<LoadInst>(VL0);
14993 auto GetVectorCost = [&](InstructionCost CommonCost) {
14994 InstructionCost VecLdCost;
14995 switch (E->State) {
14996 case TreeEntry::Vectorize:
14997 if (unsigned Factor = E->getInterleaveFactor()) {
14998 VecLdCost = TTI->getInterleavedMemoryOpCost(
14999 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15000 LI0->getPointerAddressSpace(), CostKind);
15001
15002 } else {
15003 VecLdCost = TTI->getMemoryOpCost(
15004 Instruction::Load, VecTy, LI0->getAlign(),
15005 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15006 }
15007 break;
15008 case TreeEntry::StridedVectorize: {
15009 Align CommonAlignment =
15010 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15011 VecLdCost = TTI->getStridedMemoryOpCost(
15012 Instruction::Load, VecTy, LI0->getPointerOperand(),
15013 /*VariableMask=*/false, CommonAlignment, CostKind);
15014 break;
15015 }
15016 case TreeEntry::CompressVectorize: {
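// Recompute the masked-load-compress analysis to obtain the compress mask and
// the widened load type, cache them for codegen, and cost the load
// (interleaved, masked or plain) plus the shuffle that compresses the loaded
// lanes.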
15017 bool IsMasked;
15018 unsigned InterleaveFactor;
15019 SmallVector<int> CompressMask;
15020 VectorType *LoadVecTy;
15021 SmallVector<Value *> Scalars(VL);
15022 if (!E->ReorderIndices.empty()) {
15023 SmallVector<int> Mask(E->ReorderIndices.begin(),
15024 E->ReorderIndices.end());
15025 reorderScalars(Scalars, Mask);
15026 }
15027 SmallVector<Value *> PointerOps(Scalars.size());
15028 for (auto [I, V] : enumerate(Scalars))
15029 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15030 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15031 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15032 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15033 CompressMask, LoadVecTy);
15034 assert(IsVectorized && "Failed to vectorize load");
15035 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15036 InterleaveFactor, IsMasked);
15037 Align CommonAlignment = LI0->getAlign();
15038 if (InterleaveFactor) {
15039 VecLdCost = TTI->getInterleavedMemoryOpCost(
15040 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15041 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15042 } else if (IsMasked) {
15043 VecLdCost = TTI->getMaskedMemoryOpCost(
15044 Instruction::Load, LoadVecTy, CommonAlignment,
15045 LI0->getPointerAddressSpace(), CostKind);
15046 // TODO: include this cost into CommonCost.
15047 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15048 LoadVecTy, CompressMask, CostKind);
15049 } else {
15050 VecLdCost = TTI->getMemoryOpCost(
15051 Instruction::Load, LoadVecTy, CommonAlignment,
15052 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15053 // TODO: include this cost into CommonCost.
15054 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15055 LoadVecTy, CompressMask, CostKind);
15056 }
15057 break;
15058 }
15059 case TreeEntry::ScatterVectorize: {
15060 Align CommonAlignment =
15061 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15062 VecLdCost = TTI->getGatherScatterOpCost(
15063 Instruction::Load, VecTy, LI0->getPointerOperand(),
15064 /*VariableMask=*/false, CommonAlignment, CostKind);
15065 break;
15066 }
15067 case TreeEntry::CombinedVectorize:
15068 case TreeEntry::SplitVectorize:
15069 case TreeEntry::NeedToGather:
15070 llvm_unreachable("Unexpected vectorization state.");
15071 }
15072 return VecLdCost + CommonCost;
15073 };
15074
15075 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15076 // If this node generates masked gather load then it is not a terminal node.
15077 // Hence address operand cost is estimated separately.
15078 if (E->State == TreeEntry::ScatterVectorize)
15079 return Cost;
15080
15081 // Estimate cost of GEPs since this tree node is a terminator.
15082 SmallVector<Value *> PointerOps(VL.size());
15083 for (auto [I, V] : enumerate(VL))
15084 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15085 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15086 }
15087 case Instruction::Store: {
15088 bool IsReorder = !E->ReorderIndices.empty();
15089 auto GetScalarCost = [=](unsigned Idx) {
15090 auto *VI = cast<StoreInst>(VL[Idx]);
15091 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15092 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15093 VI->getAlign(), VI->getPointerAddressSpace(),
15094 CostKind, OpInfo, VI);
15095 };
15096 auto *BaseSI =
15097 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15098 auto GetVectorCost = [=](InstructionCost CommonCost) {
15099 // We know that we can merge the stores. Calculate the cost.
15100 InstructionCost VecStCost;
15101 if (E->State == TreeEntry::StridedVectorize) {
15102 Align CommonAlignment =
15103 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15104 VecStCost = TTI->getStridedMemoryOpCost(
15105 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
15106 /*VariableMask=*/false, CommonAlignment, CostKind);
15107 } else {
15108 assert(E->State == TreeEntry::Vectorize &&
15109 "Expected either strided or consecutive stores.");
15110 if (unsigned Factor = E->getInterleaveFactor()) {
15111 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15112 "No reused shuffles expected");
15113 CommonCost = 0;
15114 VecStCost = TTI->getInterleavedMemoryOpCost(
15115 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15116 BaseSI->getPointerAddressSpace(), CostKind);
15117 } else {
15118 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15119 VecStCost = TTI->getMemoryOpCost(
15120 Instruction::Store, VecTy, BaseSI->getAlign(),
15121 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15122 }
15123 }
15124 return VecStCost + CommonCost;
15125 };
15126 SmallVector<Value *> PointerOps(VL.size());
15127 for (auto [I, V] : enumerate(VL)) {
15128 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15129 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15130 }
15131
15132 return GetCostDiff(GetScalarCost, GetVectorCost) +
15133 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15134 }
15135 case Instruction::Call: {
15136 auto GetScalarCost = [&](unsigned Idx) {
15137 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15138 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15139 if (ID != Intrinsic::not_intrinsic) {
15140 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15141 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15142 }
15143 return TTI->getCallInstrCost(CI->getCalledFunction(),
15144 CI->getFunctionType()->getReturnType(),
15145 CI->getFunctionType()->params(), CostKind);
15146 };
15147 auto GetVectorCost = [=](InstructionCost CommonCost) {
15148 auto *CI = cast<CallInst>(VL0);
15149 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15150 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15151 CI, ID, VecTy->getNumElements(),
15152 It != MinBWs.end() ? It->second.first : 0, TTI);
15153 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15154 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15155 };
15156 return GetCostDiff(GetScalarCost, GetVectorCost);
15157 }
15158 case Instruction::ShuffleVector: {
15159 if (!SLPReVec || E->isAltShuffle())
15160 assert(E->isAltShuffle() &&
15161 ((Instruction::isBinaryOp(E->getOpcode()) &&
15162 Instruction::isBinaryOp(E->getAltOpcode())) ||
15163 (Instruction::isCast(E->getOpcode()) &&
15164 Instruction::isCast(E->getAltOpcode())) ||
15165 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15166 "Invalid Shuffle Vector Operand");
15167 // Try to find the previous shuffle node with the same operands and same
15168 // main/alternate ops.
15169 auto TryFindNodeWithEqualOperands = [=]() {
15170 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15171 if (TE.get() == E)
15172 break;
15173 if (TE->hasState() && TE->isAltShuffle() &&
15174 ((TE->getOpcode() == E->getOpcode() &&
15175 TE->getAltOpcode() == E->getAltOpcode()) ||
15176 (TE->getOpcode() == E->getAltOpcode() &&
15177 TE->getAltOpcode() == E->getOpcode())) &&
15178 TE->hasEqualOperands(*E))
15179 return true;
15180 }
15181 return false;
15182 };
15183 auto GetScalarCost = [&](unsigned Idx) {
15184 if (isa<PoisonValue>(UniqueValues[Idx]))
15185 return InstructionCost(TTI::TCC_Free);
15186
15187 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15188 assert(E->getMatchingMainOpOrAltOp(VI) &&
15189 "Unexpected main/alternate opcode");
15190 (void)E;
15191 return TTI->getInstructionCost(VI, CostKind);
15192 };
15193 // Need to clear CommonCost since the final shuffle cost is included into
15194 // vector cost.
15195 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15196 // VecCost is equal to sum of the cost of creating 2 vectors
15197 // and the cost of creating shuffle.
15198 InstructionCost VecCost = 0;
15199 if (TryFindNodeWithEqualOperands()) {
15200 LLVM_DEBUG({
15201 dbgs() << "SLP: diamond match for alternate node found.\n";
15202 E->dump();
15203 });
15204 // No need to add new vector costs here since we're going to reuse
15205 // same main/alternate vector ops, just do different shuffling.
15206 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15207 VecCost =
15208 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15209 VecCost +=
15210 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15211 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15212 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15213 VecCost = TTIRef.getCmpSelInstrCost(
15214 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15215 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15216 VL0);
15217 VecCost += TTIRef.getCmpSelInstrCost(
15218 E->getOpcode(), VecTy, MaskTy,
15219 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15220 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15221 E->getAltOp());
15222 } else {
15223 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15224 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15225 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15226 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15227 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15228 unsigned SrcBWSz =
15229 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15230 if (SrcIt != MinBWs.end()) {
15231 SrcBWSz = SrcIt->second.first;
15232 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15233 SrcTy = getWidenedType(SrcSclTy, VL.size());
15234 }
15235 if (BWSz <= SrcBWSz) {
15236 if (BWSz < SrcBWSz)
15237 VecCost =
15238 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15239 TTI::CastContextHint::None, CostKind);
15240 LLVM_DEBUG({
15241 dbgs()
15242 << "SLP: alternate extension, which should be truncated.\n";
15243 E->dump();
15244 });
15245 return VecCost;
15246 }
15247 }
15248 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15249 TTI::CastContextHint::None, CostKind);
15250 VecCost +=
15251 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15252 TTI::CastContextHint::None, CostKind);
15253 }
15254 SmallVector<int> Mask;
15255 E->buildAltOpShuffleMask(
15256 [&](Instruction *I) {
15257 assert(E->getMatchingMainOpOrAltOp(I) &&
15258 "Unexpected main/alternate opcode");
15259 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15260 *TLI);
15261 },
15262 Mask);
15263 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15264 FinalVecTy, Mask, CostKind);
15265 // Patterns like [fadd,fsub] can be combined into a single instruction
15266 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15267 // need to take into account their order when looking for the most used
15268 // order.
15269 unsigned Opcode0 = E->getOpcode();
15270 unsigned Opcode1 = E->getAltOpcode();
15271 SmallBitVector OpcodeMask(
15272 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15273 // If this pattern is supported by the target then we consider the
15274 // order.
15275 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15276 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15277 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15278 return AltVecCost < VecCost ? AltVecCost : VecCost;
15279 }
15280 // TODO: Check the reverse order too.
15281 return VecCost;
15282 };
15283 if (SLPReVec && !E->isAltShuffle())
15284 return GetCostDiff(
15285 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15286 // If a group uses mask in order, the shufflevector can be
15287 // eliminated by instcombine. Then the cost is 0.
15289 "Not supported shufflevector usage.");
15290 auto *SV = cast<ShuffleVectorInst>(VL.front());
15291 unsigned SVNumElements =
15292 cast<FixedVectorType>(SV->getOperand(0)->getType())
15293 ->getNumElements();
15294 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15295 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15296 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15297 int NextIndex = 0;
15298 if (!all_of(Group, [&](Value *V) {
15300 "Not supported shufflevector usage.");
15301 auto *SV = cast<ShuffleVectorInst>(V);
15302 int Index;
15303 [[maybe_unused]] bool IsExtractSubvectorMask =
15304 SV->isExtractSubvectorMask(Index);
15305 assert(IsExtractSubvectorMask &&
15306 "Not supported shufflevector usage.");
15307 if (NextIndex != Index)
15308 return false;
15309 NextIndex += SV->getShuffleMask().size();
15310 return true;
15311 }))
15312 return ::getShuffleCost(
15313 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15314 calculateShufflevectorMask(E->Scalars));
15315 }
15316 return TTI::TCC_Free;
15317 });
15318 return GetCostDiff(GetScalarCost, GetVectorCost);
15319 }
15320 case Instruction::Freeze:
15321 return CommonCost;
15322 default:
15323 llvm_unreachable("Unknown instruction");
15324 }
15325}
15326
15327bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15328 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15329 << VectorizableTree.size() << " is fully vectorizable .\n");
15330
15331 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15332 SmallVector<int> Mask;
15333 return TE->isGather() &&
15334 !any_of(TE->Scalars,
15335 [this](Value *V) { return EphValues.contains(V); }) &&
15336 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15337 TE->Scalars.size() < Limit ||
15338 (((TE->hasState() &&
15339 TE->getOpcode() == Instruction::ExtractElement) ||
15340 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15341 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15342 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15343 !TE->isAltShuffle()) ||
15344 any_of(TE->Scalars, IsaPred<LoadInst>));
15345 };
15346
15347 // We only handle trees of heights 1 and 2.
15348 if (VectorizableTree.size() == 1 &&
15349 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15350 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15351 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15352 (ForReduction &&
15353 AreVectorizableGathers(VectorizableTree[0].get(),
15354 VectorizableTree[0]->Scalars.size()) &&
15355 VectorizableTree[0]->getVectorFactor() > 2)))
15356 return true;
15357
15358 if (VectorizableTree.size() != 2)
15359 return false;
15360
15361 // Handle splat and all-constant stores. Also try to vectorize tiny trees
15362 // whose second node is a gather with fewer scalar operands than the initial
15363 // tree element (it may be profitable to shuffle the second gather), or whose
15364 // scalars are extractelements that form a shuffle.
15365 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15366 AreVectorizableGathers(VectorizableTree[1].get(),
15367 VectorizableTree[0]->Scalars.size()))
15368 return true;
15369
15370 // Gathering cost would be too much for tiny trees.
15371 if (VectorizableTree[0]->isGather() ||
15372 (VectorizableTree[1]->isGather() &&
15373 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15374 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15375 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15376 return false;
15377
15378 return true;
15379}
15380
15381 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15382 TargetTransformInfo *TTI,
15383 bool MustMatchOrInst) {
15384 // Look past the root to find a source value. Arbitrarily follow the
15385 // path through operand 0 of any 'or'. Also, peek through optional
15386 // shift-left-by-multiple-of-8-bits.
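// For example, ((zext i8 load) << 0) | ((zext i8 load) << 8) | ... is
// expected to be folded by the backend into a single wide load.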
15387 Value *ZextLoad = Root;
15388 const APInt *ShAmtC;
15389 bool FoundOr = false;
15390 while (!isa<ConstantExpr>(ZextLoad) &&
15391 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15392 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15393 ShAmtC->urem(8) == 0))) {
15394 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15395 ZextLoad = BinOp->getOperand(0);
15396 if (BinOp->getOpcode() == Instruction::Or)
15397 FoundOr = true;
15398 }
15399 // Check if the input is an extended load of the required or/shift expression.
15400 Value *Load;
15401 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15402 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15403 return false;
15404
15405 // Require that the total load bit width is a legal integer type.
15406 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15407 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15408 Type *SrcTy = Load->getType();
15409 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15410 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15411 return false;
15412
15413 // Everything matched - assume that we can fold the whole sequence using
15414 // load combining.
15415 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15416 << *(cast<Instruction>(Root)) << "\n");
15417
15418 return true;
15419}
15420
15421 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15422 if (RdxKind != RecurKind::Or)
15423 return false;
15424
15425 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15426 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15427 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15428 /* MatchOr */ false);
15429}
15430
15431 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15432 // Peek through a final sequence of stores and check if all operations are
15433 // likely to be load-combined.
15434 unsigned NumElts = Stores.size();
15435 for (Value *Scalar : Stores) {
15436 Value *X;
15437 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15438 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15439 return false;
15440 }
15441 return true;
15442}
15443
15444bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15445 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15446 return true;
15447
15448 // Graph is empty - do nothing.
15449 if (VectorizableTree.empty()) {
15450 assert(ExternalUses.empty() && "We shouldn't have any external users");
15451
15452 return true;
15453 }
15454
15455 // No need to vectorize inserts of gathered values.
15456 if (VectorizableTree.size() == 2 &&
15457 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15458 VectorizableTree[1]->isGather() &&
15459 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15460 !(isSplat(VectorizableTree[1]->Scalars) ||
15461 allConstant(VectorizableTree[1]->Scalars))))
15462 return true;
15463
15464 // If the graph includes only PHI nodes and gathers, it is definitely not
15465 // profitable for the vectorization, we can skip it, if the cost threshold is
15466 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15467 // gathers/buildvectors.
15468 constexpr int Limit = 4;
15469 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15470 !VectorizableTree.empty() &&
15471 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15472 return (TE->isGather() &&
15473 (!TE->hasState() ||
15474 TE->getOpcode() != Instruction::ExtractElement) &&
15475 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15476 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15477 }))
15478 return true;
15479
15480 // Do not vectorize small tree of phis only, if all vector phis are also
15481 // gathered.
15482 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15483 VectorizableTree.size() <= Limit &&
15484 all_of(VectorizableTree,
15485 [&](const std::unique_ptr<TreeEntry> &TE) {
15486 return (TE->isGather() &&
15487 (!TE->hasState() ||
15488 TE->getOpcode() != Instruction::ExtractElement) &&
15489 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15490 Limit) ||
15491 (TE->hasState() &&
15492 (TE->getOpcode() == Instruction::InsertElement ||
15493 (TE->getOpcode() == Instruction::PHI &&
15494 all_of(TE->Scalars, [&](Value *V) {
15495 return isa<PoisonValue>(V) || MustGather.contains(V);
15496 }))));
15497 }) &&
15498 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15499 return TE->State == TreeEntry::Vectorize &&
15500 TE->getOpcode() == Instruction::PHI;
15501 }))
15502 return true;
15503
15504 // If the tree contains only phis, buildvectors, split nodes and
15505 // small nodes with reuses, we can skip it.
15506 SmallVector<const TreeEntry *> StoreLoadNodes;
15507 unsigned NumGathers = 0;
15508 constexpr int LimitTreeSize = 36;
15509 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15510 all_of(VectorizableTree,
15511 [&](const std::unique_ptr<TreeEntry> &TE) {
15512 if (!TE->isGather() && TE->hasState() &&
15513 (TE->getOpcode() == Instruction::Load ||
15514 TE->getOpcode() == Instruction::Store)) {
15515 StoreLoadNodes.push_back(TE.get());
15516 return true;
15517 }
15518 if (TE->isGather())
15519 ++NumGathers;
15520 return TE->State == TreeEntry::SplitVectorize ||
15521 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15522 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15523 VectorizableTree.size() > LimitTreeSize) ||
15524 (TE->isGather() &&
15525 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15526 (TE->hasState() &&
15527 (TE->getOpcode() == Instruction::PHI ||
15528 (TE->hasCopyableElements() &&
15529 static_cast<unsigned>(count_if(
15530 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15531 TE->Scalars.size() / 2) ||
15532 ((!TE->ReuseShuffleIndices.empty() ||
15533 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15534 TE->Scalars.size() == 2)));
15535 }) &&
15536 (StoreLoadNodes.empty() ||
15537 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15538 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15539 return TE->getOpcode() == Instruction::Store ||
15540 all_of(TE->Scalars, [&](Value *V) {
15541 return !isa<LoadInst>(V) ||
15542 areAllUsersVectorized(cast<Instruction>(V));
15543 });
15544 })))))
15545 return true;
15546
15547 // If the tree contains only buildvector, 2 non-buildvectors (with root user
15548 // tree node) and other buildvectors, we can skip it.
15549 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15550 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15551 VectorizableTree.size() >= Limit &&
15552 count_if(ArrayRef(VectorizableTree).drop_front(),
15553 [&](const std::unique_ptr<TreeEntry> &TE) {
15554 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15555 TE->UserTreeIndex.UserTE->Idx == 0;
15556 }) == 2)
15557 return true;
15558
15559 // If the tree contains only vectorization of the phi node from the
15560 // buildvector - skip it.
15561 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15562 VectorizableTree.size() > 2 &&
15563 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15564 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15565 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15566 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15567 all_of(
15568 ArrayRef(VectorizableTree).drop_front(2),
15569 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15570 return true;
15571
15572 // We can vectorize the tree if its size is greater than or equal to the
15573 // minimum size specified by the MinTreeSize command line option.
15574 if (VectorizableTree.size() >= MinTreeSize)
15575 return false;
15576
15577 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15578 // can vectorize it if we can prove it fully vectorizable.
15579 if (isFullyVectorizableTinyTree(ForReduction))
15580 return false;
15581
15582 // Check if any of the gather node forms an insertelement buildvector
15583 // somewhere.
15584 bool IsAllowedSingleBVNode =
15585 VectorizableTree.size() > 1 ||
15586 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15587 !VectorizableTree.front()->isAltShuffle() &&
15588 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15589 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15590 allSameBlock(VectorizableTree.front()->Scalars));
15591 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15592 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15593 return isa<ExtractElementInst, Constant>(V) ||
15594 (IsAllowedSingleBVNode &&
15595 !V->hasNUsesOrMore(UsesLimit) &&
15596 any_of(V->users(), IsaPred<InsertElementInst>));
15597 });
15598 }))
15599 return false;
15600
15601 if (VectorizableTree.back()->isGather() &&
15602 VectorizableTree.back()->hasState() &&
15603 VectorizableTree.back()->isAltShuffle() &&
15604 VectorizableTree.back()->getVectorFactor() > 2 &&
15605 allSameBlock(VectorizableTree.back()->Scalars) &&
15606 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15607 TTI->getScalarizationOverhead(
15608 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15609 VectorizableTree.back()->getVectorFactor()),
15610 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15611 /*Insert=*/true, /*Extract=*/false,
15612 TTI::TCK_RecipThroughput) > -SLPCostThreshold)
15613 return false;
15614
15615 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15616 // vectorizable.
15617 return true;
15618}
15619
15620 bool BoUpSLP::isTreeNotExtendable() const {
15621 if (getCanonicalGraphSize() != getTreeSize()) {
15622 constexpr unsigned SmallTree = 3;
15623 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15624 getCanonicalGraphSize() <= SmallTree &&
15625 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15626 [](const std::unique_ptr<TreeEntry> &TE) {
15627 return TE->isGather() && TE->hasState() &&
15628 TE->getOpcode() == Instruction::Load &&
15629 !allSameBlock(TE->Scalars);
15630 }) == 1)
15631 return true;
15632 return false;
15633 }
15634 bool Res = false;
15635 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15636 TreeEntry &E = *VectorizableTree[Idx];
15637 if (E.State == TreeEntry::SplitVectorize)
15638 return false;
15639 if (!E.isGather())
15640 continue;
15641 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15642 (!E.hasState() &&
15644 (isa<ExtractElementInst>(E.Scalars.front()) &&
15645 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15646 return false;
15647 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15648 continue;
15649 Res = true;
15650 }
15651 return Res;
15652}
15653
15654 InstructionCost BoUpSLP::getSpillCost() {
15655 // Walk from the bottom of the tree to the top, tracking which values are
15656 // live. When we see a call instruction that is not part of our tree,
15657 // query TTI to see if there is a cost to keeping values live over it
15658 // (for example, if spills and fills are required).
15659
15660 const TreeEntry *Root = VectorizableTree.front().get();
15661 if (Root->isGather())
15662 return 0;
15663
15664 InstructionCost Cost = 0;
15665 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15666 EntriesToOperands;
15667 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15668 SmallPtrSet<const Instruction *, 8> LastInstructions;
15669 for (const auto &TEPtr : VectorizableTree) {
15670 if (!TEPtr->isGather()) {
15671 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15672 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15673 LastInstructions.insert(LastInst);
15674 }
15675 if (TEPtr->UserTreeIndex)
15676 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15677 }
15678
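// Returns true for assume-like intrinsics and for intrinsics that are cheaper
// as an intrinsic than as a library call; such calls are assumed not to force
// spills of live vector values.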
15679 auto NoCallIntrinsic = [this](const Instruction *I) {
15680 const auto *II = dyn_cast<IntrinsicInst>(I);
15681 if (!II)
15682 return false;
15683 if (II->isAssumeLikeIntrinsic())
15684 return true;
15685 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15686 InstructionCost IntrCost =
15687 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15688 InstructionCost CallCost = TTI->getCallInstrCost(
15689 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15690 return IntrCost < CallCost;
15691 };
15692
15693 // Maps the last instruction of an entry to the last instruction of one of
15694 // its operand entries plus a flag. If the flag is true, there are no calls
15695 // in between these instructions.
15696 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15697 CheckedInstructions;
15698 unsigned Budget = 0;
15699 const unsigned BudgetLimit =
15700 ScheduleRegionSizeBudget / VectorizableTree.size();
15701 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15702 const Instruction *Last) {
15703 assert(First->getParent() == Last->getParent() &&
15704 "Expected instructions in same block.");
15705 if (auto It = CheckedInstructions.find(Last);
15706 It != CheckedInstructions.end()) {
15707 const Instruction *Checked = It->second.getPointer();
15708 if (Checked == First || Checked->comesBefore(First))
15709 return It->second.getInt() != 0;
15710 Last = Checked;
15711 } else if (Last == First || Last->comesBefore(First)) {
15712 return true;
15713 }
15714 BasicBlock::reverse_iterator InstIt =
15715 ++First->getIterator().getReverse(),
15716 PrevInstIt =
15717 Last->getIterator().getReverse();
15718 SmallVector<const Instruction *> LastInstsInRange;
15719 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15720 // Debug information does not impact spill cost.
15721 // Vectorized calls, represented as vector intrinsics, do not impact spill
15722 // cost.
15723 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15724 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15725 for (const Instruction *LastInst : LastInstsInRange)
15726 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15727 return false;
15728 }
15729 if (LastInstructions.contains(&*PrevInstIt))
15730 LastInstsInRange.push_back(&*PrevInstIt);
15731
15732 ++PrevInstIt;
15733 ++Budget;
15734 }
15735 for (const Instruction *LastInst : LastInstsInRange)
15736 CheckedInstructions.try_emplace(
15737 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15738 Budget <= BudgetLimit ? 1 : 0);
15739 return Budget <= BudgetLimit;
15740 };
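// Adds the cost of keeping the vectorized operand entry live across a call.
// For REVEC, the cost of keeping the original (now dead) scalar vectors live
// is subtracted.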
15741 auto AddCosts = [&](const TreeEntry *Op) {
15742 Type *ScalarTy = Op->Scalars.front()->getType();
15743 auto It = MinBWs.find(Op);
15744 if (It != MinBWs.end())
15745 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15746 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15747 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15748 if (ScalarTy->isVectorTy()) {
15749 // Handle revec dead vector instructions.
15750 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15751 }
15752 };
15753 // Memoize the relationship between blocks, i.e. if there is (at least one)
15754 // non-vectorized call between the blocks. This allows skipping the analysis
15755 // of the same block paths multiple times.
15756 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15757 ParentOpParentToPreds;
15758 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15759 BasicBlock *OpParent) {
15760 auto Key = std::make_pair(Root, OpParent);
15761 if (auto It = ParentOpParentToPreds.find(Key);
15762 It != ParentOpParentToPreds.end())
15763 return It->second;
15764 SmallVector<BasicBlock *> Worklist;
15765 if (Pred)
15766 Worklist.push_back(Pred);
15767 else
15768 Worklist.append(pred_begin(Root), pred_end(Root));
15769 SmallPtrSet<const BasicBlock *, 16> Visited;
15770 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 4>
15771 ParentsPairsToAdd;
15772 bool Res = false;
15773 auto Cleanup = make_scope_exit([&]() {
15774 for (const auto &KeyPair : ParentsPairsToAdd) {
15775 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15776 "Should not have been added before.");
15777 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15778 }
15779 });
15780 while (!Worklist.empty()) {
15781 BasicBlock *BB = Worklist.pop_back_val();
15782 if (BB == OpParent || !Visited.insert(BB).second)
15783 continue;
15784 auto Pair = std::make_pair(BB, OpParent);
15785 if (auto It = ParentOpParentToPreds.find(Pair);
15786 It != ParentOpParentToPreds.end()) {
15787 Res = It->second;
15788 return Res;
15789 }
15790 ParentsPairsToAdd.insert(Pair);
15791 unsigned BlockSize = BB->size();
15792 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15793 return Res;
15794 Budget += BlockSize;
15795 if (Budget > BudgetLimit)
15796 return Res;
15797 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15798 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15799 BB->getTerminator()))
15800 return Res;
15801 Worklist.append(pred_begin(BB), pred_end(BB));
15802 }
15803 Res = true;
15804 return Res;
15805 };
15806 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15807 while (!LiveEntries.empty()) {
15808 const TreeEntry *Entry = LiveEntries.pop_back_val();
15809 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15810 if (Operands.empty())
15811 continue;
15812 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15813 BasicBlock *Parent = LastInst->getParent();
15814 for (const TreeEntry *Op : Operands) {
15815 if (!Op->isGather())
15816 LiveEntries.push_back(Op);
15817 if (Entry->State == TreeEntry::SplitVectorize ||
15818 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15819 (Op->isGather() && allConstant(Op->Scalars)))
15820 continue;
15821 Budget = 0;
15822 BasicBlock *Pred = nullptr;
15823 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15824 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15825 BasicBlock *OpParent;
15826 Instruction *OpLastInst;
15827 if (Op->isGather()) {
15828 assert(Entry->getOpcode() == Instruction::PHI &&
15829 "Expected phi node only.");
15830 OpParent = cast<PHINode>(Entry->getMainOp())
15831 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15832 OpLastInst = OpParent->getTerminator();
15833 for (Value *V : Op->Scalars) {
15834 auto *Inst = dyn_cast<Instruction>(V);
15835 if (!Inst)
15836 continue;
15837 if (isVectorized(V)) {
15838 OpParent = Inst->getParent();
15839 OpLastInst = Inst;
15840 break;
15841 }
15842 }
15843 } else {
15844 OpLastInst = EntriesToLastInstruction.at(Op);
15845 OpParent = OpLastInst->getParent();
15846 }
15847 // Check the call instructions within the same basic blocks.
15848 if (OpParent == Parent) {
15849 if (Entry->getOpcode() == Instruction::PHI) {
15850 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15851 AddCosts(Op);
15852 continue;
15853 }
15854 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15855 AddCosts(Op);
15856 continue;
15857 }
15858 // Check for call instruction in between blocks.
15859 // 1. Check entry's block to the head.
15860 if (Entry->getOpcode() != Instruction::PHI &&
15861 !CheckForNonVecCallsInSameBlock(
15862 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15863 LastInst)) {
15864 AddCosts(Op);
15865 continue;
15866 }
15867 // 2. Check op's block from the end.
15868 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15869 OpParent->getTerminator())) {
15870 AddCosts(Op);
15871 continue;
15872 }
15873 // 3. Check the predecessors of entry's block till op's block.
15874 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15875 AddCosts(Op);
15876 continue;
15877 }
15878 }
15879 }
15880
15881 return Cost;
15882}
15883
15884 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
15885 /// the buildvector sequence.
15886 static bool isFirstInsertElement(const InsertElementInst *IE1,
15887 const InsertElementInst *IE2) {
15888 if (IE1 == IE2)
15889 return false;
15890 const auto *I1 = IE1;
15891 const auto *I2 = IE2;
15892 const InsertElementInst *PrevI1;
15893 const InsertElementInst *PrevI2;
15894 unsigned Idx1 = *getElementIndex(IE1);
15895 unsigned Idx2 = *getElementIndex(IE2);
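// Walk both insertelement chains towards their source vectors in lock-step:
// if IE2's chain reaches IE1 first, IE1 precedes IE2 in the buildvector
// sequence, and vice versa.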
15896 do {
15897 if (I2 == IE1)
15898 return true;
15899 if (I1 == IE2)
15900 return false;
15901 PrevI1 = I1;
15902 PrevI2 = I2;
15903 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15904 getElementIndex(I1).value_or(Idx2) != Idx2)
15905 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15906 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15907 getElementIndex(I2).value_or(Idx1) != Idx1)
15908 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15909 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15910 llvm_unreachable("Two different buildvectors not expected.");
15911}
15912
15913namespace {
15914/// Returns incoming Value *, if the requested type is Value * too, or a default
15915/// value, otherwise.
15916struct ValueSelect {
15917 template <typename U>
15918 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15919 return V;
15920 }
15921 template <typename U>
15922 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15923 return U();
15924 }
15925};
15926} // namespace
15927
15928/// Does the analysis of the provided shuffle masks and performs the requested
15929/// actions on the vectors with the given shuffle masks. It tries to do it in
15930/// several steps.
15931/// 1. If the Base vector is not undef vector, resizing the very first mask to
15932/// have common VF and perform action for 2 input vectors (including non-undef
15933 /// Base). Other shuffle masks are combined with the result of the first stage
15934 /// and processed as a shuffle of 2 elements.
15935 /// 2. If the Base is an undef vector and has only 1 shuffle mask, perform the
15936 /// action only for 1 vector with the given mask, if it is not the identity
15937 /// mask.
15938 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15939 /// vectors, combining the masks properly between the steps.
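/// For example, with an undef Base and three shuffled vectors (V1, M1),
/// (V2, M2) and (V3, M3), this performs Action on the combined mask of M1 and
/// M2 over {V1, V2}, and then Action over the previous result and V3.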
15940 template <typename T>
15941 static T *performExtractsShuffleAction(
15942 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15943 function_ref<unsigned(T *)> GetVF,
15944 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15945 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15946 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15947 SmallVector<int> Mask(ShuffleMask.begin()->second);
15948 auto VMIt = std::next(ShuffleMask.begin());
15949 T *Prev = nullptr;
15950 SmallBitVector UseMask =
15951 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15952 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15953 if (!IsBaseUndef.all()) {
15954 // Base is not undef, need to combine it with the next subvectors.
15955 std::pair<T *, bool> Res =
15956 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15957 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15958 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15959 if (Mask[Idx] == PoisonMaskElem)
15960 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15961 else
15962 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15963 }
15964 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15965 assert((!V || GetVF(V) == Mask.size()) &&
15966 "Expected base vector of VF number of elements.");
15967 Prev = Action(Mask, {nullptr, Res.first});
15968 } else if (ShuffleMask.size() == 1) {
15969 // Base is undef and only 1 vector is shuffled - perform the action only for
15970 // a single vector, if the mask is not the identity mask.
15971 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15972 /*ForSingleMask=*/true);
15973 if (Res.second)
15974 // Identity mask is found.
15975 Prev = Res.first;
15976 else
15977 Prev = Action(Mask, {ShuffleMask.begin()->first});
15978 } else {
15979 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15980 // shuffles step by step, combining shuffle between the steps.
15981 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15982 unsigned Vec2VF = GetVF(VMIt->first);
15983 if (Vec1VF == Vec2VF) {
15984 // No need to resize the input vectors since they are of the same size, we
15985 // can shuffle them directly.
15986 ArrayRef<int> SecMask = VMIt->second;
15987 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15988 if (SecMask[I] != PoisonMaskElem) {
15989 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15990 Mask[I] = SecMask[I] + Vec1VF;
15991 }
15992 }
15993 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15994 } else {
15995 // Vectors of different sizes - resize and reshuffle.
15996 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15997 /*ForSingleMask=*/false);
15998 std::pair<T *, bool> Res2 =
15999 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16000 ArrayRef<int> SecMask = VMIt->second;
16001 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16002 if (Mask[I] != PoisonMaskElem) {
16003 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16004 if (Res1.second)
16005 Mask[I] = I;
16006 } else if (SecMask[I] != PoisonMaskElem) {
16007 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16008 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16009 }
16010 }
16011 Prev = Action(Mask, {Res1.first, Res2.first});
16012 }
16013 VMIt = std::next(VMIt);
16014 }
16015 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16016 // Perform requested actions for the remaining masks/vectors.
16017 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16018 // Shuffle other input vectors, if any.
16019 std::pair<T *, bool> Res =
16020 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16021 ArrayRef<int> SecMask = VMIt->second;
16022 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16023 if (SecMask[I] != PoisonMaskElem) {
16024 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16025 "Multiple uses of scalars.");
16026 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16027 } else if (Mask[I] != PoisonMaskElem) {
16028 Mask[I] = I;
16029 }
16030 }
16031 Prev = Action(Mask, {Prev, Res.first});
16032 }
16033 return Prev;
16034}
16035
16036namespace {
16037/// Data type for handling buildvector sequences with the reused scalars from
16038/// other tree entries.
16039template <typename T> struct ShuffledInsertData {
16040 /// List of insertelements to be replaced by shuffles.
16041 SmallVector<InsertElementInst *> InsertElements;
16042 /// The parent vectors and shuffle mask for the given list of inserts.
16043 MapVector<T, SmallVector<int>> ValueMasks;
16044};
16045} // namespace
16046
16047 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
16048 InstructionCost ReductionCost) {
16049 InstructionCost Cost = ReductionCost;
16050 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16051 << VectorizableTree.size() << ".\n");
16052
16053 SmallPtrSet<Value *, 4> CheckedExtracts;
16054 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
16055 TreeEntry &TE = *VectorizableTree[I];
16056 // No need to count the cost for combined entries, they are combined and
16057 // just skip their cost.
16058 if (TE.State == TreeEntry::CombinedVectorize) {
16059 LLVM_DEBUG(
16060 dbgs() << "SLP: Skipping cost for combined node that starts with "
16061 << *TE.Scalars[0] << ".\n";
16062 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16063 continue;
16064 }
16065 if (TE.hasState() &&
16066 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16067 if (const TreeEntry *E =
16068 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16069 E && E->getVectorFactor() == TE.getVectorFactor()) {
16070 // Some gather nodes might be absolutely the same as some vectorizable
16071 // nodes after reordering, need to handle it.
16072 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16073 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16074 << "SLP: Current total cost = " << Cost << "\n");
16075 continue;
16076 }
16077 }
16078
16079 // Exclude cost of gather loads nodes which are not used. These nodes were
16080 // built as part of the final attempt to vectorize gathered loads.
16081 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16082 "Expected gather nodes with users only.");
16083
16084 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16085 Cost += C;
16086 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16087 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16088 << "SLP: Current total cost = " << Cost << "\n");
16089 }
16090
16091 if (Cost >= -SLPCostThreshold &&
16092 none_of(ExternalUses, [](const ExternalUser &EU) {
16093 return isa_and_nonnull<InsertElementInst>(EU.User);
16094 }))
16095 return Cost;
16096
16097 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16098 InstructionCost ExtractCost = 0;
16099 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16100 SmallVector<APInt> DemandedElts;
16101 SmallDenseSet<Value *, 4> UsedInserts;
16102 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
16103 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16104 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
16105 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16106 // Keep track {Scalar, Index, User} tuple.
16107 // On AArch64, this helps in fusing a mov instruction, associated with
16108 // extractelement, with fmul in the backend so that extractelement is free.
16109 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
16110 for (ExternalUser &EU : ExternalUses) {
16111 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16112 }
16113 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16114 for (ExternalUser &EU : ExternalUses) {
16115 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16116 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16117 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16118 else dbgs() << " User: nullptr\n");
16119 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16120
16121 // Uses by ephemeral values are free (because the ephemeral value will be
16122 // removed prior to code generation, and so the extraction will be
16123 // removed as well).
16124 if (EphValues.count(EU.User))
16125 continue;
16126
16127 // Check if the scalar for the given user or all users is accounted already.
16128 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16129 (EU.User &&
16130 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16131 continue;
16132
16133 // Used in unreachable blocks or in EH pads (rarely executed) or is
16134 // terminated with unreachable instruction.
16135 if (BasicBlock *UserParent =
16136 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16137 UserParent &&
16138 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16139 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16140 continue;
16141
16142 // We only add extract cost once for the same scalar.
16143 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16144 !ExtractCostCalculated.insert(EU.Scalar).second)
16145 continue;
16146
16147 // No extract cost for vector "scalar" if REVEC is disabled
16148 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16149 continue;
16150
16151 // If found user is an insertelement, do not calculate extract cost but try
16152 // to detect it as a final shuffled/identity match.
16153 // TODO: what if a user is insertvalue when REVEC is enabled?
16154 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16155 VU && VU->getOperand(1) == EU.Scalar) {
16156 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16157 if (!UsedInserts.insert(VU).second)
16158 continue;
16159 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16160 if (InsertIdx) {
16161 const TreeEntry *ScalarTE = &EU.E;
16162 auto *It = find_if(
16163 ShuffledInserts,
16164 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16165 // Checks if 2 insertelements are from the same buildvector.
16166 InsertElementInst *VecInsert = Data.InsertElements.front();
16167 return areTwoInsertFromSameBuildVector(
16168 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16169 Value *Op0 = II->getOperand(0);
16170 if (isVectorized(II) && !isVectorized(Op0))
16171 return nullptr;
16172 return Op0;
16173 });
16174 });
16175 int VecId = -1;
16176 if (It == ShuffledInserts.end()) {
16177 auto &Data = ShuffledInserts.emplace_back();
16178 Data.InsertElements.emplace_back(VU);
16179 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16180 VecId = ShuffledInserts.size() - 1;
16181 auto It = MinBWs.find(ScalarTE);
16182 if (It != MinBWs.end() &&
16183 VectorCasts
16184 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16185 .second) {
16186 unsigned BWSz = It->second.first;
16187 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16188 unsigned VecOpcode;
16189 if (DstBWSz < BWSz)
16190 VecOpcode = Instruction::Trunc;
16191 else
16192 VecOpcode =
16193 It->second.second ? Instruction::SExt : Instruction::ZExt;
16195 InstructionCost C = TTI->getCastInstrCost(
16196 VecOpcode, FTy,
16197 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16198 FTy->getNumElements()),
16199 TTI::CastContextHint::None, CostKind);
16200 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16201 << " for extending externally used vector with "
16202 "non-equal minimum bitwidth.\n");
16203 Cost += C;
16204 }
16205 } else {
16206 if (isFirstInsertElement(VU, It->InsertElements.front()))
16207 It->InsertElements.front() = VU;
16208 VecId = std::distance(ShuffledInserts.begin(), It);
16209 }
16210 int InIdx = *InsertIdx;
16211 SmallVectorImpl<int> &Mask =
16212 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16213 if (Mask.empty())
16214 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16215 Mask[InIdx] = EU.Lane;
16216 DemandedElts[VecId].setBit(InIdx);
16217 continue;
16218 }
16219 }
16220 }
16221
16223 // If we plan to rewrite the tree in a smaller type, we will need to sign
16224 // extend the extracted value back to the original type. Here, we account
16225 // for the extract and the added cost of the sign extend if needed.
16226 InstructionCost ExtraCost = TTI::TCC_Free;
16227 auto *ScalarTy = EU.Scalar->getType();
16228 const unsigned BundleWidth = EU.E.getVectorFactor();
16229 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16230 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16231 const TreeEntry *Entry = &EU.E;
16232 auto It = MinBWs.find(Entry);
16233 if (It != MinBWs.end()) {
16234 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16235 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16236 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16237 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16238 ? Instruction::ZExt
16239 : Instruction::SExt;
16240 VecTy = getWidenedType(MinTy, BundleWidth);
16241 ExtraCost =
16242 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16243 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16244 << ExtraCost << "\n");
16245 } else {
16246 ExtraCost =
16247 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16248 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16249 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16250 << *VecTy << ": " << ExtraCost << "\n");
16251 }
16252 // Leave the scalar instructions as is if they are cheaper than extracts.
16253 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16254 Entry->getOpcode() == Instruction::Load) {
16255 // Checks if the user of the external scalar is a phi in the loop body.
16256 auto IsPhiInLoop = [&](const ExternalUser &U) {
16257 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16258 auto *I = cast<Instruction>(U.Scalar);
16259 const Loop *L = LI->getLoopFor(Phi->getParent());
16260 return L && (Phi->getParent() == I->getParent() ||
16261 L == LI->getLoopFor(I->getParent()));
16262 }
16263 return false;
16264 };
16265 if (!ValueToExtUses) {
16266 ValueToExtUses.emplace();
16267 for (const auto &P : enumerate(ExternalUses)) {
16268 // Ignore phis in loops.
16269 if (IsPhiInLoop(P.value()))
16270 continue;
16271
16272 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16273 }
16274 }
16275 // Can use original instruction, if no operands vectorized or they are
16276 // marked as externally used already.
16277 auto *Inst = cast<Instruction>(EU.Scalar);
16278 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16279 auto OperandIsScalar = [&](Value *V) {
16280 if (!isVectorized(V)) {
16281 // Some extractelements might be not vectorized, but
16282 // transformed into shuffle and removed from the function,
16283 // consider it here.
16284 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16285 return !EE->hasOneUse() || !MustGather.contains(EE);
16286 return true;
16287 }
16288 return ValueToExtUses->contains(V);
16289 };
16290 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16291 bool CanBeUsedAsScalarCast = false;
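// A cast may also be kept scalar if its source instruction can stay scalar
// and the combined scalar cost still does not exceed the extract cost.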
16292 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16293 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16294 Op && all_of(Op->operands(), OperandIsScalar)) {
16295 InstructionCost OpCost =
16296 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16297 ? TTI->getInstructionCost(Op, CostKind)
16298 : 0;
16299 if (ScalarCost + OpCost <= ExtraCost) {
16300 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16301 ScalarCost += OpCost;
16302 }
16303 }
16304 }
16305 if (CanBeUsedAsScalar) {
16306 bool KeepScalar = ScalarCost <= ExtraCost;
16307 // Try to keep the original scalar if the user is a phi node from the same
16308 // block as the root phis, currently vectorized. This keeps better ordering
16309 // info for the PHIs being vectorized.
16310 bool IsProfitablePHIUser =
16311 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16312 VectorizableTree.front()->Scalars.size() > 2)) &&
16313 VectorizableTree.front()->hasState() &&
16314 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16315 !Inst->hasNUsesOrMore(UsesLimit) &&
16316 none_of(Inst->users(),
16317 [&](User *U) {
16318 auto *PHIUser = dyn_cast<PHINode>(U);
16319 return (!PHIUser ||
16320 PHIUser->getParent() !=
16321 cast<Instruction>(
16322 VectorizableTree.front()->getMainOp())
16323 ->getParent()) &&
16324 !isVectorized(U);
16325 }) &&
16326 count_if(Entry->Scalars, [&](Value *V) {
16327 return ValueToExtUses->contains(V);
16328 }) <= 2;
16329 if (IsProfitablePHIUser) {
16330 KeepScalar = true;
16331 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16332 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16333 (!GatheredLoadsEntriesFirst.has_value() ||
16334 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16335 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16336 return ValueToExtUses->contains(V);
16337 });
16338 auto It = ExtractsCount.find(Entry);
16339 if (It != ExtractsCount.end()) {
16340 assert(ScalarUsesCount >= It->getSecond().size() &&
16341 "Expected total number of external uses not less than "
16342 "number of scalar uses.");
16343 ScalarUsesCount -= It->getSecond().size();
16344 }
16345 // Keep the original scalar if the number of externally used instructions
16346 // in the same entry is not a power of 2. It may help to do some extra
16347 // vectorization for now.
16348 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16349 }
16350 if (KeepScalar) {
16351 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16352 for (Value *V : Inst->operands()) {
16353 auto It = ValueToExtUses->find(V);
16354 if (It != ValueToExtUses->end()) {
16355 // Replace all uses to avoid compiler crash.
16356 ExternalUses[It->second].User = nullptr;
16357 }
16358 }
16359 ExtraCost = ScalarCost;
16360 if (!IsPhiInLoop(EU))
16361 ExtractsCount[Entry].insert(Inst);
16362 if (CanBeUsedAsScalarCast) {
16363 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16364 // Update the users of the operands of the cast operand to avoid
16365 // compiler crash.
16366 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16367 for (Value *V : IOp->operands()) {
16368 auto It = ValueToExtUses->find(V);
16369 if (It != ValueToExtUses->end()) {
16370 // Replace all uses to avoid compiler crash.
16371 ExternalUses[It->second].User = nullptr;
16372 }
16373 }
16374 }
16375 }
16376 }
16377 }
16378 }
16379
16380 ExtractCost += ExtraCost;
16381 }
16382 // Insert external uses for the operands of casts that will be emitted as
16383 // scalars instead of extractelements.
16384 for (Value *V : ScalarOpsFromCasts) {
16385 ExternalUsesAsOriginalScalar.insert(V);
16386 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16387 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16388 TEs.front()->findLaneForValue(V));
16389 }
16390 }
16391 // Add reduced value cost, if resized.
16392 if (!VectorizedVals.empty()) {
16393 const TreeEntry &Root = *VectorizableTree.front();
16394 auto BWIt = MinBWs.find(&Root);
16395 if (BWIt != MinBWs.end()) {
16396 Type *DstTy = Root.Scalars.front()->getType();
16397 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16398 unsigned SrcSz =
16399 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16400 if (OriginalSz != SrcSz) {
16401 unsigned Opcode = Instruction::Trunc;
16402 if (OriginalSz > SrcSz)
16403 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16404 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16405 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16406 assert(SLPReVec && "Only supported by REVEC.");
16407 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16408 }
16409 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16410 TTI::CastContextHint::None,
16411 TTI::TCK_RecipThroughput);
16412 }
16413 }
16414 }
16415
16416 Cost += ExtractCost;
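// The ResizeToVF helper below adjusts an external insertelement user's mask to
// the vector factor of the producing tree entry: when the mask references
// lanes beyond its own size, a single-source permute of the entry is costed;
// otherwise only a non-identity resize mask is costed. It returns the entry
// together with a flag recording that the large-index permute path was taken.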
16417 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16418 bool ForSingleMask) {
16419 InstructionCost C = 0;
16420 unsigned VF = Mask.size();
16421 unsigned VecVF = TE->getVectorFactor();
16422 bool HasLargeIndex =
16423 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16424 if ((VF != VecVF && HasLargeIndex) ||
16425 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16426
16427 if (HasLargeIndex) {
16428 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16429 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16430 OrigMask.begin());
16431 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16432 getWidenedType(TE->getMainOp()->getType(), VecVF),
16433 OrigMask);
16434 LLVM_DEBUG(
16435 dbgs() << "SLP: Adding cost " << C
16436 << " for final shuffle of insertelement external users.\n";
16437 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16438 Cost += C;
16439 return std::make_pair(TE, true);
16440 }
16441
16442 if (!ForSingleMask) {
16443 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16444 for (unsigned I = 0; I < VF; ++I) {
16445 if (Mask[I] != PoisonMaskElem)
16446 ResizeMask[Mask[I]] = Mask[I];
16447 }
16448 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16449 C = ::getShuffleCost(
16450 *TTI, TTI::SK_PermuteSingleSrc,
16451 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16452 LLVM_DEBUG(
16453 dbgs() << "SLP: Adding cost " << C
16454 << " for final shuffle of insertelement external users.\n";
16455 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16456
16457 Cost += C;
16458 }
16459 }
16460 return std::make_pair(TE, false);
16461 };
16462 // Calculate the cost of the reshuffled vectors, if any.
16463 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16464 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16465 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16466 unsigned VF = 0;
16467 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16468 ArrayRef<const TreeEntry *> TEs) {
16469 assert((TEs.size() == 1 || TEs.size() == 2) &&
16470 "Expected exactly 1 or 2 tree entries.");
16471 if (TEs.size() == 1) {
16472 if (VF == 0)
16473 VF = TEs.front()->getVectorFactor();
16474 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16475 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16476 !all_of(enumerate(Mask), [=](const auto &Data) {
16477 return Data.value() == PoisonMaskElem ||
16478 (Data.index() < VF &&
16479 static_cast<int>(Data.index()) == Data.value());
16480 })) {
16481 InstructionCost C =
16482 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16483 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16484 << " for final shuffle of insertelement "
16485 "external users.\n";
16486 TEs.front()->dump();
16487 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16488 Cost += C;
16489 }
16490 } else {
16491 if (VF == 0) {
16492 if (TEs.front() &&
16493 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16494 VF = TEs.front()->getVectorFactor();
16495 else
16496 VF = Mask.size();
16497 }
16498 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16499 InstructionCost C =
16500 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16501 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16502 << " for final shuffle of vector node and external "
16503 "insertelement users.\n";
16504 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16505 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16506 Cost += C;
16507 }
16508 VF = Mask.size();
16509 return TEs.back();
16510 };
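// Combine the per-entry masks for this group of insertelement users via the
// shuffle-action helper below, using ResizeToVF and EstimateShufflesCost to
// accumulate the shuffle cost; the scalarization overhead of the original
// insertelements is then subtracted, since that sequence is expected to be
// replaced by the shuffles costed here.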
16511 (void)performExtractsShuffleAction<const TreeEntry>(
16512 MutableArrayRef(Vector.data(), Vector.size()), Base,
16513 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16514 EstimateShufflesCost);
16515 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16516 cast<FixedVectorType>(
16517 ShuffledInserts[I].InsertElements.front()->getType()),
16518 DemandedElts[I],
16519 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16520 Cost -= InsertCost;
16521 }
16522
16523 // Add the cost for reduced value resize (if required).
16524 if (ReductionBitWidth != 0) {
16525 assert(UserIgnoreList && "Expected reduction tree.");
16526 const TreeEntry &E = *VectorizableTree.front();
16527 auto It = MinBWs.find(&E);
16528 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16529 unsigned SrcSize = It->second.first;
16530 unsigned DstSize = ReductionBitWidth;
16531 unsigned Opcode = Instruction::Trunc;
16532 if (SrcSize < DstSize) {
16533 bool IsArithmeticExtendedReduction =
16534 all_of(*UserIgnoreList, [](Value *V) {
16535 auto *I = cast<Instruction>(V);
16536 return is_contained({Instruction::Add, Instruction::FAdd,
16537 Instruction::Mul, Instruction::FMul,
16538 Instruction::And, Instruction::Or,
16539 Instruction::Xor},
16540 I->getOpcode());
16541 });
16542 if (IsArithmeticExtendedReduction)
16543 Opcode =
16544 Instruction::BitCast; // Handle it by getExtendedReductionCost
16545 else
16546 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16547 }
16548 if (Opcode != Instruction::BitCast) {
16549 auto *SrcVecTy =
16550 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16551 auto *DstVecTy =
16552 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16553 TTI::CastContextHint CCH = getCastContextHint(E);
16554 InstructionCost CastCost;
16555 switch (E.getOpcode()) {
16556 case Instruction::SExt:
16557 case Instruction::ZExt:
16558 case Instruction::Trunc: {
16559 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16560 CCH = getCastContextHint(*OpTE);
16561 break;
16562 }
16563 default:
16564 break;
16565 }
16566 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16567 TTI::TCK_RecipThroughput);
16568 Cost += CastCost;
16569 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16570 << " for final resize for reduction from " << SrcVecTy
16571 << " to " << DstVecTy << "\n";
16572 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16573 }
16574 }
16575 }
16576
16577 std::optional<InstructionCost> SpillCost;
16578 if (Cost < -SLPCostThreshold) {
16579 SpillCost = getSpillCost();
16580 Cost += *SpillCost;
16581 }
16582#ifndef NDEBUG
16583 SmallString<256> Str;
16584 {
16585 raw_svector_ostream OS(Str);
16586 OS << "SLP: Spill Cost = ";
16587 if (SpillCost)
16588 OS << *SpillCost;
16589 else
16590 OS << "<skipped>";
16591 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16592 << "SLP: Total Cost = " << Cost << ".\n";
16593 }
16594 LLVM_DEBUG(dbgs() << Str);
16595 if (ViewSLPTree)
16596 ViewGraph(this, "SLP" + F->getName(), false, Str);
16597#endif
16598
16599 return Cost;
16600}
16601
16602 /// Tries to find extractelement instructions with constant indices from a
16603 /// fixed vector type and gathers such instructions into a bunch, which is
16604 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If the
16605 /// attempt was successful, the matched scalars are replaced by poison values
16606 /// in \p VL for future analysis.
16607std::optional<TTI::ShuffleKind>
16608BoUpSLP::tryToGatherSingleRegisterExtractElements(
16609 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16610 // Scan list of gathered scalars for extractelements that can be represented
16611 // as shuffles.
16612 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16613 SmallVector<int> UndefVectorExtracts;
16614 for (int I = 0, E = VL.size(); I < E; ++I) {
16615 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16616 if (!EI) {
16617 if (isa<UndefValue>(VL[I]))
16618 UndefVectorExtracts.push_back(I);
16619 continue;
16620 }
16621 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16622 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16623 continue;
16624 std::optional<unsigned> Idx = getExtractIndex(EI);
16625 // Undefined index.
16626 if (!Idx) {
16627 UndefVectorExtracts.push_back(I);
16628 continue;
16629 }
16630 if (Idx >= VecTy->getNumElements()) {
16631 UndefVectorExtracts.push_back(I);
16632 continue;
16633 }
16634 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16635 ExtractMask.reset(*Idx);
16636 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16637 UndefVectorExtracts.push_back(I);
16638 continue;
16639 }
16640 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16641 }
16642 // Sort the vector operands by the maximum number of uses in extractelements.
16643 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16644 VectorOpToIdx.takeVector();
16645 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16646 return P1.second.size() > P2.second.size();
16647 });
16648 // Find the best pair of the vectors or a single vector.
16649 const int UndefSz = UndefVectorExtracts.size();
16650 unsigned SingleMax = 0;
16651 unsigned PairMax = 0;
16652 if (!Vectors.empty()) {
16653 SingleMax = Vectors.front().second.size() + UndefSz;
16654 if (Vectors.size() > 1) {
16655 auto *ItNext = std::next(Vectors.begin());
16656 PairMax = SingleMax + ItNext->second.size();
16657 }
16658 }
16659 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16660 return std::nullopt;
16661 // Check whether it is better to perform a shuffle of 2 vectors or just of
16662 // a single vector.
16663 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16664 SmallVector<Value *> GatheredExtracts(
16665 VL.size(), PoisonValue::get(VL.front()->getType()));
16666 if (SingleMax >= PairMax && SingleMax) {
16667 for (int Idx : Vectors.front().second)
16668 std::swap(GatheredExtracts[Idx], VL[Idx]);
16669 } else if (!Vectors.empty()) {
16670 for (unsigned Idx : {0, 1})
16671 for (int Idx : Vectors[Idx].second)
16672 std::swap(GatheredExtracts[Idx], VL[Idx]);
16673 }
16674 // Add extracts from undefs too.
16675 for (int Idx : UndefVectorExtracts)
16676 std::swap(GatheredExtracts[Idx], VL[Idx]);
16677 // Check that gather of extractelements can be represented as just a
16678 // shuffle of a single/two vectors the scalars are extracted from.
16679 std::optional<TTI::ShuffleKind> Res =
16680 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16681 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16682 // TODO: try to check other subsets if possible.
16683 // Restore the original VL if attempt was not successful.
16684 copy(SavedVL, VL.begin());
16685 return std::nullopt;
16686 }
16687 // Restore unused scalars from mask, if some of the extractelements were not
16688 // selected for shuffle.
16689 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16690 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16691 isa<UndefValue>(GatheredExtracts[I])) {
16692 std::swap(VL[I], GatheredExtracts[I]);
16693 continue;
16694 }
16695 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16696 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16697 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16698 is_contained(UndefVectorExtracts, I))
16699 continue;
16700 }
16701 return Res;
16702}
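// Example (illustrative): a gather of
//   extractelement <4 x float> %v, i32 0
//   extractelement <4 x float> %v, i32 2
// plus poison lanes is recognized as a single-source shuffle of %v with
// Mask = {0, 2, PoisonMaskElem, PoisonMaskElem}; the matched extracts are
// replaced by poison in VL for further analysis.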
16703
16704 /// Tries to find extractelement instructions with constant indices from a
16705 /// fixed vector type and gathers such instructions into a bunch, which is
16706 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If the
16707 /// attempt was successful, the matched scalars are replaced by poison values
16708 /// in \p VL for future analysis.
16709 SmallVector<std::optional<TTI::ShuffleKind>>
16710 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16711 SmallVectorImpl<int> &Mask,
16712 unsigned NumParts) const {
16713 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16714 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16715 Mask.assign(VL.size(), PoisonMaskElem);
16716 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16717 for (unsigned Part : seq<unsigned>(NumParts)) {
16718 // Scan list of gathered scalars for extractelements that can be represented
16719 // as shuffles.
16720 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16721 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16722 SmallVector<int> SubMask;
16723 std::optional<TTI::ShuffleKind> Res =
16724 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16725 ShufflesRes[Part] = Res;
16726 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16727 }
16728 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16729 return Res.has_value();
16730 }))
16731 ShufflesRes.clear();
16732 return ShufflesRes;
16733}
16734
16735std::optional<TargetTransformInfo::ShuffleKind>
16736BoUpSLP::isGatherShuffledSingleRegisterEntry(
16737 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16738 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16739 Entries.clear();
16740 // TODO: currently checking only for Scalars in the tree entry, need to count
16741 // reused elements too for better cost estimation.
16742 auto GetUserEntry = [&](const TreeEntry *TE) {
16743 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16744 TE = TE->UserTreeIndex.UserTE;
16745 if (TE == VectorizableTree.front().get())
16746 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16747 return TE->UserTreeIndex;
16748 };
16749 auto HasGatherUser = [&](const TreeEntry *TE) {
16750 while (TE->Idx != 0 && TE->UserTreeIndex) {
16751 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16752 return true;
16753 TE = TE->UserTreeIndex.UserTE;
16754 }
16755 return false;
16756 };
16757 const EdgeInfo TEUseEI = GetUserEntry(TE);
16758 if (!TEUseEI)
16759 return std::nullopt;
16760 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16761 const BasicBlock *TEInsertBlock = nullptr;
16762 // Main node of PHI entries keeps the correct order of operands/incoming
16763 // blocks.
16764 if (auto *PHI = dyn_cast_or_null<PHINode>(
16765 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16766 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16767 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16768 TEInsertPt = TEInsertBlock->getTerminator();
16769 } else {
16770 TEInsertBlock = TEInsertPt->getParent();
16771 }
16772 if (!DT->isReachableFromEntry(TEInsertBlock))
16773 return std::nullopt;
16774 auto *NodeUI = DT->getNode(TEInsertBlock);
16775 assert(NodeUI && "Should only process reachable instructions");
16776 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16777 auto CheckOrdering = [&](const Instruction *InsertPt) {
16778 // Argument InsertPt is an instruction where vector code for some other
16779 // tree entry (one that shares one or more scalars with TE) is going to be
16780 // generated. This lambda returns true if insertion point of vector code
16781 // for the TE dominates that point (otherwise dependency is the other way
16782 // around). The other node is not limited to be of a gather kind. Gather
16783 // nodes are not scheduled and their vector code is inserted before their
16784 // first user. If user is PHI, that is supposed to be at the end of a
16785 // predecessor block. Otherwise it is the last instruction among scalars of
16786 // the user node. So, instead of checking dependency between instructions
16787 // themselves, we check dependency between their insertion points for vector
16788 // code (since each scalar instruction ends up as a lane of a vector
16789 // instruction).
16790 const BasicBlock *InsertBlock = InsertPt->getParent();
16791 auto *NodeEUI = DT->getNode(InsertBlock);
16792 if (!NodeEUI)
16793 return false;
16794 assert((NodeUI == NodeEUI) ==
16795 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16796 "Different nodes should have different DFS numbers");
16797 // Check the order of the gather nodes users.
16798 if (TEInsertPt->getParent() != InsertBlock &&
16799 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16800 return false;
16801 if (TEInsertPt->getParent() == InsertBlock &&
16802 TEInsertPt->comesBefore(InsertPt))
16803 return false;
16804 return true;
16805 };
16806 // Find all tree entries used by the gathered values. If no common entries
16807 // found - not a shuffle.
16808 // Here we build a set of tree nodes for each gathered value and try to
16809 // find the intersection between these sets. If we have at least one common
16810 // tree node for each gathered value - we have just a permutation of a
16811 // single vector. If we have 2 different sets, we are in a situation where
16812 // we have a permutation of 2 input vectors.
16813 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16814 SmallDenseMap<Value *, int> UsedValuesEntry;
16815 SmallPtrSet<const Value *, 16> VisitedValue;
16816 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16817 // The node is reused - exit.
16818 if ((TEPtr->getVectorFactor() != VL.size() &&
16819 TEPtr->Scalars.size() != VL.size()) ||
16820 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16821 return false;
16822 UsedTEs.clear();
16823 UsedTEs.emplace_back().insert(TEPtr);
16824 for (Value *V : VL) {
16825 if (isConstant(V))
16826 continue;
16827 UsedValuesEntry.try_emplace(V, 0);
16828 }
16829 return true;
16830 };
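// CheckParentNodes walks the user chains of both entries up to a common
// ancestor and reports whether User1 reaches that ancestor through a smaller
// operand (edge) index than User2, i.e. User1 sits under an earlier operand of
// the shared user node.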
16831 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16832 unsigned EdgeIdx) {
16833 const TreeEntry *Ptr1 = User1;
16834 const TreeEntry *Ptr2 = User2;
16835 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16836 while (Ptr2) {
16837 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16838 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16839 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16840 }
16841 while (Ptr1) {
16842 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16843 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16844 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16845 return Idx < It->second;
16846 }
16847 return false;
16848 };
16849 for (Value *V : VL) {
16850 if (isConstant(V) || !VisitedValue.insert(V).second)
16851 continue;
16852 // Build a list of tree entries where V is used.
16853 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16854 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16855 if (TEPtr == TE || TEPtr->Idx == 0)
16856 continue;
16857 assert(any_of(TEPtr->Scalars,
16858 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16859 "Must contain at least single gathered value.");
16860 assert(TEPtr->UserTreeIndex &&
16861 "Expected only single user of a gather node.");
16862 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16863
16864 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16865 UseEI.UserTE->hasState())
16866 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16867 : nullptr;
16868 Instruction *InsertPt =
16869 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16870 : &getLastInstructionInBundle(UseEI.UserTE);
16871 if (TEInsertPt == InsertPt) {
16872 // Check nodes, which might be emitted first.
16873 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16874 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16875 TEUseEI.UserTE->isAltShuffle()) &&
16876 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16877 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16878 (UseEI.UserTE->hasState() &&
16879 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16880 !UseEI.UserTE->isAltShuffle()) ||
16881 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16882 continue;
16883 }
16884
16885 // If the schedulable insertion point is used in multiple entries - just
16886 // exit, no known ordering at this point, available only after real
16887 // scheduling.
16888 if (!doesNotNeedToBeScheduled(InsertPt) &&
16889 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16890 continue;
16891 // If the users are the PHI nodes with the same incoming blocks - skip.
16892 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16893 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16894 UseEI.UserTE->State == TreeEntry::Vectorize &&
16895 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16896 TEUseEI.UserTE != UseEI.UserTE)
16897 continue;
16898 // If 2 gathers are operands of the same entry (regardless of whether
16899 // user is PHI or else), compare operands indices, use the earlier one
16900 // as the base.
16901 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16902 continue;
16903 // If the user instruction is used for some reason in different
16904 // vectorized nodes - make it depend on index.
16905 if (TEUseEI.UserTE != UseEI.UserTE &&
16906 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16907 HasGatherUser(TEUseEI.UserTE)))
16908 continue;
16909 // If the user node is the operand of the other user node - skip.
16910 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16911 continue;
16912 }
16913
16914 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16915 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16916 UseEI.UserTE->doesNotNeedToSchedule() &&
16917 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16918 continue;
16919 // Check if the user node of the TE comes after user node of TEPtr,
16920 // otherwise TEPtr depends on TE.
16921 if ((TEInsertBlock != InsertPt->getParent() ||
16922 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16923 (!CheckOrdering(InsertPt) ||
16924 (UseEI.UserTE->hasCopyableElements() &&
16925 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
16926 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
16927 continue;
16928 // The node is reused - exit.
16929 if (CheckAndUseSameNode(TEPtr))
16930 break;
16931 VToTEs.insert(TEPtr);
16932 }
16933 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16934 const auto *It = find_if(
16935 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16936 if (It != VTEs.end()) {
16937 const TreeEntry *VTE = *It;
16938 if (none_of(TE->CombinedEntriesWithIndices,
16939 [&](const auto &P) { return P.first == VTE->Idx; })) {
16940 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16941 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16942 continue;
16943 }
16944 // The node is reused - exit.
16945 if (CheckAndUseSameNode(VTE))
16946 break;
16947 VToTEs.insert(VTE);
16948 }
16949 }
16950 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16951 const TreeEntry *VTE = VTEs.front();
16952 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16953 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16954 VTEs = VTEs.drop_front();
16955 // Iterate through all vectorized nodes.
16956 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16957 return MTE->State == TreeEntry::Vectorize;
16958 });
16959 if (MIt == VTEs.end())
16960 continue;
16961 VTE = *MIt;
16962 }
16963 if (none_of(TE->CombinedEntriesWithIndices,
16964 [&](const auto &P) { return P.first == VTE->Idx; })) {
16965 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16966 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16967 continue;
16968 }
16969 // The node is reused - exit.
16970 if (CheckAndUseSameNode(VTE))
16971 break;
16972 VToTEs.insert(VTE);
16973 }
16974 if (VToTEs.empty())
16975 continue;
16976 if (UsedTEs.empty()) {
16977 // The first iteration, just insert the list of nodes to vector.
16978 UsedTEs.push_back(VToTEs);
16979 UsedValuesEntry.try_emplace(V, 0);
16980 } else {
16981 // Need to check if there are any previously used tree nodes which use V.
16982 // If there are no such nodes, consider that we have another input
16983 // vector.
16984 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16985 unsigned Idx = 0;
16986 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16987 // Do we have a non-empty intersection of previously listed tree entries
16988 // and tree entries using current V?
16989 set_intersect(VToTEs, Set);
16990 if (!VToTEs.empty()) {
16991 // Yes, write the new subset and continue analysis for the next
16992 // scalar.
16993 Set.swap(VToTEs);
16994 break;
16995 }
16996 VToTEs = SavedVToTEs;
16997 ++Idx;
16998 }
16999 // No non-empty intersection found - need to add a second set of possible
17000 // source vectors.
17001 if (Idx == UsedTEs.size()) {
17002 // If the number of input vectors is greater than 2 - not a permutation,
17003 // fallback to the regular gather.
17004 // TODO: support multiple reshuffled nodes.
17005 if (UsedTEs.size() == 2)
17006 continue;
17007 UsedTEs.push_back(SavedVToTEs);
17008 Idx = UsedTEs.size() - 1;
17009 }
17010 UsedValuesEntry.try_emplace(V, Idx);
17011 }
17012 }
17013
17014 if (UsedTEs.empty()) {
17015 Entries.clear();
17016 return std::nullopt;
17017 }
17018
17019 unsigned VF = 0;
17020 if (UsedTEs.size() == 1) {
17021 // Keep the order to avoid non-determinism.
17022 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17023 UsedTEs.front().end());
17024 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17025 return TE1->Idx < TE2->Idx;
17026 });
17027 // Try to find the perfect match in another gather node at first.
17028 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17029 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17030 });
17031 if (It != FirstEntries.end() &&
17032 ((*It)->getVectorFactor() == VL.size() ||
17033 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17034 TE->ReuseShuffleIndices.size() == VL.size() &&
17035 (*It)->isSame(TE->Scalars)))) {
17036 Entries.push_back(*It);
17037 if ((*It)->getVectorFactor() == VL.size()) {
17038 std::iota(std::next(Mask.begin(), Part * VL.size()),
17039 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17040 } else {
17041 SmallVector<int> CommonMask = TE->getCommonMask();
17042 copy(CommonMask, Mask.begin());
17043 }
17044 // Clear undef scalars.
17045 for (unsigned I : seq<unsigned>(VL.size()))
17046 if (isa<PoisonValue>(VL[I]))
17047 Mask[Part * VL.size() + I] = PoisonMaskElem;
17048 return TargetTransformInfo::SK_PermuteSingleSrc;
17049 }
17050 // No perfect match, just shuffle, so choose the first tree node from the
17051 // tree.
17052 Entries.push_back(FirstEntries.front());
17053 // Update mapping between values and corresponding tree entries.
17054 for (auto &P : UsedValuesEntry)
17055 P.second = 0;
17056 VF = FirstEntries.front()->getVectorFactor();
17057 } else {
17058 // Try to find nodes with the same vector factor.
17059 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17060 // Keep the order of tree nodes to avoid non-determinism.
17061 DenseMap<int, const TreeEntry *> VFToTE;
17062 for (const TreeEntry *TE : UsedTEs.front()) {
17063 unsigned VF = TE->getVectorFactor();
17064 auto It = VFToTE.find(VF);
17065 if (It != VFToTE.end()) {
17066 if (It->second->Idx > TE->Idx)
17067 It->getSecond() = TE;
17068 continue;
17069 }
17070 VFToTE.try_emplace(VF, TE);
17071 }
17072 // Same, keep the order to avoid non-determinism.
17073 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17074 UsedTEs.back().end());
17075 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17076 return TE1->Idx < TE2->Idx;
17077 });
17078 for (const TreeEntry *TE : SecondEntries) {
17079 auto It = VFToTE.find(TE->getVectorFactor());
17080 if (It != VFToTE.end()) {
17081 VF = It->first;
17082 Entries.push_back(It->second);
17083 Entries.push_back(TE);
17084 break;
17085 }
17086 }
17087 // No 2 source vectors with the same vector factor - just choose 2 with max
17088 // index.
17089 if (Entries.empty()) {
17090 Entries.push_back(*llvm::max_element(
17091 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17092 return TE1->Idx < TE2->Idx;
17093 }));
17094 Entries.push_back(SecondEntries.front());
17095 VF = std::max(Entries.front()->getVectorFactor(),
17096 Entries.back()->getVectorFactor());
17097 } else {
17098 VF = Entries.front()->getVectorFactor();
17099 }
17100 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17101 for (const TreeEntry *E : Entries)
17102 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17103 E->Scalars.end());
17104 // Update mapping between values and corresponding tree entries.
17105 for (auto &P : UsedValuesEntry) {
17106 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17107 if (ValuesToEntries[Idx].contains(P.first)) {
17108 P.second = Idx;
17109 break;
17110 }
17111 }
17112 }
17113
17114 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17115 // Checks if the 2 PHIs are compatible in terms of high possibility to be
17116 // vectorized.
17117 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17118 auto *PHI = cast<PHINode>(V);
17119 auto *PHI1 = cast<PHINode>(V1);
17120 // Check that all incoming values are compatible/from same parent (if they
17121 // are instructions).
17122 // The incoming values are compatible if they all are constants, or
17123 // instruction with the same/alternate opcodes from the same basic block.
17124 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17125 Value *In = PHI->getIncomingValue(I);
17126 Value *In1 = PHI1->getIncomingValue(I);
17127 if (isConstant(In) && isConstant(In1))
17128 continue;
17129 if (!getSameOpcode({In, In1}, *TLI))
17130 return false;
17131 if (cast<Instruction>(In)->getParent() !=
17132 cast<Instruction>(In1)->getParent())
17133 return false;
17134 }
17135 return true;
17136 };
17137 // Check if the value can be ignored during analysis for shuffled gathers.
17138 // We suppose it is better to ignore instructions which do not form splats,
17139 // are not vectorized/not extractelements (these instructions will be handled
17140 // by extractelement processing) or may form a vector node in the future.
17141 auto MightBeIgnored = [=](Value *V) {
17142 auto *I = dyn_cast<Instruction>(V);
17143 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17144 !isVectorLikeInstWithConstOps(I) &&
17145 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17146 };
17147 // Check that the neighbor instruction may form a full vector node with the
17148 // current instruction V. It is possible, if they have same/alternate opcode
17149 // and same parent basic block.
17150 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17151 Value *V1 = VL[Idx];
17152 bool UsedInSameVTE = false;
17153 auto It = UsedValuesEntry.find(V1);
17154 if (It != UsedValuesEntry.end())
17155 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17156 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17157 getSameOpcode({V, V1}, *TLI) &&
17158 cast<Instruction>(V)->getParent() ==
17159 cast<Instruction>(V1)->getParent() &&
17160 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17161 };
17162 // Build a shuffle mask for better cost estimation and vector emission.
17163 SmallBitVector UsedIdxs(Entries.size());
17164 SmallVector<std::pair<unsigned, int>> EntryLanes;
17165 for (int I = 0, E = VL.size(); I < E; ++I) {
17166 Value *V = VL[I];
17167 auto It = UsedValuesEntry.find(V);
17168 if (It == UsedValuesEntry.end())
17169 continue;
17170 // Do not try to shuffle scalars, if they are constants, or instructions
17171 // that can be vectorized as a result of the following vector build
17172 // vectorization.
17173 if (isConstant(V) || (MightBeIgnored(V) &&
17174 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17175 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17176 continue;
17177 unsigned Idx = It->second;
17178 EntryLanes.emplace_back(Idx, I);
17179 UsedIdxs.set(Idx);
17180 }
17181 // Iterate through all shuffled scalars and select entries, which can be used
17182 // for final shuffle.
17183 SmallVector<const TreeEntry *> TempEntries;
17184 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17185 if (!UsedIdxs.test(I))
17186 continue;
17187 // Fix the entry number for the given scalar. If it is the first entry, set
17188 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17189 // These indices are used when calculating final shuffle mask as the vector
17190 // offset.
17191 for (std::pair<unsigned, int> &Pair : EntryLanes)
17192 if (Pair.first == I)
17193 Pair.first = TempEntries.size();
17194 TempEntries.push_back(Entries[I]);
17195 }
17196 Entries.swap(TempEntries);
17197 if (EntryLanes.size() == Entries.size() &&
17198 !VL.equals(ArrayRef(TE->Scalars)
17199 .slice(Part * VL.size(),
17200 std::min<int>(VL.size(), TE->Scalars.size())))) {
17201 // We may have here 1 or 2 entries only. If the number of scalars is equal
17202 // to the number of entries, no need to do the analysis, it is not very
17203 // profitable. Since VL is not the same as TE->Scalars, it means we already
17204 // have some shuffles before. Cut off not profitable case.
17205 Entries.clear();
17206 return std::nullopt;
17207 }
17208 // Build the final mask, check for the identity shuffle, if possible.
17209 bool IsIdentity = Entries.size() == 1;
17210 // Pair.first is the offset to the vector, while Pair.second is the index of
17211 // scalar in the list.
17212 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17213 unsigned Idx = Part * VL.size() + Pair.second;
17214 Mask[Idx] =
17215 Pair.first * VF +
17216 (ForOrder ? std::distance(
17217 Entries[Pair.first]->Scalars.begin(),
17218 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17219 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17220 IsIdentity &= Mask[Idx] == Pair.second;
17221 }
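// At this point Mask encodes, for every selected lane, "entry offset * VF +
// lane within that entry": e.g. with two entries of VF = 4, a mask value of 5
// addresses lane 1 of the second entry.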
17222 if (ForOrder || IsIdentity || Entries.empty()) {
17223 switch (Entries.size()) {
17224 case 1:
17225 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17226 return TargetTransformInfo::SK_PermuteSingleSrc;
17227 break;
17228 case 2:
17229 if (EntryLanes.size() > 2 || VL.size() <= 2)
17230 return TargetTransformInfo::SK_PermuteTwoSrc;
17231 break;
17232 default:
17233 break;
17234 }
17235 } else if (!isa<VectorType>(VL.front()->getType()) &&
17236 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17237 // Do the cost estimation if the shuffle is more beneficial than a buildvector.
17238 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17239 std::next(Mask.begin(), (Part + 1) * VL.size()));
17240 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17241 for (int Idx : SubMask) {
17242 if (Idx == PoisonMaskElem)
17243 continue;
17244 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17245 MinElement = Idx;
17246 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17247 MaxElement = Idx;
17248 }
17249 assert(MaxElement >= 0 && MinElement >= 0 &&
17250 MaxElement % VF >= MinElement % VF &&
17251 "Expected at least single element.");
17252 unsigned NewVF = std::max<unsigned>(
17253 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17254 (MaxElement % VF) -
17255 (MinElement % VF) + 1));
17256 if (NewVF < VF) {
17257 for (int &Idx : SubMask) {
17258 if (Idx == PoisonMaskElem)
17259 continue;
17260 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17261 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17262 }
17263 } else {
17264 NewVF = VF;
17265 }
17266
17267 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17268 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17269 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17270 auto GetShuffleCost = [&,
17271 &TTI = *TTI](ArrayRef<int> Mask,
17272 ArrayRef<const TreeEntry *> Entries,
17273 VectorType *VecTy) -> InstructionCost {
17274 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17275 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17276 Mask, Entries.front()->getInterleaveFactor()))
17277 return TTI::TCC_Free;
17278 return ::getShuffleCost(TTI,
17279 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17280 : TTI::SK_PermuteSingleSrc,
17281 VecTy, Mask, CostKind);
17282 };
17283 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17284 InstructionCost FirstShuffleCost = 0;
17285 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17286 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17287 FirstShuffleCost = ShuffleCost;
17288 } else {
17289 // Transform the mask to include only the first entry.
17290 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17291 bool IsIdentity = true;
17292 for (auto [I, Idx] : enumerate(FirstMask)) {
17293 if (Idx >= static_cast<int>(NewVF)) {
17294 Idx = PoisonMaskElem;
17295 } else {
17296 DemandedElts.clearBit(I);
17297 if (Idx != PoisonMaskElem)
17298 IsIdentity &= static_cast<int>(I) == Idx;
17299 }
17300 }
17301 if (!IsIdentity)
17302 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17303 FirstShuffleCost += getScalarizationOverhead(
17304 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17305 /*Extract=*/false, CostKind);
17306 }
17307 InstructionCost SecondShuffleCost = 0;
17308 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17309 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17310 SecondShuffleCost = ShuffleCost;
17311 } else {
17312 // Transform the mask to include only the second entry.
17313 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17314 bool IsIdentity = true;
17315 for (auto [I, Idx] : enumerate(SecondMask)) {
17316 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17317 Idx = PoisonMaskElem;
17318 } else {
17319 DemandedElts.clearBit(I);
17320 if (Idx != PoisonMaskElem) {
17321 Idx -= NewVF;
17322 IsIdentity &= static_cast<int>(I) == Idx;
17323 }
17324 }
17325 }
17326 if (!IsIdentity)
17327 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17328 SecondShuffleCost += getScalarizationOverhead(
17329 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17330 /*Extract=*/false, CostKind);
17331 }
17332 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17333 for (auto [I, Idx] : enumerate(SubMask))
17334 if (Idx == PoisonMaskElem)
17335 DemandedElts.clearBit(I);
17336 InstructionCost BuildVectorCost = getScalarizationOverhead(
17337 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17338 /*Extract=*/false, CostKind);
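// Pick the cheapest of: shuffling only the first entry (gathering the rest),
// shuffling only the second entry, the original one/two-source shuffle, or a
// plain buildvector of the demanded scalars; Mask and Entries are narrowed
// below to match the winning alternative.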
17339 const TreeEntry *BestEntry = nullptr;
17340 if (FirstShuffleCost < ShuffleCost) {
17341 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17342 std::next(Mask.begin(), (Part + 1) * VL.size()),
17343 [&](int &Idx) {
17344 if (Idx >= static_cast<int>(VF))
17345 Idx = PoisonMaskElem;
17346 });
17347 BestEntry = Entries.front();
17348 ShuffleCost = FirstShuffleCost;
17349 }
17350 if (SecondShuffleCost < ShuffleCost) {
17351 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17352 std::next(Mask.begin(), (Part + 1) * VL.size()),
17353 [&](int &Idx) {
17354 if (Idx < static_cast<int>(VF))
17355 Idx = PoisonMaskElem;
17356 else
17357 Idx -= VF;
17358 });
17359 BestEntry = Entries[1];
17360 ShuffleCost = SecondShuffleCost;
17361 }
17362 if (BuildVectorCost >= ShuffleCost) {
17363 if (BestEntry) {
17364 Entries.clear();
17365 Entries.push_back(BestEntry);
17366 }
17367 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17368 : TargetTransformInfo::SK_PermuteSingleSrc;
17369 }
17370 }
17371 Entries.clear();
17372 // Clear the corresponding mask elements.
17373 std::fill(std::next(Mask.begin(), Part * VL.size()),
17374 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17375 return std::nullopt;
17376}
17377
17378 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
17379 BoUpSLP::isGatherShuffledEntry(
17380 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17381 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17382 bool ForOrder) {
17383 assert(NumParts > 0 && NumParts < VL.size() &&
17384 "Expected positive number of registers.");
17385 Entries.clear();
17386 // No need to check for the topmost gather node.
17387 if (TE == VectorizableTree.front().get() &&
17388 (!GatheredLoadsEntriesFirst.has_value() ||
17389 none_of(ArrayRef(VectorizableTree).drop_front(),
17390 [](const std::unique_ptr<TreeEntry> &TE) {
17391 return !TE->isGather();
17392 })))
17393 return {};
17394 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17395 // implemented yet.
17396 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17397 return {};
17398 Mask.assign(VL.size(), PoisonMaskElem);
17399 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17400 "Expected only single user of the gather node.");
17401 assert(VL.size() % NumParts == 0 &&
17402 "Number of scalars must be divisible by NumParts.");
17403 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17404 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17405 (TE->Idx == 0 ||
17406 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17407 isSplat(TE->Scalars) ||
17408 (TE->hasState() &&
17409 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17410 return {};
17411 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
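// Each register-sized slice of VL is matched independently against the
// existing tree entries; a single-part perfect match covering the whole
// vector short-circuits the loop and returns an identity shuffle of that
// entry (with poison lanes cleared).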
17412 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17413 for (unsigned Part : seq<unsigned>(NumParts)) {
17414 ArrayRef<Value *> SubVL =
17415 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17416 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17417 std::optional<TTI::ShuffleKind> SubRes =
17418 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17419 ForOrder);
17420 if (!SubRes)
17421 SubEntries.clear();
17422 Res.push_back(SubRes);
17423 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17424 SubEntries.front()->getVectorFactor() == VL.size() &&
17425 (SubEntries.front()->isSame(TE->Scalars) ||
17426 SubEntries.front()->isSame(VL))) {
17427 SmallVector<const TreeEntry *> LocalSubEntries;
17428 LocalSubEntries.swap(SubEntries);
17429 Entries.clear();
17430 Res.clear();
17431 std::iota(Mask.begin(), Mask.end(), 0);
17432 // Clear undef scalars.
17433 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17434 if (isa<PoisonValue>(VL[I]))
17435 Mask[I] = PoisonMaskElem;
17436 Entries.emplace_back(1, LocalSubEntries.front());
17437 Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
17438 return Res;
17439 }
17440 }
17441 if (all_of(Res,
17442 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17443 Entries.clear();
17444 return {};
17445 }
17446 return Res;
17447}
17448
17449InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17450 Type *ScalarTy) const {
17451 const unsigned VF = VL.size();
17452 auto *VecTy = getWidenedType(ScalarTy, VF);
17453 // Find the cost of inserting/extracting values from the vector.
17454 // Check if the same elements are inserted several times and count them as
17455 // shuffle candidates.
17456 APInt DemandedElements = APInt::getZero(VF);
17457 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17458 InstructionCost Cost;
17459 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17460 DemandedElements.setBit(I);
17461 if (V->getType() != ScalarTy)
17462 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17463 TTI::CastContextHint::None, CostKind);
17464 };
17465 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17466 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
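// ConstantShuffleMask starts as an identity over the gathered vector; lanes
// holding non-undef constants are remapped to I + VF below so that a final
// two-source shuffle can blend the materialized constant vector with the
// inserted non-constant scalars.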
17467 for (auto [I, V] : enumerate(VL)) {
17468 // No need to shuffle duplicates for constants.
17469 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17470 continue;
17471
17472 if (isConstant(V)) {
17473 ConstantShuffleMask[I] = I + VF;
17474 continue;
17475 }
17476 EstimateInsertCost(I, V);
17477 }
17478 // FIXME: add a cost for constant vector materialization.
17479 bool IsAnyNonUndefConst =
17480 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17481 // 1. Shuffle input source vector and constant vector.
17482 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17483 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
17484 ConstantShuffleMask);
17485 }
17486
17487 // 2. Insert unique non-constants.
17488 if (!DemandedElements.isZero())
17489 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17490 /*Insert=*/true,
17491 /*Extract=*/false, CostKind,
17492 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17493 return Cost;
17494}
17495
17496Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17497 auto It = EntryToLastInstruction.find(E);
17498 if (It != EntryToLastInstruction.end())
17499 return *cast<Instruction>(It->second);
17500 Instruction *Res = nullptr;
17501 // Get the basic block this bundle is in. All instructions in the bundle
17502 // should be in this block (except for extractelement-like instructions with
17503 // constant indices or gathered loads or copyables).
17504 Instruction *Front;
17505 unsigned Opcode;
17506 if (E->hasState()) {
17507 Front = E->getMainOp();
17508 Opcode = E->getOpcode();
17509 } else {
17510 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17511 Opcode = Front->getOpcode();
17512 }
17513 auto *BB = Front->getParent();
17514 assert(
17515 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17516 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17517 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17518 all_of(E->Scalars,
17519 [=](Value *V) -> bool {
17520 if (Opcode == Instruction::GetElementPtr &&
17521 !isa<GetElementPtrInst>(V))
17522 return true;
17523 auto *I = dyn_cast<Instruction>(V);
17524 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17525 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17526 })) &&
17527 "Expected gathered loads or GEPs or instructions from same basic "
17528 "block.");
17529
17530 auto FindLastInst = [&]() {
17531 Instruction *LastInst = Front;
17532 for (Value *V : E->Scalars) {
17533 auto *I = dyn_cast<Instruction>(V);
17534 if (!I)
17535 continue;
17536 if (E->isCopyableElement(I))
17537 continue;
17538 if (LastInst->getParent() == I->getParent()) {
17539 if (LastInst->comesBefore(I))
17540 LastInst = I;
17541 continue;
17542 }
17543 assert(((Opcode == Instruction::GetElementPtr &&
17544 !isa<GetElementPtrInst>(I)) ||
17545 E->State == TreeEntry::SplitVectorize ||
17546 (isVectorLikeInstWithConstOps(LastInst) &&
17547 isVectorLikeInstWithConstOps(I)) ||
17548 (GatheredLoadsEntriesFirst.has_value() &&
17549 Opcode == Instruction::Load && E->isGather() &&
17550 E->Idx < *GatheredLoadsEntriesFirst)) &&
17551 "Expected vector-like or non-GEP in GEP node insts only.");
17552 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17553 LastInst = I;
17554 continue;
17555 }
17556 if (!DT->isReachableFromEntry(I->getParent()))
17557 continue;
17558 auto *NodeA = DT->getNode(LastInst->getParent());
17559 auto *NodeB = DT->getNode(I->getParent());
17560 assert(NodeA && "Should only process reachable instructions");
17561 assert(NodeB && "Should only process reachable instructions");
17562 assert((NodeA == NodeB) ==
17563 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17564 "Different nodes should have different DFS numbers");
17565 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17566 LastInst = I;
17567 }
17568 BB = LastInst->getParent();
17569 return LastInst;
17570 };
17571
17572 auto FindFirstInst = [&]() {
17573 Instruction *FirstInst = Front;
17574 for (Value *V : E->Scalars) {
17575 auto *I = dyn_cast<Instruction>(V);
17576 if (!I)
17577 continue;
17578 if (E->isCopyableElement(I))
17579 continue;
17580 if (FirstInst->getParent() == I->getParent()) {
17581 if (I->comesBefore(FirstInst))
17582 FirstInst = I;
17583 continue;
17584 }
17585 assert(((Opcode == Instruction::GetElementPtr &&
17586 !isa<GetElementPtrInst>(I)) ||
17587 (isVectorLikeInstWithConstOps(FirstInst) &&
17589 "Expected vector-like or non-GEP in GEP node insts only.");
17590 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17591 FirstInst = I;
17592 continue;
17593 }
17594 if (!DT->isReachableFromEntry(I->getParent()))
17595 continue;
17596 auto *NodeA = DT->getNode(FirstInst->getParent());
17597 auto *NodeB = DT->getNode(I->getParent());
17598 assert(NodeA && "Should only process reachable instructions");
17599 assert(NodeB && "Should only process reachable instructions");
17600 assert((NodeA == NodeB) ==
17601 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17602 "Different nodes should have different DFS numbers");
17603 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17604 FirstInst = I;
17605 }
17606 return FirstInst;
17607 };
17608
17609 if (E->State == TreeEntry::SplitVectorize) {
17610 Res = FindLastInst();
17611 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17612 for (auto *E : Entries) {
17613 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17614 if (!I)
17615 I = &getLastInstructionInBundle(E);
17616 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17617 Res = I;
17618 }
17619 }
17620 EntryToLastInstruction.try_emplace(E, Res);
17621 return *Res;
17622 }
17623
17624 // Set insertpoint for gathered loads to the very first load.
17625 if (GatheredLoadsEntriesFirst.has_value() &&
17626 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17627 Opcode == Instruction::Load) {
17628 Res = FindFirstInst();
17629 EntryToLastInstruction.try_emplace(E, Res);
17630 return *Res;
17631 }
17632
17633 // Set the insert point to the beginning of the basic block if the entry
17634 // should not be scheduled.
17635 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17636 if (E->isGather())
17637 return nullptr;
17638 // It was found previously that the instructions do not need to be scheduled.
17639 const auto *It = BlocksSchedules.find(BB);
17640 if (It == BlocksSchedules.end())
17641 return nullptr;
17642 for (Value *V : E->Scalars) {
17643 auto *I = dyn_cast<Instruction>(V);
17644 if (!I || isa<PHINode>(I) ||
17645 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17646 continue;
17647 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17648 if (Bundles.empty())
17649 continue;
17650 const auto *It = find_if(
17651 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17652 if (It != Bundles.end())
17653 return *It;
17654 }
17655 return nullptr;
17656 };
17657 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17658 if (!E->isGather() && !Bundle) {
17659 if ((Opcode == Instruction::GetElementPtr &&
17660 any_of(E->Scalars,
17661 [](Value *V) {
17662 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17663 })) ||
17664 (all_of(E->Scalars,
17665 [&](Value *V) {
17666 return isa<PoisonValue>(V) ||
17667 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17668 E->isCopyableElement(V) ||
17669 (!isVectorLikeInstWithConstOps(V) &&
17670 isUsedOutsideBlock(V));
17671 }) &&
17672 (!E->doesNotNeedToSchedule() ||
17673 any_of(E->Scalars,
17674 [&](Value *V) {
17675 if (!isa<Instruction>(V) ||
17676 (E->hasCopyableElements() && E->isCopyableElement(V)))
17677 return false;
17678 return !areAllOperandsNonInsts(V);
17679 }) ||
17680 none_of(E->Scalars, [&](Value *V) {
17681 if (!isa<Instruction>(V) ||
17682 (E->hasCopyableElements() && E->isCopyableElement(V)))
17683 return false;
17684 return MustGather.contains(V);
17685 }))))
17686 Res = FindLastInst();
17687 else
17688 Res = FindFirstInst();
17689 EntryToLastInstruction.try_emplace(E, Res);
17690 return *Res;
17691 }
17692
17693 // Find the last instruction. The common case should be that BB has been
17694 // scheduled, and the last instruction is VL.back(). So we start with
17695 // VL.back() and iterate over schedule data until we reach the end of the
17696 // bundle. The end of the bundle is marked by null ScheduleData.
17697 if (Bundle) {
17698 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17699 Res = Bundle->getBundle().back()->getInst();
17700 EntryToLastInstruction.try_emplace(E, Res);
17701 return *Res;
17702 }
17703
17704 // LastInst can still be null at this point if there's either not an entry
17705 // for BB in BlocksSchedules or there's no ScheduleData available for
17706 // VL.back(). This can be the case if buildTreeRec aborts for various
17707 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17708 // size is reached, etc.). ScheduleData is initialized in the scheduling
17709 // "dry-run".
17710 //
17711 // If this happens, we can still find the last instruction by brute force. We
17712 // iterate forwards from Front (inclusive) until we either see all
17713 // instructions in the bundle or reach the end of the block. If Front is the
17714 // last instruction in program order, LastInst will be set to Front, and we
17715 // will visit all the remaining instructions in the block.
17716 //
17717 // One of the reasons we exit early from buildTreeRec is to place an upper
17718 // bound on compile-time. Thus, taking an additional compile-time hit here is
17719 // not ideal. However, this should be exceedingly rare since it requires that
17720 // we both exit early from buildTreeRec and that the bundle be out-of-order
17721 // (causing us to iterate all the way to the end of the block).
17722 if (!Res)
17723 Res = FindLastInst();
17724 assert(Res && "Failed to find last instruction in bundle");
17725 EntryToLastInstruction.try_emplace(E, Res);
17726 return *Res;
17727}
17728
17729void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17730 auto *Front = E->getMainOp();
17731 Instruction *LastInst = &getLastInstructionInBundle(E);
17732 assert(LastInst && "Failed to find last instruction in bundle");
17733 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17734 // If the instruction is PHI, set the insert point after all the PHIs.
17735 bool IsPHI = isa<PHINode>(LastInst);
17736 if (IsPHI) {
17737 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17738 if (LastInstIt != LastInst->getParent()->end() &&
17739 LastInstIt->getParent()->isLandingPad())
17740 LastInstIt = std::next(LastInstIt);
17741 }
17742 if (IsPHI ||
17743 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17744 (E->doesNotNeedToSchedule() ||
17745 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
17746 isUsedOutsideBlock(LastInst)))) ||
17747 (GatheredLoadsEntriesFirst.has_value() &&
17748 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17749 E->getOpcode() == Instruction::Load)) {
17750 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17751 } else {
17752 // Set the insertion point after the last instruction in the bundle. Set the
17753 // debug location to Front.
17754 Builder.SetInsertPoint(
17755 LastInst->getParent(),
17756 LastInst->getNextNode()->getIterator());
17757 }
17758 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17759}
17760
17761Value *BoUpSLP::gather(
17762 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17763 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17764 // List of instructions/lanes from the current block and/or the blocks which
17765 // are part of the current loop. These instructions will be inserted at the
17766 // end to make it possible to optimize loops and hoist invariant instructions
17767 // out of the loop's body with better chances for success.
17768 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17769 SmallSet<int, 4> PostponedIndices;
17770 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17771 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17772 SmallPtrSet<BasicBlock *, 4> Visited;
17773 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17774 InsertBB = InsertBB->getSinglePredecessor();
17775 return InsertBB && InsertBB == InstBB;
17776 };
17777 for (int I = 0, E = VL.size(); I < E; ++I) {
17778 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17779 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17780 isVectorized(Inst) ||
17781 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17782 PostponedIndices.insert(I).second)
17783 PostponedInsts.emplace_back(Inst, I);
17784 }
17785
17786 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17787 Type *Ty) {
17788 Value *Scalar = V;
17789 if (Scalar->getType() != Ty) {
17790 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17791 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17792 Value *V = Scalar;
17793 if (auto *CI = dyn_cast<CastInst>(Scalar);
17794 isa_and_present<SExtInst, ZExtInst>(CI)) {
17795 Value *Op = CI->getOperand(0);
17796 if (auto *IOp = dyn_cast<Instruction>(Op);
17797 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17798 V = Op;
17799 }
17800 Scalar = Builder.CreateIntCast(
17801 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17802 }
17803
17804 Instruction *InsElt;
17805 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17806 assert(SLPReVec && "FixedVectorType is not expected.");
17807 Vec =
17808 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17809 auto *II = dyn_cast<Instruction>(Vec);
17810 if (!II)
17811 return Vec;
17812 InsElt = II;
17813 } else {
17814 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17815 InsElt = dyn_cast<InsertElementInst>(Vec);
17816 if (!InsElt)
17817 return Vec;
17818 }
17819 GatherShuffleExtractSeq.insert(InsElt);
17820 CSEBlocks.insert(InsElt->getParent());
17821 // Add to our 'need-to-extract' list.
17822 if (isa<Instruction>(V)) {
17823 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17824 // Find which lane we need to extract.
17825 User *UserOp = nullptr;
17826 if (Scalar != V) {
17827 if (auto *SI = dyn_cast<Instruction>(Scalar))
17828 UserOp = SI;
17829 } else {
17830 if (V->getType()->isVectorTy()) {
17831 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17832 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17833 // Find shufflevector, caused by resize.
17834 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17835 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17836 if (SV->getOperand(0) == V)
17837 return SV;
17838 if (SV->getOperand(1) == V)
17839 return SV;
17840 }
17841 return nullptr;
17842 };
17843 InsElt = nullptr;
17844 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17845 InsElt = User;
17846 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17847 InsElt = User;
17848 assert(InsElt &&
17849 "Failed to find shufflevector, caused by resize.");
17850 }
17851 }
17852 UserOp = InsElt;
17853 }
17854 if (UserOp) {
17855 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17856 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17857 }
17858 }
17859 }
17860 return Vec;
17861 };
17862 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17863 Value *Vec = PoisonValue::get(VecTy);
17864 SmallVector<int> NonConsts;
17865 SmallVector<int> Mask(VL.size());
17866 std::iota(Mask.begin(), Mask.end(), 0);
17867 Value *OriginalRoot = Root;
17868 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17869 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17870 SV->getOperand(0)->getType() == VecTy) {
17871 Root = SV->getOperand(0);
17872 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17873 }
17874 // Insert the constant values first.
17875 for (int I = 0, E = VL.size(); I < E; ++I) {
17876 if (PostponedIndices.contains(I))
17877 continue;
17878 if (!isConstant(VL[I])) {
17879 NonConsts.push_back(I);
17880 continue;
17881 }
17882 if (isa<PoisonValue>(VL[I]))
17883 continue;
17884 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17885 Mask[I] = I + E;
17886 }
17887 if (Root) {
17888 if (isa<PoisonValue>(Vec)) {
17889 Vec = OriginalRoot;
17890 } else {
17891 Vec = CreateShuffle(Root, Vec, Mask);
17892 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17893 OI && OI->use_empty() &&
17894 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17895 return TE->VectorizedValue == OI;
17896 }))
17897 eraseInstruction(OI);
17898 }
17899 }
17900 // Insert non-constant values.
17901 for (int I : NonConsts)
17902 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17903 // Append the instructions which are/may be part of the loop at the end, to
17904 // make it possible to hoist non-loop-based instructions.
17905 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17906 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17907
17908 return Vec;
17909}
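// For illustration (hypothetical values): gathering VL = {7, %a, %b, %inloop},
// where %inloop is defined inside the loop that contains the insertion point,
// produces roughly
//   %v0 = insertelement <4 x i32> poison, i32 7, i32 0      ; constants first
//   %v1 = insertelement <4 x i32> %v0, i32 %a, i32 1        ; then non-constants
//   %v2 = insertelement <4 x i32> %v1, i32 %b, i32 2
//   %v3 = insertelement <4 x i32> %v2, i32 %inloop, i32 3   ; postponed last
// so that everything except the final insert remains hoistable out of the loop.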
17910
17911/// Merges shuffle masks and emits the final shuffle instruction, if required.
17912/// It supports shuffling of 2 input vectors. It implements lazy shuffle
17913/// emission: the actual shuffle instruction is generated only if it is really
17914/// required. Otherwise, the shuffle instruction emission is delayed till the
17915/// end of the process, to reduce the number of emitted instructions and to
17916/// simplify further analysis/transformations.
17917/// The class will also look through the previously emitted shuffle instructions
17918/// and properly mark indices in the mask as undef.
17919/// For example, given the code
17920/// \code
17921/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17922/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17923/// \endcode
17924/// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17925/// look through %s1 and %s2 and emit
17926/// \code
17927/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17928/// \endcode
17929/// instead.
17930/// If the 2 operands are of different sizes, the smaller one will be resized and
17931/// the mask recalculated properly.
17932/// For example, given the code
17933/// \code
17934/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17935/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17936/// \endcode
17937/// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17938/// look through %s1 and %s2 and emit
17939/// \code
17940/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17941/// \endcode
17942/// instead.
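/// Typical usage (sketch): call add() once per source vector together with its
/// per-operand mask; the builder keeps at most two operands live and folds
/// everything else into \p CommonMask on the fly. finalize() then applies the
/// reuse/extension masks, inserts the required subvectors, and emits whatever
/// shuffle is still outstanding (a single-source identity mask is elided).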
17943class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17944 bool IsFinalized = false;
17945 /// Combined mask for all applied operands and masks. It is built during
17946 /// analysis and actual emission of shuffle vector instructions.
17947 SmallVector<int> CommonMask;
17948 /// List of operands for the shuffle vector instruction. It holds at most 2
17949 /// operands; if a 3rd one is going to be added, the first 2 are combined into
17950 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
17951 /// resulting shuffle and the second operand is set to be the newly added
17952 /// operand. The \p CommonMask is transformed in the proper way after that.
17953 SmallVector<Value *, 2> InVectors;
17954 IRBuilderBase &Builder;
17955 BoUpSLP &R;
17956
17957 class ShuffleIRBuilder {
17958 IRBuilderBase &Builder;
17959 /// Holds all of the instructions that we gathered.
17960 SetVector<Instruction *> &GatherShuffleExtractSeq;
17961 /// A list of blocks that we are going to CSE.
17962 DenseSet<BasicBlock *> &CSEBlocks;
17963 /// Data layout.
17964 const DataLayout &DL;
17965
17966 public:
17967 ShuffleIRBuilder(IRBuilderBase &Builder,
17968 SetVector<Instruction *> &GatherShuffleExtractSeq,
17969 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17970 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17971 CSEBlocks(CSEBlocks), DL(DL) {}
17972 ~ShuffleIRBuilder() = default;
17973 /// Creates shufflevector for the 2 operands with the given mask.
17974 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17975 if (V1->getType() != V2->getType()) {
17976 assert(V2->getType()->isIntOrIntVectorTy() &&
17977 V1->getType()->isIntOrIntVectorTy() &&
17978 "Expected integer vector types only.");
17979 if (V1->getType() != V2->getType()) {
17980 if (cast<VectorType>(V2->getType())
17981 ->getElementType()
17982 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17983 ->getElementType()
17984 ->getIntegerBitWidth())
17985 V2 = Builder.CreateIntCast(
17986 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17987 else
17988 V1 = Builder.CreateIntCast(
17989 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17990 }
17991 }
17992 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17993 if (auto *I = dyn_cast<Instruction>(Vec)) {
17994 GatherShuffleExtractSeq.insert(I);
17995 CSEBlocks.insert(I->getParent());
17996 }
17997 return Vec;
17998 }
17999 /// Creates permutation of the single vector operand with the given mask, if
18000 /// it is not an identity mask.
18001 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18002 if (Mask.empty())
18003 return V1;
18004 unsigned VF = Mask.size();
18005 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18006 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18007 return V1;
18008 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18009 if (auto *I = dyn_cast<Instruction>(Vec)) {
18010 GatherShuffleExtractSeq.insert(I);
18011 CSEBlocks.insert(I->getParent());
18012 }
18013 return Vec;
18014 }
18015 Value *createIdentity(Value *V) { return V; }
18016 Value *createPoison(Type *Ty, unsigned VF) {
18017 return PoisonValue::get(getWidenedType(Ty, VF));
18018 }
18019 /// Resizes the 2 input vectors to match each other, if their sizes are not
18020 /// equal yet. The smaller vector is resized to the size of the larger vector.
18021 void resizeToMatch(Value *&V1, Value *&V2) {
18022 if (V1->getType() == V2->getType())
18023 return;
18024 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18025 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18026 int VF = std::max(V1VF, V2VF);
18027 int MinVF = std::min(V1VF, V2VF);
18028 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18029 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18030 0);
18031 Value *&Op = MinVF == V1VF ? V1 : V2;
18032 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18033 if (auto *I = dyn_cast<Instruction>(Op)) {
18034 GatherShuffleExtractSeq.insert(I);
18035 CSEBlocks.insert(I->getParent());
18036 }
18037 if (MinVF == V1VF)
18038 V1 = Op;
18039 else
18040 V2 = Op;
18041 }
18042 };
18043
18044 /// Smart shuffle instruction emission, walks through shuffle trees and
18045 /// tries to find the best matching vector for the actual shuffle
18046 /// instruction.
18047 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18048 assert(V1 && "Expected at least one vector value.");
18049 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18050 R.CSEBlocks, *R.DL);
18051 return BaseShuffleAnalysis::createShuffle<Value *>(
18052 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18053 }
18054
18055 /// Cast value \p V to the vector type with the same number of elements, but
18056 /// the base type \p ScalarTy.
18057 Value *castToScalarTyElem(Value *V,
18058 std::optional<bool> IsSigned = std::nullopt) {
18059 auto *VecTy = cast<VectorType>(V->getType());
18060 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18061 if (VecTy->getElementType() == ScalarTy->getScalarType())
18062 return V;
18063 return Builder.CreateIntCast(
18064 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18065 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18066 }
18067
18068 Value *getVectorizedValue(const TreeEntry &E) {
18069 Value *Vec = E.VectorizedValue;
18070 if (!Vec->getType()->isIntOrIntVectorTy())
18071 return Vec;
18072 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18073 return !isa<PoisonValue>(V) &&
18074 !isKnownNonNegative(
18075 V, SimplifyQuery(*R.DL));
18076 }));
18077 }
18078
18079public:
18080 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18081 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18082
18083 /// Adjusts extractelements after reusing them.
18084 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18085 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18086 unsigned NumParts, bool &UseVecBaseAsInput) {
18087 UseVecBaseAsInput = false;
18088 SmallPtrSet<Value *, 4> UniqueBases;
18089 Value *VecBase = nullptr;
18090 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18091 if (!E->ReorderIndices.empty()) {
18092 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18093 E->ReorderIndices.end());
18094 reorderScalars(VL, ReorderMask);
18095 }
18096 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18097 int Idx = Mask[I];
18098 if (Idx == PoisonMaskElem)
18099 continue;
18100 auto *EI = cast<ExtractElementInst>(VL[I]);
18101 VecBase = EI->getVectorOperand();
18102 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18103 VecBase = TEs.front()->VectorizedValue;
18104 assert(VecBase && "Expected vectorized value.");
18105 UniqueBases.insert(VecBase);
18106 // If the only use is vectorized, we can delete the extractelement
18107 // itself.
18108 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18109 (NumParts != 1 && count(VL, EI) > 1) ||
18110 any_of(EI->users(), [&](User *U) {
18111 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18112 return UTEs.empty() || UTEs.size() > 1 ||
18113 (isa<GetElementPtrInst>(U) &&
18114 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18115 (!UTEs.empty() &&
18116 count_if(R.VectorizableTree,
18117 [&](const std::unique_ptr<TreeEntry> &TE) {
18118 return TE->UserTreeIndex.UserTE ==
18119 UTEs.front() &&
18120 is_contained(VL, EI);
18121 }) != 1);
18122 }))
18123 continue;
18124 R.eraseInstruction(EI);
18125 }
18126 if (NumParts == 1 || UniqueBases.size() == 1) {
18127 assert(VecBase && "Expected vectorized value.");
18128 return castToScalarTyElem(VecBase);
18129 }
18130 UseVecBaseAsInput = true;
18131 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18132 for (auto [I, Idx] : enumerate(Mask))
18133 if (Idx != PoisonMaskElem)
18134 Idx = I;
18135 };
18136 // Perform a multi-register vector shuffle, joining the parts into a single
18137 // virtual long vector.
18138 // Need to shuffle each part independently and then insert all these parts
18139 // into a long virtual vector register, forming the original vector.
18140 Value *Vec = nullptr;
18141 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18142 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18143 for (unsigned Part : seq<unsigned>(NumParts)) {
18144 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18145 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18146 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18147 constexpr int MaxBases = 2;
18148 SmallVector<Value *, MaxBases> Bases(MaxBases);
18149 auto VLMask = zip(SubVL, SubMask);
18150 const unsigned VF = std::accumulate(
18151 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18152 if (std::get<1>(D) == PoisonMaskElem)
18153 return S;
18154 Value *VecOp =
18155 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18156 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18157 !TEs.empty())
18158 VecOp = TEs.front()->VectorizedValue;
18159 assert(VecOp && "Expected vectorized value.");
18160 const unsigned Size =
18161 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18162 return std::max(S, Size);
18163 });
18164 for (const auto [V, I] : VLMask) {
18165 if (I == PoisonMaskElem)
18166 continue;
18167 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18168 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18169 VecOp = TEs.front()->VectorizedValue;
18170 assert(VecOp && "Expected vectorized value.");
18171 VecOp = castToScalarTyElem(VecOp);
18172 Bases[I / VF] = VecOp;
18173 }
18174 if (!Bases.front())
18175 continue;
18176 Value *SubVec;
18177 if (Bases.back()) {
18178 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18179 TransformToIdentity(SubMask);
18180 } else {
18181 SubVec = Bases.front();
18182 }
18183 if (!Vec) {
18184 Vec = SubVec;
18185 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18186 [&](unsigned P) {
18187 ArrayRef<int> SubMask =
18188 Mask.slice(P * SliceSize,
18189 getNumElems(Mask.size(),
18190 SliceSize, P));
18191 return all_of(SubMask, [](int Idx) {
18192 return Idx == PoisonMaskElem;
18193 });
18194 })) &&
18195 "Expected first part or all previous parts masked.");
18196 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18197 } else {
18198 unsigned NewVF =
18199 cast<FixedVectorType>(Vec->getType())->getNumElements();
18200 if (Vec->getType() != SubVec->getType()) {
18201 unsigned SubVecVF =
18202 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18203 NewVF = std::max(NewVF, SubVecVF);
18204 }
18205 // Adjust SubMask.
18206 for (int &Idx : SubMask)
18207 if (Idx != PoisonMaskElem)
18208 Idx += NewVF;
18209 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18210 Vec = createShuffle(Vec, SubVec, VecMask);
18211 TransformToIdentity(VecMask);
18212 }
18213 }
18214 copy(VecMask, Mask.begin());
18215 return Vec;
18216 }
18217 /// Checks if the specified entry \p E needs to be delayed because of its
18218 /// dependency nodes.
18219 std::optional<Value *>
18220 needToDelay(const TreeEntry *E,
18221 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18222 // No need to delay emission if all deps are ready.
18223 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18224 return all_of(
18225 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18226 }))
18227 return std::nullopt;
18228 // Postpone gather emission, will be emitted after the end of the
18229 // process to keep correct order.
18230 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18231 return Builder.CreateAlignedLoad(
18232 ResVecTy,
18233 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18234 MaybeAlign());
18235 }
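 // Note: the aligned load from a poison pointer above is only a typed
 // placeholder; the caller records the node in PostponedGathers and the real
 // gather is emitted later, once its dependencies have been vectorized, and
 // replaces the placeholder.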
18236 /// Reset the builder to handle perfect diamond match.
18237 void resetForSameNode() {
18238 IsFinalized = false;
18239 CommonMask.clear();
18240 InVectors.clear();
18241 }
18242 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18243 /// shuffling.
18244 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18245 Value *V1 = getVectorizedValue(E1);
18246 Value *V2 = getVectorizedValue(E2);
18247 add(V1, V2, Mask);
18248 }
18249 /// Adds single input vector (in form of tree entry) and the mask for its
18250 /// shuffling.
18251 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18252 Value *V1 = getVectorizedValue(E1);
18253 add(V1, Mask);
18254 }
18255 /// Adds 2 input vectors and the mask for their shuffling.
18256 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18257 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18258 assert(isa<FixedVectorType>(V1->getType()) &&
18259 isa<FixedVectorType>(V2->getType()) &&
18260 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18261 V1 = castToScalarTyElem(V1);
18262 V2 = castToScalarTyElem(V2);
18263 if (InVectors.empty()) {
18264 InVectors.push_back(V1);
18265 InVectors.push_back(V2);
18266 CommonMask.assign(Mask.begin(), Mask.end());
18267 return;
18268 }
18269 Value *Vec = InVectors.front();
18270 if (InVectors.size() == 2) {
18271 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18272 transformMaskAfterShuffle(CommonMask, CommonMask);
18273 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18274 Mask.size()) {
18275 Vec = createShuffle(Vec, nullptr, CommonMask);
18276 transformMaskAfterShuffle(CommonMask, CommonMask);
18277 }
18278 V1 = createShuffle(V1, V2, Mask);
18279 unsigned VF = std::max(getVF(V1), getVF(Vec));
18280 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18281 if (Mask[Idx] != PoisonMaskElem)
18282 CommonMask[Idx] = Idx + VF;
18283 InVectors.front() = Vec;
18284 if (InVectors.size() == 2)
18285 InVectors.back() = V1;
18286 else
18287 InVectors.push_back(V1);
18288 }
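 // For example (hypothetical masks): if CommonMask is currently {0, 1, -1, -1}
 // with one live operand of VF = 4, then add(V1, V2, {-1, -1, 1, 0}) first folds
 // V1/V2 into a single vector with that mask and updates CommonMask to
 // {0, 1, 6, 7}: lanes taken from the newly folded operand are encoded as
 // lane index + VF so the final two-source shuffle can tell the operands apart.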
18289 /// Adds one more input vector and the mask for its shuffling.
18290 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18291 assert(isa<FixedVectorType>(V1->getType()) &&
18292 "castToScalarTyElem expects V1 to be FixedVectorType");
18293 V1 = castToScalarTyElem(V1);
18294 if (InVectors.empty()) {
18295 InVectors.push_back(V1);
18296 CommonMask.assign(Mask.begin(), Mask.end());
18297 return;
18298 }
18299 const auto *It = find(InVectors, V1);
18300 if (It == InVectors.end()) {
18301 if (InVectors.size() == 2 ||
18302 InVectors.front()->getType() != V1->getType()) {
18303 Value *V = InVectors.front();
18304 if (InVectors.size() == 2) {
18305 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18306 transformMaskAfterShuffle(CommonMask, CommonMask);
18307 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18308 CommonMask.size()) {
18309 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18310 transformMaskAfterShuffle(CommonMask, CommonMask);
18311 }
18312 unsigned VF = std::max(CommonMask.size(), Mask.size());
18313 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18314 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18315 CommonMask[Idx] = V->getType() != V1->getType()
18316 ? Idx + VF
18317 : Mask[Idx] + getVF(V1);
18318 if (V->getType() != V1->getType())
18319 V1 = createShuffle(V1, nullptr, Mask);
18320 InVectors.front() = V;
18321 if (InVectors.size() == 2)
18322 InVectors.back() = V1;
18323 else
18324 InVectors.push_back(V1);
18325 return;
18326 }
18327 // Check if the second vector is required at all - it is not needed if all
18328 // the used elements are already covered by the first one.
18329 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18330 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18331 InVectors.push_back(V1);
18332 break;
18333 }
18334 }
18335 unsigned VF = 0;
18336 for (Value *V : InVectors)
18337 VF = std::max(VF, getVF(V));
18338 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18339 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18340 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18341 }
18342 /// Adds one more input vector and the shuffle mask computed from \p Order.
18343 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18344 SmallVector<int> NewMask;
18345 inversePermutation(Order, NewMask);
18346 add(V1, NewMask);
18347 }
18348 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18349 Value *Root = nullptr) {
18350 return R.gather(VL, Root, ScalarTy,
18351 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18352 return createShuffle(V1, V2, Mask);
18353 });
18354 }
18355 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18356 /// Finalize emission of the shuffles.
18357 /// \param Action the action (if any) to be performed before the final
18358 /// application of the \p ExtMask mask.
18359 Value *finalize(
18360 ArrayRef<int> ExtMask,
18361 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18362 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18363 function_ref<void(Value *&, SmallVectorImpl<int> &,
18364 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18365 Action = {}) {
18366 IsFinalized = true;
18367 if (Action) {
18368 Value *Vec = InVectors.front();
18369 if (InVectors.size() == 2) {
18370 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18371 InVectors.pop_back();
18372 } else {
18373 Vec = createShuffle(Vec, nullptr, CommonMask);
18374 }
18375 transformMaskAfterShuffle(CommonMask, CommonMask);
18376 assert(VF > 0 &&
18377 "Expected vector length for the final value before action.");
18378 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18379 if (VecVF < VF) {
18380 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18381 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18382 Vec = createShuffle(Vec, nullptr, ResizeMask);
18383 }
18384 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18385 return createShuffle(V1, V2, Mask);
18386 });
18387 InVectors.front() = Vec;
18388 }
18389 if (!SubVectors.empty()) {
18390 Value *Vec = InVectors.front();
18391 if (InVectors.size() == 2) {
18392 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18393 InVectors.pop_back();
18394 } else {
18395 Vec = createShuffle(Vec, nullptr, CommonMask);
18396 }
18397 transformMaskAfterShuffle(CommonMask, CommonMask);
18398 auto CreateSubVectors = [&](Value *Vec,
18399 SmallVectorImpl<int> &CommonMask) {
18400 for (auto [E, Idx] : SubVectors) {
18401 Value *V = getVectorizedValue(*E);
18402 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18403 // Use the scalar version of ScalarTy to correctly handle shuffles
18404 // for revectorization. The revectorization mode operates on whole
18405 // vectors, but here we need to operate on the scalars, because the
18406 // masks were already transformed for the vector elements and we don't
18407 // need to do this transformation again.
18408 Type *OrigScalarTy = ScalarTy;
18409 ScalarTy = ScalarTy->getScalarType();
18410 Vec = createInsertVector(
18411 Builder, Vec, V, InsertionIndex,
18412 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18413 _3));
18414 ScalarTy = OrigScalarTy;
18415 if (!CommonMask.empty()) {
18416 std::iota(std::next(CommonMask.begin(), Idx),
18417 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18418 Idx);
18419 }
18420 }
18421 return Vec;
18422 };
18423 if (SubVectorsMask.empty()) {
18424 Vec = CreateSubVectors(Vec, CommonMask);
18425 } else {
18426 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18427 copy(SubVectorsMask, SVMask.begin());
18428 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18429 if (I2 != PoisonMaskElem) {
18430 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18431 I1 = I2 + CommonMask.size();
18432 }
18433 }
18434 Value *InsertVec =
18435 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18436 Vec = createShuffle(InsertVec, Vec, SVMask);
18437 transformMaskAfterShuffle(CommonMask, SVMask);
18438 }
18439 InVectors.front() = Vec;
18440 }
18441
18442 if (!ExtMask.empty()) {
18443 if (CommonMask.empty()) {
18444 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18445 } else {
18446 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18447 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18448 if (ExtMask[I] == PoisonMaskElem)
18449 continue;
18450 NewMask[I] = CommonMask[ExtMask[I]];
18451 }
18452 CommonMask.swap(NewMask);
18453 }
18454 }
18455 if (CommonMask.empty()) {
18456 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18457 return InVectors.front();
18458 }
18459 if (InVectors.size() == 2)
18460 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18461 return createShuffle(InVectors.front(), nullptr, CommonMask);
18462 }
18463
18464 ~ShuffleInstructionBuilder() {
18465 assert((IsFinalized || CommonMask.empty()) &&
18466 "Shuffle construction must be finalized.");
18467 }
18468};
18469
18470Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18471 return vectorizeTree(getOperandEntry(E, NodeIdx));
18472}
18473
18474template <typename BVTy, typename ResTy, typename... Args>
18475ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18476 Args &...Params) {
18477 assert(E->isGather() && "Expected gather node.");
18478 unsigned VF = E->getVectorFactor();
18479
18480 bool NeedFreeze = false;
18481 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18482 // Clear values, to be replaced by insertvector instructions.
18483 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18484 for_each(MutableArrayRef(GatheredScalars)
18485 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18486 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18487 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18488 E->CombinedEntriesWithIndices.size());
18489 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18490 [&](const auto &P) {
18491 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18492 });
18493 // Build a mask out of the reorder indices and reorder scalars per this
18494 // mask.
18495 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18496 E->ReorderIndices.end());
18497 if (!ReorderMask.empty())
18498 reorderScalars(GatheredScalars, ReorderMask);
18499 SmallVector<int> SubVectorsMask;
18500 inversePermutation(E->ReorderIndices, SubVectorsMask);
18501 // Transform non-clustered elements in the mask to poison (-1).
18502 // "Clustered" operations will be reordered using this mask later.
18503 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18504 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18505 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18506 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18507 } else {
18508 SubVectorsMask.clear();
18509 }
18510 SmallVector<Value *> StoredGS(GatheredScalars);
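 // Checks whether a splat node that also contains (non-poison) undefs can
 // simply reuse a lane of an already materialized input: if so, the affected
 // slice of the mask is rewritten in place (identity slice or broadcast of the
 // reused lane) and no separate gather is needed for the splat.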
18511 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18512 unsigned I, unsigned SliceSize,
18513 bool IsNotPoisonous) {
18514 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18515 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18516 }))
18517 return false;
18518 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18519 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18520 if (UserTE->getNumOperands() != 2)
18521 return false;
18522 if (!IsNotPoisonous) {
18523 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18524 [=](const std::unique_ptr<TreeEntry> &TE) {
18525 return TE->UserTreeIndex.UserTE == UserTE &&
18526 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18527 });
18528 if (It == VectorizableTree.end())
18529 return false;
18530 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18531 if (!(*It)->ReorderIndices.empty()) {
18532 inversePermutation((*It)->ReorderIndices, ReorderMask);
18533 reorderScalars(GS, ReorderMask);
18534 }
18535 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18536 Value *V0 = std::get<0>(P);
18537 Value *V1 = std::get<1>(P);
18538 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18539 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18540 is_contained(E->Scalars, V1));
18541 }))
18542 return false;
18543 }
18544 int Idx;
18545 if ((Mask.size() < InputVF &&
18546 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18547 Idx == 0) ||
18548 (Mask.size() == InputVF &&
18549 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18550 std::iota(
18551 std::next(Mask.begin(), I * SliceSize),
18552 std::next(Mask.begin(),
18553 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18554 0);
18555 } else {
18556 unsigned IVal =
18557 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18558 std::fill(
18559 std::next(Mask.begin(), I * SliceSize),
18560 std::next(Mask.begin(),
18561 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18562 IVal);
18563 }
18564 return true;
18565 };
18566 BVTy ShuffleBuilder(ScalarTy, Params...);
18567 ResTy Res = ResTy();
18568 SmallVector<int> Mask;
18569 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18570 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18571 Value *ExtractVecBase = nullptr;
18572 bool UseVecBaseAsInput = false;
18573 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18574 SmallVector<SmallVector<const TreeEntry *>> Entries;
18575 Type *OrigScalarTy = GatheredScalars.front()->getType();
18576 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18577 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18578 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18579 // Check for gathered extracts.
18580 bool Resized = false;
18581 ExtractShuffles =
18582 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18583 if (!ExtractShuffles.empty()) {
18584 SmallVector<const TreeEntry *> ExtractEntries;
18585 for (auto [Idx, I] : enumerate(ExtractMask)) {
18586 if (I == PoisonMaskElem)
18587 continue;
18588 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18589 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18590 !TEs.empty())
18591 ExtractEntries.append(TEs.begin(), TEs.end());
18592 }
18593 if (std::optional<ResTy> Delayed =
18594 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18595 // Delay emission of gathers which are not ready yet.
18596 PostponedGathers.insert(E);
18597 // Postpone gather emission, will be emitted after the end of the
18598 // process to keep correct order.
18599 return *Delayed;
18600 }
18601 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18602 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18603 ExtractVecBase = VecBase;
18604 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18605 if (VF == VecBaseTy->getNumElements() &&
18606 GatheredScalars.size() != VF) {
18607 Resized = true;
18608 GatheredScalars.append(VF - GatheredScalars.size(),
18609 PoisonValue::get(OrigScalarTy));
18610 NumParts =
18611 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18612 }
18613 }
18614 }
18615 // Gather extracts only after we have checked for fully matched gathers.
18616 if (!ExtractShuffles.empty() || !E->hasState() ||
18617 E->getOpcode() != Instruction::Load ||
18618 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18619 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18620 any_of(E->Scalars,
18621 [this](Value *V) {
18622 return isa<LoadInst>(V) && isVectorized(V);
18623 })) ||
18624 (E->hasState() && E->isAltShuffle()) ||
18625 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18626 isSplat(E->Scalars) ||
18627 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18628 GatherShuffles =
18629 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18630 }
18631 if (!GatherShuffles.empty()) {
18632 if (std::optional<ResTy> Delayed =
18633 ShuffleBuilder.needToDelay(E, Entries)) {
18634 // Delay emission of gathers which are not ready yet.
18635 PostponedGathers.insert(E);
18636 // Postpone gather emission, will be emitted after the end of the
18637 // process to keep correct order.
18638 return *Delayed;
18639 }
18640 if (GatherShuffles.size() == 1 &&
18641 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18642 Entries.front().front()->isSame(E->Scalars)) {
18643 // Perfect match in the graph, will reuse the previously vectorized
18644 // node. Cost is 0.
18645 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18646 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18647 // Restore the mask for previous partially matched values.
18648 Mask.resize(E->Scalars.size());
18649 const TreeEntry *FrontTE = Entries.front().front();
18650 if (FrontTE->ReorderIndices.empty() &&
18651 ((FrontTE->ReuseShuffleIndices.empty() &&
18652 E->Scalars.size() == FrontTE->Scalars.size()) ||
18653 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18654 std::iota(Mask.begin(), Mask.end(), 0);
18655 } else {
18656 for (auto [I, V] : enumerate(E->Scalars)) {
18657 if (isa<PoisonValue>(V)) {
18658 Mask[I] = PoisonMaskElem;
18659 continue;
18660 }
18661 Mask[I] = FrontTE->findLaneForValue(V);
18662 }
18663 }
18664 // Reset the builder(s) to correctly handle perfect diamond matched
18665 // nodes.
18666 ShuffleBuilder.resetForSameNode();
18667 ShuffleBuilder.add(*FrontTE, Mask);
18668 // Fully matched entry found, no need to insert subvectors.
18669 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18670 return Res;
18671 }
18672 if (!Resized) {
18673 if (GatheredScalars.size() != VF &&
18674 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18675 return any_of(TEs, [&](const TreeEntry *TE) {
18676 return TE->getVectorFactor() == VF;
18677 });
18678 }))
18679 GatheredScalars.append(VF - GatheredScalars.size(),
18680 PoisonValue::get(OrigScalarTy));
18681 }
18682 // Remove shuffled elements from list of gathers.
18683 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18684 if (Mask[I] != PoisonMaskElem)
18685 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18686 }
18687 }
18688 }
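 // Packs the remaining gathered scalars: unique values stay, repeated values
 // and undefs are folded into the reuse mask, splats become a broadcast of
 // lane 0, and NeedFreeze is set when the broadcast value cannot be proven
 // non-poisonous.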
18689 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18690 SmallVectorImpl<int> &ReuseMask,
18691 bool IsRootPoison) {
18692 // For splats we can emit broadcasts instead of gathers, so try to find
18693 // such sequences.
18694 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18695 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18696 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18697 SmallVector<int> UndefPos;
18698 DenseMap<Value *, unsigned> UniquePositions;
18699 // Gather unique non-const values and all constant values.
18700 // For repeated values, just shuffle them.
18701 int NumNonConsts = 0;
18702 int SinglePos = 0;
18703 for (auto [I, V] : enumerate(Scalars)) {
18704 if (isa<UndefValue>(V)) {
18705 if (!isa<PoisonValue>(V)) {
18706 ReuseMask[I] = I;
18707 UndefPos.push_back(I);
18708 }
18709 continue;
18710 }
18711 if (isConstant(V)) {
18712 ReuseMask[I] = I;
18713 continue;
18714 }
18715 ++NumNonConsts;
18716 SinglePos = I;
18717 Value *OrigV = V;
18718 Scalars[I] = PoisonValue::get(OrigScalarTy);
18719 if (IsSplat) {
18720 Scalars.front() = OrigV;
18721 ReuseMask[I] = 0;
18722 } else {
18723 const auto Res = UniquePositions.try_emplace(OrigV, I);
18724 Scalars[Res.first->second] = OrigV;
18725 ReuseMask[I] = Res.first->second;
18726 }
18727 }
18728 if (NumNonConsts == 1) {
18729 // Restore single insert element.
18730 if (IsSplat) {
18731 ReuseMask.assign(VF, PoisonMaskElem);
18732 std::swap(Scalars.front(), Scalars[SinglePos]);
18733 if (!UndefPos.empty() && UndefPos.front() == 0)
18734 Scalars.front() = UndefValue::get(OrigScalarTy);
18735 }
18736 ReuseMask[SinglePos] = SinglePos;
18737 } else if (!UndefPos.empty() && IsSplat) {
18738 // For undef values, try to replace them with the simple broadcast.
18739 // We can do it if the broadcasted value is guaranteed to be
18740 // non-poisonous, or by freezing the incoming scalar value first.
18741 auto *It = find_if(Scalars, [this, E](Value *V) {
18742 return !isa<UndefValue>(V) &&
18743 (isGuaranteedNotToBePoison(V, AC) ||
18744 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18745 // Check if the value already used in the same operation in
18746 // one of the nodes already.
18747 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18748 is_contained(E->UserTreeIndex.UserTE->Scalars,
18749 U.getUser());
18750 })));
18751 });
18752 if (It != Scalars.end()) {
18753 // Replace undefs by the non-poisoned scalars and emit broadcast.
18754 int Pos = std::distance(Scalars.begin(), It);
18755 for (int I : UndefPos) {
18756 // Set the undef position to the non-poisoned scalar.
18757 ReuseMask[I] = Pos;
18758 // Replace the undef by poison; in the mask it is already replaced by the
18759 // non-poisoned scalar.
18760 if (I != Pos)
18761 Scalars[I] = PoisonValue::get(OrigScalarTy);
18762 }
18763 } else {
18764 // Replace undefs by the poisons, emit broadcast and then emit
18765 // freeze.
18766 for (int I : UndefPos) {
18767 ReuseMask[I] = PoisonMaskElem;
18768 if (isa<UndefValue>(Scalars[I]))
18769 Scalars[I] = PoisonValue::get(OrigScalarTy);
18770 }
18771 NeedFreeze = true;
18772 }
18773 }
18774 };
18775 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18776 bool IsNonPoisoned = true;
18777 bool IsUsedInExpr = true;
18778 Value *Vec1 = nullptr;
18779 if (!ExtractShuffles.empty()) {
18780 // Gather of extractelements can be represented as just a shuffle of
18781 // one or two vectors from which the scalars are extracted.
18782 // Find input vectors.
18783 Value *Vec2 = nullptr;
18784 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18785 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18786 ExtractMask[I] = PoisonMaskElem;
18787 }
18788 if (UseVecBaseAsInput) {
18789 Vec1 = ExtractVecBase;
18790 } else {
18791 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18792 if (ExtractMask[I] == PoisonMaskElem)
18793 continue;
18794 if (isa<UndefValue>(StoredGS[I]))
18795 continue;
18796 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18797 Value *VecOp = EI->getVectorOperand();
18798 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18799 !TEs.empty() && TEs.front()->VectorizedValue)
18800 VecOp = TEs.front()->VectorizedValue;
18801 if (!Vec1) {
18802 Vec1 = VecOp;
18803 } else if (Vec1 != VecOp) {
18804 assert((!Vec2 || Vec2 == VecOp) &&
18805 "Expected only 1 or 2 vectors shuffle.");
18806 Vec2 = VecOp;
18807 }
18808 }
18809 }
18810 if (Vec2) {
18811 IsUsedInExpr = false;
18812 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18813 isGuaranteedNotToBePoison(Vec2, AC);
18814 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18815 } else if (Vec1) {
18816 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18817 IsUsedInExpr &= FindReusedSplat(
18818 ExtractMask,
18819 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18820 ExtractMask.size(), IsNotPoisonedVec);
18821 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18822 IsNonPoisoned &= IsNotPoisonedVec;
18823 } else {
18824 IsUsedInExpr = false;
18825 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18826 /*ForExtracts=*/true);
18827 }
18828 }
18829 if (!GatherShuffles.empty()) {
18830 unsigned SliceSize =
18831 getPartNumElems(E->Scalars.size(),
18832 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18833 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18834 for (const auto [I, TEs] : enumerate(Entries)) {
18835 if (TEs.empty()) {
18836 assert(!GatherShuffles[I] &&
18837 "No shuffles with empty entries list expected.");
18838 continue;
18839 }
18840 assert((TEs.size() == 1 || TEs.size() == 2) &&
18841 "Expected shuffle of 1 or 2 entries.");
18842 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18843 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18844 VecMask.assign(VecMask.size(), PoisonMaskElem);
18845 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18846 if (TEs.size() == 1) {
18847 bool IsNotPoisonedVec =
18848 TEs.front()->VectorizedValue
18849 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18850 : true;
18851 IsUsedInExpr &=
18852 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18853 SliceSize, IsNotPoisonedVec);
18854 ShuffleBuilder.add(*TEs.front(), VecMask);
18855 IsNonPoisoned &= IsNotPoisonedVec;
18856 } else {
18857 IsUsedInExpr = false;
18858 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18859 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18860 IsNonPoisoned &=
18861 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18862 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18863 }
18864 }
18865 }
18866 // Try to figure out the best way to combine values: build a shuffle and
18867 // insert elements, or just build several shuffles.
18868 // Insert non-constant scalars.
18869 SmallVector<Value *> NonConstants(GatheredScalars);
18870 int EMSz = ExtractMask.size();
18871 int MSz = Mask.size();
18872 // Try to build a constant vector and shuffle with it only if currently we
18873 // have a single permutation and more than 1 scalar constant.
18874 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18875 bool IsIdentityShuffle =
18876 ((UseVecBaseAsInput ||
18877 all_of(ExtractShuffles,
18878 [](const std::optional<TTI::ShuffleKind> &SK) {
18879 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18880 TTI::SK_PermuteSingleSrc;
18881 })) &&
18882 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18883 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18884 (!GatherShuffles.empty() &&
18885 all_of(GatherShuffles,
18886 [](const std::optional<TTI::ShuffleKind> &SK) {
18887 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18888 TTI::SK_PermuteSingleSrc;
18889 }) &&
18890 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18891 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18892 bool EnoughConstsForShuffle =
18893 IsSingleShuffle &&
18894 (none_of(GatheredScalars,
18895 [](Value *V) {
18896 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18897 }) ||
18898 any_of(GatheredScalars,
18899 [](Value *V) {
18900 return isa<Constant>(V) && !isa<UndefValue>(V);
18901 })) &&
18902 (!IsIdentityShuffle ||
18903 (GatheredScalars.size() == 2 &&
18904 any_of(GatheredScalars,
18905 [](Value *V) { return !isa<UndefValue>(V); })) ||
18906 count_if(GatheredScalars, [](Value *V) {
18907 return isa<Constant>(V) && !isa<PoisonValue>(V);
18908 }) > 1);
18909 // The NonConstants array contains just the non-constant values; GatheredScalars
18910 // contains only the constants used to build the final vector, which is then shuffled.
18911 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18912 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18913 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18914 else
18915 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18916 }
18917 // Generate constants for final shuffle and build a mask for them.
18918 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18919 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18920 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18921 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18922 ShuffleBuilder.add(BV, BVMask);
18923 }
18924 if (all_of(NonConstants, [=](Value *V) {
18925 return isa<PoisonValue>(V) ||
18926 (IsSingleShuffle && ((IsIdentityShuffle &&
18927 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18928 }))
18929 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18930 SubVectorsMask);
18931 else
18932 Res = ShuffleBuilder.finalize(
18933 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18934 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18935 bool IsSplat = isSplat(NonConstants);
18936 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18937 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18938 auto CheckIfSplatIsProfitable = [&]() {
18939 // Estimate the cost of splatting + shuffle and compare with
18940 // insert + shuffle.
18941 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18942 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18943 if (isa<ExtractElementInst>(V) || isVectorized(V))
18944 return false;
18945 InstructionCost SplatCost = TTI->getVectorInstrCost(
18946 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18947 PoisonValue::get(VecTy), V);
18948 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18949 for (auto [Idx, I] : enumerate(BVMask))
18950 if (I != PoisonMaskElem)
18951 NewMask[Idx] = Mask.size();
18952 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18953 NewMask, CostKind);
18954 InstructionCost BVCost = TTI->getVectorInstrCost(
18955 Instruction::InsertElement, VecTy, CostKind,
18956 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18957 Vec, V);
18958 // Shuffle required?
18959 if (count(BVMask, PoisonMaskElem) <
18960 static_cast<int>(BVMask.size() - 1)) {
18961 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18962 for (auto [Idx, I] : enumerate(BVMask))
18963 if (I != PoisonMaskElem)
18964 NewMask[Idx] = I;
18965 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18966 VecTy, NewMask, CostKind);
18967 }
18968 return SplatCost <= BVCost;
18969 };
18970 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18971 for (auto [Idx, I] : enumerate(BVMask))
18972 if (I != PoisonMaskElem)
18973 Mask[Idx] = I;
18974 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18975 } else {
18976 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18977 SmallVector<Value *> Values(NonConstants.size(),
18978 PoisonValue::get(ScalarTy));
18979 Values[0] = V;
18980 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18981 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18982 transform(BVMask, SplatMask.begin(), [](int I) {
18983 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18984 });
18985 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18986 BV = CreateShuffle(BV, nullptr, SplatMask);
18987 for (auto [Idx, I] : enumerate(BVMask))
18988 if (I != PoisonMaskElem)
18989 Mask[Idx] = BVMask.size() + Idx;
18990 Vec = CreateShuffle(Vec, BV, Mask);
18991 for (auto [Idx, I] : enumerate(Mask))
18992 if (I != PoisonMaskElem)
18993 Mask[Idx] = Idx;
18994 }
18995 });
18996 } else if (!allConstant(GatheredScalars)) {
18997 // Gather unique scalars and all constants.
18998 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18999 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
19000 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19001 ShuffleBuilder.add(BV, ReuseMask);
19002 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19003 SubVectorsMask);
19004 } else {
19005 // Gather all constants.
19006 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19007 for (auto [I, V] : enumerate(GatheredScalars)) {
19008 if (!isa<PoisonValue>(V))
19009 Mask[I] = I;
19010 }
19011 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19012 ShuffleBuilder.add(BV, Mask);
19013 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19014 SubVectorsMask);
19015 }
19016
19017 if (NeedFreeze)
19018 Res = ShuffleBuilder.createFreeze(Res);
19019 return Res;
19020}
19021
19022Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19023 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19024 (void)vectorizeTree(VectorizableTree[EIdx].get());
19025 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19026 Builder, *this);
19027}
19028
19029/// \returns \p Inst after propagating metadata from \p VL only for instructions in
19030/// \p VL.
19031static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19032 SmallVector<Value *> Insts;
19033 for (Value *V : VL)
19034 if (isa<Instruction>(V))
19035 Insts.push_back(V);
19036 return llvm::propagateMetadata(Inst, Insts);
19037}
19038
19039static DebugLoc getDebugLocFromPHI(PHINode &PN) {
19040 if (DebugLoc DL = PN.getDebugLoc())
19041 return DL;
19042 return DebugLoc::getUnknown();
19043}
19044
19045Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19046 IRBuilderBase::InsertPointGuard Guard(Builder);
19047
19048 Value *V = E->Scalars.front();
19049 Type *ScalarTy = V->getType();
19050 if (!isa<CmpInst>(V))
19051 ScalarTy = getValueType(V);
19052 auto It = MinBWs.find(E);
19053 if (It != MinBWs.end()) {
19054 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19055 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19056 if (VecTy)
19057 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19058 }
19059 if (E->VectorizedValue)
19060 return E->VectorizedValue;
19061 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19062 if (E->isGather()) {
19063 // Set insert point for non-reduction initial nodes.
19064 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19065 setInsertPointAfterBundle(E);
19066 Value *Vec = createBuildVector(E, ScalarTy);
19067 E->VectorizedValue = Vec;
19068 return Vec;
19069 }
19070 if (E->State == TreeEntry::SplitVectorize) {
19071 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19072 "Expected exactly 2 combined entries.");
19073 setInsertPointAfterBundle(E);
19074 TreeEntry &OpTE1 =
19075 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19076 assert(OpTE1.isSame(
19077 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19078 "Expected same first part of scalars.");
19079 Value *Op1 = vectorizeTree(&OpTE1);
19080 TreeEntry &OpTE2 =
19081 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19082 assert(
19083 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19084 "Expected same second part of scalars.");
19085 Value *Op2 = vectorizeTree(&OpTE2);
19086 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19087 bool IsSigned = false;
19088 auto It = MinBWs.find(OpE);
19089 if (It != MinBWs.end())
19090 IsSigned = It->second.second;
19091 else
19092 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19093 if (isa<PoisonValue>(V))
19094 return false;
19095 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19096 });
19097 return IsSigned;
19098 };
19099 if (cast<VectorType>(Op1->getType())->getElementType() !=
19100 ScalarTy->getScalarType()) {
19101 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19102 Op1 = Builder.CreateIntCast(
19103 Op1,
19104 getWidenedType(
19105 ScalarTy,
19106 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19107 GetOperandSignedness(&OpTE1));
19108 }
19109 if (cast<VectorType>(Op2->getType())->getElementType() !=
19110 ScalarTy->getScalarType()) {
19111 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19112 Op2 = Builder.CreateIntCast(
19113 Op2,
19114 getWidenedType(
19115 ScalarTy,
19116 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19117 GetOperandSignedness(&OpTE2));
19118 }
19119 if (E->ReorderIndices.empty()) {
19120 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19121 std::iota(
19122 Mask.begin(),
19123 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19124 0);
19125 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19126 if (ScalarTyNumElements != 1) {
19127 assert(SLPReVec && "Only supported by REVEC.");
19128 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19129 }
19130 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19131 Vec = createInsertVector(Builder, Vec, Op2,
19132 E->CombinedEntriesWithIndices.back().second *
19133 ScalarTyNumElements);
19134 E->VectorizedValue = Vec;
19135 return Vec;
19136 }
19137 unsigned CommonVF =
19138 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19139 if (getNumElements(Op1->getType()) != CommonVF) {
19140 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19141 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19142 0);
19143 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19144 }
19145 if (getNumElements(Op2->getType()) != CommonVF) {
19146 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19147 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19148 0);
19149 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19150 }
19151 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19152 E->VectorizedValue = Vec;
19153 return Vec;
19154 }
19155
19156 bool IsReverseOrder =
19157 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19158 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19159 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19160 if (E->getOpcode() == Instruction::Store &&
19161 E->State == TreeEntry::Vectorize) {
19162 ArrayRef<int> Mask =
19163 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19164 E->ReorderIndices.size());
19165 ShuffleBuilder.add(V, Mask);
19166 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19167 E->State == TreeEntry::CompressVectorize) {
19168 ShuffleBuilder.addOrdered(V, {});
19169 } else {
19170 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19171 }
19172 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19173 E->CombinedEntriesWithIndices.size());
19174 transform(
19175 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19176 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19177 });
19178 assert(
19179 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19180 "Expected either combined subnodes or reordering");
19181 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19182 };
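 // FinalShuffle applies the entry's reorder/reuse masks (and combined
 // subnodes, if any) to a freshly built vector before it is recorded as the
 // entry's vectorized value.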
19183
19184 assert(!E->isGather() && "Unhandled state");
19185 unsigned ShuffleOrOp =
19186 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19187 Instruction *VL0 = E->getMainOp();
19188 auto GetOperandSignedness = [&](unsigned Idx) {
19189 const TreeEntry *OpE = getOperandEntry(E, Idx);
19190 bool IsSigned = false;
19191 auto It = MinBWs.find(OpE);
19192 if (It != MinBWs.end())
19193 IsSigned = It->second.second;
19194 else
19195 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19196 if (isa<PoisonValue>(V))
19197 return false;
19198 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19199 });
19200 return IsSigned;
19201 };
19202 switch (ShuffleOrOp) {
19203 case Instruction::PHI: {
19204 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19205 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19206 "PHI reordering is free.");
19207 auto *PH = cast<PHINode>(VL0);
19208 Builder.SetInsertPoint(PH->getParent(),
19209 PH->getParent()->getFirstNonPHIIt());
19210 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19211 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19212 Value *V = NewPhi;
19213
19214 // Adjust insertion point once all PHI's have been generated.
19215 Builder.SetInsertPoint(PH->getParent(),
19216 PH->getParent()->getFirstInsertionPt());
19217 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19218
19219 V = FinalShuffle(V, E);
19220
19221 E->VectorizedValue = V;
19222 // If phi node is fully emitted - exit.
19223 if (NewPhi->getNumIncomingValues() != 0)
19224 return NewPhi;
19225
19226 // PHINodes may have multiple entries from the same block. We want to
19227 // visit every block once.
19228 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19229
19230 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19231 BasicBlock *IBB = PH->getIncomingBlock(I);
19232
19233 // Stop emission if all incoming values are generated.
19234 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19235 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19236 return NewPhi;
19237 }
19238
19239 if (!VisitedBBs.insert(IBB).second) {
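 // The same incoming block was already handled for an earlier PHI operand:
 // reuse its vectorized incoming value for the current operand entry instead
 // of emitting it again.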
19240 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19241 NewPhi->addIncoming(VecOp, IBB);
19242 TreeEntry *OpTE = getOperandEntry(E, I);
19243 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19244 OpTE->VectorizedValue = VecOp;
19245 continue;
19246 }
19247
19248 Builder.SetInsertPoint(IBB->getTerminator());
19249 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19250 Value *Vec = vectorizeOperand(E, I);
19251 if (VecTy != Vec->getType()) {
19252 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19253 MinBWs.contains(getOperandEntry(E, I))) &&
19254 "Expected item in MinBWs.");
19255 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19256 }
19257 NewPhi->addIncoming(Vec, IBB);
19258 }
19259
19260 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19261 "Invalid number of incoming values");
19262 assert(E->VectorizedValue && "Expected vectorized value.");
19263 return E->VectorizedValue;
19264 }
19265
19266 case Instruction::ExtractElement: {
19267 Value *V = E->getSingleOperand(0);
19268 setInsertPointAfterBundle(E);
19269 V = FinalShuffle(V, E);
19270 E->VectorizedValue = V;
19271 return V;
19272 }
19273 case Instruction::ExtractValue: {
19274 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19275 Builder.SetInsertPoint(LI);
19276 Value *Ptr = LI->getPointerOperand();
19277 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19278 Value *NewV = ::propagateMetadata(V, E->Scalars);
19279 NewV = FinalShuffle(NewV, E);
19280 E->VectorizedValue = NewV;
19281 return NewV;
19282 }
19283 case Instruction::InsertElement: {
19284 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19285 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19286 OpE && !OpE->isGather() && OpE->hasState() &&
19287 !OpE->hasCopyableElements())
19288 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19289 else
19290 setInsertPointAfterBundle(E);
19291 Value *V = vectorizeOperand(E, 1);
19292 ArrayRef<Value *> Op = E->getOperand(1);
19293 Type *ScalarTy = Op.front()->getType();
19294 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19295 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19296 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19297 assert(Res.first > 0 && "Expected item in MinBWs.");
19298 V = Builder.CreateIntCast(
19299 V,
19300 getWidenedType(
19301 ScalarTy,
19302 cast<FixedVectorType>(V->getType())->getNumElements()),
19303 Res.second);
19304 }
19305
19306 // Create InsertVector shuffle if necessary
19307 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19308 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19309 }));
19310 const unsigned NumElts =
19311 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19312 const unsigned NumScalars = E->Scalars.size();
19313
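// Offset is the position in the destination vector at which this
// buildvector sequence starts inserting its scalars.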
19314 unsigned Offset = *getElementIndex(VL0);
19315 assert(Offset < NumElts && "Failed to find vector index offset");
19316
19317 // Create shuffle to resize vector
19318 SmallVector<int> Mask;
19319 if (!E->ReorderIndices.empty()) {
19320 inversePermutation(E->ReorderIndices, Mask);
19321 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19322 } else {
19323 Mask.assign(NumElts, PoisonMaskElem);
19324 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19325 }
19326 // Create InsertVector shuffle if necessary
19327 bool IsIdentity = true;
19328 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19329 Mask.swap(PrevMask);
19330 for (unsigned I = 0; I < NumScalars; ++I) {
19331 Value *Scalar = E->Scalars[PrevMask[I]];
19332 unsigned InsertIdx = *getElementIndex(Scalar);
19333 IsIdentity &= InsertIdx - Offset == I;
19334 Mask[InsertIdx - Offset] = I;
19335 }
19336 if (!IsIdentity || NumElts != NumScalars) {
19337 Value *V2 = nullptr;
19338 bool IsVNonPoisonous =
19339 isGuaranteedNotToBePoison(V) && !isConstant(V);
19340 SmallVector<int> InsertMask(Mask);
19341 if (NumElts != NumScalars && Offset == 0) {
19342 // Follow all insert element instructions from the current buildvector
19343 // sequence.
19344 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19345 do {
19346 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19347 if (!InsertIdx)
19348 break;
19349 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19350 InsertMask[*InsertIdx] = *InsertIdx;
19351 if (!Ins->hasOneUse())
19352 break;
19353 Ins = dyn_cast_or_null<InsertElementInst>(
19354 Ins->getUniqueUndroppableUser());
19355 } while (Ins);
19356 SmallBitVector UseMask =
19357 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19358 SmallBitVector IsFirstPoison =
19359 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19360 SmallBitVector IsFirstUndef =
19361 isUndefVector(FirstInsert->getOperand(0), UseMask);
19362 if (!IsFirstPoison.all()) {
19363 unsigned Idx = 0;
19364 for (unsigned I = 0; I < NumElts; I++) {
19365 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19366 IsFirstUndef.test(I)) {
19367 if (IsVNonPoisonous) {
19368 InsertMask[I] = I < NumScalars ? I : 0;
19369 continue;
19370 }
19371 if (!V2)
19372 V2 = UndefValue::get(V->getType());
19373 if (Idx >= NumScalars)
19374 Idx = NumScalars - 1;
19375 InsertMask[I] = NumScalars + Idx;
19376 ++Idx;
19377 } else if (InsertMask[I] != PoisonMaskElem &&
19378 Mask[I] == PoisonMaskElem) {
19379 InsertMask[I] = PoisonMaskElem;
19380 }
19381 }
19382 } else {
19383 InsertMask = Mask;
19384 }
19385 }
19386 if (!V2)
19387 V2 = PoisonValue::get(V->getType());
19388 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19389 if (auto *I = dyn_cast<Instruction>(V)) {
19390 GatherShuffleExtractSeq.insert(I);
19391 CSEBlocks.insert(I->getParent());
19392 }
19393 }
19394
19395 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19396 for (unsigned I = 0; I < NumElts; I++) {
19397 if (Mask[I] != PoisonMaskElem)
19398 InsertMask[Offset + I] = I;
19399 }
19400 SmallBitVector UseMask =
19401 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19402 SmallBitVector IsFirstUndef =
19403 isUndefVector(FirstInsert->getOperand(0), UseMask);
19404 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19405 NumElts != NumScalars) {
19406 if (IsFirstUndef.all()) {
19407 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19408 SmallBitVector IsFirstPoison =
19409 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19410 if (!IsFirstPoison.all()) {
19411 for (unsigned I = 0; I < NumElts; I++) {
19412 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19413 InsertMask[I] = I + NumElts;
19414 }
19415 }
19416 V = Builder.CreateShuffleVector(
19417 V,
19418 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19419 : FirstInsert->getOperand(0),
19420 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19421 if (auto *I = dyn_cast<Instruction>(V)) {
19422 GatherShuffleExtractSeq.insert(I);
19423 CSEBlocks.insert(I->getParent());
19424 }
19425 }
19426 } else {
19427 SmallBitVector IsFirstPoison =
19428 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19429 for (unsigned I = 0; I < NumElts; I++) {
19430 if (InsertMask[I] == PoisonMaskElem)
19431 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19432 else
19433 InsertMask[I] += NumElts;
19434 }
19435 V = Builder.CreateShuffleVector(
19436 FirstInsert->getOperand(0), V, InsertMask,
19437 cast<Instruction>(E->Scalars.back())->getName());
19438 if (auto *I = dyn_cast<Instruction>(V)) {
19439 GatherShuffleExtractSeq.insert(I);
19440 CSEBlocks.insert(I->getParent());
19441 }
19442 }
19443 }
19444
19445 ++NumVectorInstructions;
19446 E->VectorizedValue = V;
19447 return V;
19448 }
19449 case Instruction::ZExt:
19450 case Instruction::SExt:
19451 case Instruction::FPToUI:
19452 case Instruction::FPToSI:
19453 case Instruction::FPExt:
19454 case Instruction::PtrToInt:
19455 case Instruction::IntToPtr:
19456 case Instruction::SIToFP:
19457 case Instruction::UIToFP:
19458 case Instruction::Trunc:
19459 case Instruction::FPTrunc:
19460 case Instruction::BitCast: {
19461 setInsertPointAfterBundle(E);
19462
19463 Value *InVec = vectorizeOperand(E, 0);
19464
19465 auto *CI = cast<CastInst>(VL0);
19466 Instruction::CastOps VecOpcode = CI->getOpcode();
19467 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19468 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19469 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19470 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19471 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19472 // Check if the values are candidates to demote.
19473 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19474 if (SrcIt != MinBWs.end())
19475 SrcBWSz = SrcIt->second.first;
19476 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19477 if (BWSz == SrcBWSz) {
19478 VecOpcode = Instruction::BitCast;
19479 } else if (BWSz < SrcBWSz) {
19480 VecOpcode = Instruction::Trunc;
19481 } else if (It != MinBWs.end()) {
19482 assert(BWSz > SrcBWSz && "Invalid cast!");
19483 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19484 } else if (SrcIt != MinBWs.end()) {
19485 assert(BWSz > SrcBWSz && "Invalid cast!");
19486 VecOpcode =
19487 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19488 }
19489 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19490 !SrcIt->second.second) {
19491 VecOpcode = Instruction::UIToFP;
19492 }
19493 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19494 ? InVec
19495 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19496 V = FinalShuffle(V, E);
19497
19498 E->VectorizedValue = V;
19499 ++NumVectorInstructions;
19500 return V;
19501 }
19502 case Instruction::FCmp:
19503 case Instruction::ICmp: {
19504 setInsertPointAfterBundle(E);
19505
19506 Value *L = vectorizeOperand(E, 0);
19507 Value *R = vectorizeOperand(E, 1);
19508 if (L->getType() != R->getType()) {
19509 assert((getOperandEntry(E, 0)->isGather() ||
19510 getOperandEntry(E, 1)->isGather() ||
19511 MinBWs.contains(getOperandEntry(E, 0)) ||
19512 MinBWs.contains(getOperandEntry(E, 1))) &&
19513 "Expected item in MinBWs.");
19514 if (cast<VectorType>(L->getType())
19515 ->getElementType()
19516 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19517 ->getElementType()
19518 ->getIntegerBitWidth()) {
19519 Type *CastTy = R->getType();
19520 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19521 } else {
19522 Type *CastTy = L->getType();
19523 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19524 }
19525 }
19526
19527 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19528 Value *V = Builder.CreateCmp(P0, L, R);
19529 propagateIRFlags(V, E->Scalars, VL0);
19530 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19531 ICmp->setSameSign(/*B=*/false);
19532 // Do not cast for cmps.
19533 VecTy = cast<FixedVectorType>(V->getType());
19534 V = FinalShuffle(V, E);
19535
19536 E->VectorizedValue = V;
19537 ++NumVectorInstructions;
19538 return V;
19539 }
19540 case Instruction::Select: {
19541 setInsertPointAfterBundle(E);
19542
19543 Value *Cond = vectorizeOperand(E, 0);
19544 Value *True = vectorizeOperand(E, 1);
19545 Value *False = vectorizeOperand(E, 2);
19546 if (True->getType() != VecTy || False->getType() != VecTy) {
19547 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19548 getOperandEntry(E, 2)->isGather() ||
19549 MinBWs.contains(getOperandEntry(E, 1)) ||
19550 MinBWs.contains(getOperandEntry(E, 2))) &&
19551 "Expected item in MinBWs.");
19552 if (True->getType() != VecTy)
19553 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19554 if (False->getType() != VecTy)
19555 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19556 }
19557
19558 unsigned CondNumElements = getNumElements(Cond->getType());
19559 unsigned TrueNumElements = getNumElements(True->getType());
19560 assert(TrueNumElements >= CondNumElements &&
19561 TrueNumElements % CondNumElements == 0 &&
19562 "Cannot vectorize Instruction::Select");
19563 assert(TrueNumElements == getNumElements(False->getType()) &&
19564 "Cannot vectorize Instruction::Select");
19565 if (CondNumElements != TrueNumElements) {
19566 // When the return type is i1 but the source is fixed vector type, we
19567 // need to duplicate the condition value.
19568 Cond = Builder.CreateShuffleVector(
19569 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19570 CondNumElements));
19571 }
19572 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19573 "Cannot vectorize Instruction::Select");
19574 Value *V =
19575 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
19576 V = FinalShuffle(V, E);
19577
19578 E->VectorizedValue = V;
19579 ++NumVectorInstructions;
19580 return V;
19581 }
19582 case Instruction::FNeg: {
19583 setInsertPointAfterBundle(E);
19584
19585 Value *Op = vectorizeOperand(E, 0);
19586
19587 Value *V = Builder.CreateUnOp(
19588 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19589 propagateIRFlags(V, E->Scalars, VL0);
19590 if (auto *I = dyn_cast<Instruction>(V))
19591 V = ::propagateMetadata(I, E->Scalars);
19592
19593 V = FinalShuffle(V, E);
19594
19595 E->VectorizedValue = V;
19596 ++NumVectorInstructions;
19597
19598 return V;
19599 }
19600 case Instruction::Freeze: {
19601 setInsertPointAfterBundle(E);
19602
19603 Value *Op = vectorizeOperand(E, 0);
19604
19605 if (Op->getType() != VecTy) {
19606 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19607 MinBWs.contains(getOperandEntry(E, 0))) &&
19608 "Expected item in MinBWs.");
19609 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19610 }
19611 Value *V = Builder.CreateFreeze(Op);
19612 V = FinalShuffle(V, E);
19613
19614 E->VectorizedValue = V;
19615 ++NumVectorInstructions;
19616
19617 return V;
19618 }
19619 case Instruction::Add:
19620 case Instruction::FAdd:
19621 case Instruction::Sub:
19622 case Instruction::FSub:
19623 case Instruction::Mul:
19624 case Instruction::FMul:
19625 case Instruction::UDiv:
19626 case Instruction::SDiv:
19627 case Instruction::FDiv:
19628 case Instruction::URem:
19629 case Instruction::SRem:
19630 case Instruction::FRem:
19631 case Instruction::Shl:
19632 case Instruction::LShr:
19633 case Instruction::AShr:
19634 case Instruction::And:
19635 case Instruction::Or:
19636 case Instruction::Xor: {
19637 setInsertPointAfterBundle(E);
19638
19639 Value *LHS = vectorizeOperand(E, 0);
19640 Value *RHS = vectorizeOperand(E, 1);
19641 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19642 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19643 ArrayRef<Value *> Ops = E->getOperand(I);
19644 if (all_of(Ops, [&](Value *Op) {
19645 auto *CI = dyn_cast<ConstantInt>(Op);
19646 return CI && CI->getValue().countr_one() >= It->second.first;
19647 })) {
19648 V = FinalShuffle(I == 0 ? RHS : LHS, E);
19649 E->VectorizedValue = V;
19650 ++NumVectorInstructions;
19651 return V;
19652 }
19653 }
19654 }
19655 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19656 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19657 getOperandEntry(E, 1)->isGather() ||
19658 MinBWs.contains(getOperandEntry(E, 0)) ||
19659 MinBWs.contains(getOperandEntry(E, 1))) &&
19660 "Expected item in MinBWs.");
19661 if (LHS->getType() != VecTy)
19662 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19663 if (RHS->getType() != VecTy)
19664 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19665 }
19666
19667 Value *V = Builder.CreateBinOp(
19668 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19669 RHS);
19670 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19671 if (auto *I = dyn_cast<Instruction>(V)) {
19672 V = ::propagateMetadata(I, E->Scalars);
19673 // Drop nuw flags for abs(sub(commutative), true).
19674 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19675 any_of(E->Scalars, [](Value *V) {
19676 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19677 }))
19678 I->setHasNoUnsignedWrap(/*b=*/false);
19679 }
19680
19681 V = FinalShuffle(V, E);
19682
19683 E->VectorizedValue = V;
19684 ++NumVectorInstructions;
19685
19686 return V;
19687 }
19688 case Instruction::Load: {
19689 // Loads are inserted at the head of the tree because we don't want to
19690 // sink them all the way down past store instructions.
19691 setInsertPointAfterBundle(E);
19692
19693 LoadInst *LI = cast<LoadInst>(VL0);
19694 Instruction *NewLI;
19695 FixedVectorType *StridedLoadTy = nullptr;
19696 Value *PO = LI->getPointerOperand();
19697 if (E->State == TreeEntry::Vectorize) {
19698 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19699 } else if (E->State == TreeEntry::CompressVectorize) {
19700 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19701 CompressEntryToData.at(E);
19702 Align CommonAlignment = LI->getAlign();
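// A compressed load reads the (possibly masked) wider vector and then
// selects the requested lanes with the CompressMask shuffle emitted below.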
19703 if (IsMasked) {
19704 unsigned VF = getNumElements(LoadVecTy);
19705 SmallVector<Constant *> MaskValues(
19706 VF / getNumElements(LI->getType()),
19707 ConstantInt::getFalse(VecTy->getContext()));
19708 for (int I : CompressMask)
19709 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19710 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19711 assert(SLPReVec && "Only supported by REVEC.");
19712 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19713 }
19714 Constant *MaskValue = ConstantVector::get(MaskValues);
19715 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19716 MaskValue);
19717 } else {
19718 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19719 }
19720 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19721 // TODO: include this cost into CommonCost.
19722 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19723 assert(SLPReVec && "FixedVectorType is not expected.");
19724 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19725 CompressMask);
19726 }
19727 NewLI =
19728 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19729 } else if (E->State == TreeEntry::StridedVectorize) {
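// Strided loads are emitted as llvm.experimental.vp.strided.load. The base
// pointer comes from the first (or, for reversed order, the last) scalar
// load, and the byte stride is negated when the order is reversed.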
19730 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19731 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19732 PO = IsReverseOrder ? PtrN : Ptr0;
19733 Type *StrideTy = DL->getIndexType(PO->getType());
19734 Value *StrideVal;
19735 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19736 StridedLoadTy = SPtrInfo.Ty;
19737 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19738 unsigned StridedLoadEC =
19739 StridedLoadTy->getElementCount().getKnownMinValue();
19740
19741 Value *Stride = SPtrInfo.StrideVal;
19742 if (!Stride) {
19743 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19744 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19745 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19746 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19747 &*Builder.GetInsertPoint());
19748 }
19749 Value *NewStride =
19750 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19751 StrideVal = Builder.CreateMul(
19752 NewStride, ConstantInt::get(
19753 StrideTy, (IsReverseOrder ? -1 : 1) *
19754 static_cast<int>(
19755 DL->getTypeAllocSize(ScalarTy))));
19756 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19757 auto *Inst = Builder.CreateIntrinsic(
19758 Intrinsic::experimental_vp_strided_load,
19759 {StridedLoadTy, PO->getType(), StrideTy},
19760 {PO, StrideVal,
19761 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19762 Builder.getInt32(StridedLoadEC)});
19763 Inst->addParamAttr(
19764 /*ArgNo=*/0,
19765 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19766 NewLI = Inst;
19767 } else {
19768 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
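// Non-consecutive loads become a masked gather from the vectorized vector
// of pointers (operand 0), using the minimum alignment of the scalar loads.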
19769 Value *VecPtr = vectorizeOperand(E, 0);
19770 if (isa<FixedVectorType>(ScalarTy)) {
19771 assert(SLPReVec && "FixedVectorType is not expected.");
19772 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19773 // to expand VecPtr if ScalarTy is a vector type.
19774 unsigned ScalarTyNumElements =
19775 cast<FixedVectorType>(ScalarTy)->getNumElements();
19776 unsigned VecTyNumElements =
19777 cast<FixedVectorType>(VecTy)->getNumElements();
19778 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19779 "Cannot expand getelementptr.");
19780 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19781 SmallVector<Constant *> Indices(VecTyNumElements);
19782 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19783 return Builder.getInt64(I % ScalarTyNumElements);
19784 });
19785 VecPtr = Builder.CreateGEP(
19786 VecTy->getElementType(),
19787 Builder.CreateShuffleVector(
19788 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19789 ConstantVector::get(Indices));
19790 }
19791 // Use the minimum alignment of the gathered loads.
19792 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19793 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19794 }
19795 Value *V = E->State == TreeEntry::CompressVectorize
19796 ? NewLI
19797 : ::propagateMetadata(NewLI, E->Scalars);
19798
19799 V = FinalShuffle(V, E);
19800 E->VectorizedValue = V;
19801 ++NumVectorInstructions;
19802 return V;
19803 }
19804 case Instruction::Store: {
19805 auto *SI = cast<StoreInst>(VL0);
19806
19807 setInsertPointAfterBundle(E);
19808
19809 Value *VecValue = vectorizeOperand(E, 0);
19810 if (VecValue->getType() != VecTy)
19811 VecValue =
19812 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19813 VecValue = FinalShuffle(VecValue, E);
19814
19815 Value *Ptr = SI->getPointerOperand();
19816 Instruction *ST;
19817 if (E->State == TreeEntry::Vectorize) {
19818 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19819 } else {
19820 assert(E->State == TreeEntry::StridedVectorize &&
19821 "Expected either strided or consecutive stores.");
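// Strided stores mirror the strided-load path: emit
// llvm.experimental.vp.strided.store with a negative element-sized stride,
// using the pointer of the first store after reordering as the base.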
19822 if (!E->ReorderIndices.empty()) {
19823 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19824 Ptr = SI->getPointerOperand();
19825 }
19826 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19827 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19828 auto *Inst = Builder.CreateIntrinsic(
19829 Intrinsic::experimental_vp_strided_store,
19830 {VecTy, Ptr->getType(), StrideTy},
19831 {VecValue, Ptr,
19832 ConstantInt::get(
19833 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19834 Builder.getAllOnesMask(VecTy->getElementCount()),
19835 Builder.getInt32(E->Scalars.size())});
19836 Inst->addParamAttr(
19837 /*ArgNo=*/1,
19838 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19839 ST = Inst;
19840 }
19841
19842 Value *V = ::propagateMetadata(ST, E->Scalars);
19843
19844 E->VectorizedValue = V;
19845 ++NumVectorInstructions;
19846 return V;
19847 }
19848 case Instruction::GetElementPtr: {
19849 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19850 setInsertPointAfterBundle(E);
19851
19852 Value *Op0 = vectorizeOperand(E, 0);
19853
19854 SmallVector<Value *> OpVecs;
19855 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19856 Value *OpVec = vectorizeOperand(E, J);
19857 OpVecs.push_back(OpVec);
19858 }
19859
19860 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19861 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19862 SmallVector<Value *> GEPs;
19863 for (Value *V : E->Scalars) {
19864 if (isa<GetElementPtrInst>(V))
19865 GEPs.push_back(V);
19866 }
19867 V = ::propagateMetadata(I, GEPs);
19868 }
19869
19870 V = FinalShuffle(V, E);
19871
19872 E->VectorizedValue = V;
19873 ++NumVectorInstructions;
19874
19875 return V;
19876 }
19877 case Instruction::Call: {
19878 CallInst *CI = cast<CallInst>(VL0);
19879 setInsertPointAfterBundle(E);
19880
19881 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19882
19883 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19884 CI, ID, VecTy->getNumElements(),
19885 It != MinBWs.end() ? It->second.first : 0, TTI);
19886 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
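// Prefer the vector intrinsic only when the call maps to one and its cost
// does not exceed the cost of a vector library call.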
19887 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19888 VecCallCosts.first <= VecCallCosts.second;
19889
19890 Value *ScalarArg = nullptr;
19891 SmallVector<Value *> OpVecs;
19892 SmallVector<Type *, 2> TysForDecl;
19893 // Add return type if intrinsic is overloaded on it.
19894 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19895 TysForDecl.push_back(VecTy);
19896 auto *CEI = cast<CallInst>(VL0);
19897 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19898 // Some intrinsics have scalar arguments. This argument should not be
19899 // vectorized.
19900 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19901 ScalarArg = CEI->getArgOperand(I);
19902 // If we decided to reduce the bitwidth of the abs intrinsic, its second
19903 // argument must be set to false (do not return poison if the value is signed min).
19904 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19905 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19906 ScalarArg = Builder.getFalse();
19907 OpVecs.push_back(ScalarArg);
19908 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19909 TysForDecl.push_back(ScalarArg->getType());
19910 continue;
19911 }
19912
19913 Value *OpVec = vectorizeOperand(E, I);
19914 ScalarArg = CEI->getArgOperand(I);
19915 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19916 ScalarArg->getType()->getScalarType() &&
19917 It == MinBWs.end()) {
19918 auto *CastTy =
19919 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19920 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19921 } else if (It != MinBWs.end()) {
19922 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19923 }
19924 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19925 OpVecs.push_back(OpVec);
19926 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19927 TysForDecl.push_back(OpVec->getType());
19928 }
19929
19930 Function *CF;
19931 if (!UseIntrinsic) {
19932 VFShape Shape =
19933 VFShape::get(CI->getFunctionType(),
19934 ElementCount::getFixed(VecTy->getNumElements()),
19935 false /*HasGlobalPred*/);
19936 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19937 } else {
19938 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19939 }
19940
19941 SmallVector<OperandBundleDef, 1> OpBundles;
19942 CI->getOperandBundlesAsDefs(OpBundles);
19943 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19944
19945 propagateIRFlags(V, E->Scalars, VL0);
19946 V = FinalShuffle(V, E);
19947
19948 E->VectorizedValue = V;
19949 ++NumVectorInstructions;
19950 return V;
19951 }
19952 case Instruction::ShuffleVector: {
19953 Value *V;
19954 if (SLPReVec && !E->isAltShuffle()) {
19955 setInsertPointAfterBundle(E);
19956 Value *Src = vectorizeOperand(E, 0);
19957 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19958 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19959 SmallVector<int> NewMask(ThisMask.size());
19960 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19961 return SVSrc->getShuffleMask()[Mask];
19962 });
19963 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19964 SVSrc->getOperand(1), NewMask);
19965 } else {
19966 V = Builder.CreateShuffleVector(Src, ThisMask);
19967 }
19968 propagateIRFlags(V, E->Scalars, VL0);
19969 if (auto *I = dyn_cast<Instruction>(V))
19970 V = ::propagateMetadata(I, E->Scalars);
19971 V = FinalShuffle(V, E);
19972 } else {
19973 assert(E->isAltShuffle() &&
19974 ((Instruction::isBinaryOp(E->getOpcode()) &&
19975 Instruction::isBinaryOp(E->getAltOpcode())) ||
19976 (Instruction::isCast(E->getOpcode()) &&
19977 Instruction::isCast(E->getAltOpcode())) ||
19978 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19979 "Invalid Shuffle Vector Operand");
19980
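// An alternate-opcode node is materialized as two full-width vector
// operations (main and alternate opcode) that are blended with a
// shufflevector built from the per-lane opcode mask.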
19981 Value *LHS = nullptr, *RHS = nullptr;
19982 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19983 setInsertPointAfterBundle(E);
19984 LHS = vectorizeOperand(E, 0);
19985 RHS = vectorizeOperand(E, 1);
19986 } else {
19987 setInsertPointAfterBundle(E);
19988 LHS = vectorizeOperand(E, 0);
19989 }
19990 if (LHS && RHS &&
19991 ((Instruction::isBinaryOp(E->getOpcode()) &&
19992 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19993 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19994 assert((It != MinBWs.end() ||
19995 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19996 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19997 MinBWs.contains(getOperandEntry(E, 0)) ||
19998 MinBWs.contains(getOperandEntry(E, 1))) &&
19999 "Expected item in MinBWs.");
20000 Type *CastTy = VecTy;
20001 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20002 if (cast<VectorType>(LHS->getType())
20003 ->getElementType()
20004 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20005 ->getElementType()
20006 ->getIntegerBitWidth())
20007 CastTy = RHS->getType();
20008 else
20009 CastTy = LHS->getType();
20010 }
20011 if (LHS->getType() != CastTy)
20012 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20013 if (RHS->getType() != CastTy)
20014 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20015 }
20016
20017 Value *V0, *V1;
20018 if (Instruction::isBinaryOp(E->getOpcode())) {
20019 V0 = Builder.CreateBinOp(
20020 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20021 V1 = Builder.CreateBinOp(
20022 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20023 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20024 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20025 auto *AltCI = cast<CmpInst>(E->getAltOp());
20026 CmpInst::Predicate AltPred = AltCI->getPredicate();
20027 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20028 } else {
20029 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20030 unsigned SrcBWSz = DL->getTypeSizeInBits(
20031 cast<VectorType>(LHS->getType())->getElementType());
20032 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20033 if (BWSz <= SrcBWSz) {
20034 if (BWSz < SrcBWSz)
20035 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20036 assert(LHS->getType() == VecTy &&
20037 "Expected same type as operand.");
20038 if (auto *I = dyn_cast<Instruction>(LHS))
20039 LHS = ::propagateMetadata(I, E->Scalars);
20040 LHS = FinalShuffle(LHS, E);
20041 E->VectorizedValue = LHS;
20042 ++NumVectorInstructions;
20043 return LHS;
20044 }
20045 }
20046 V0 = Builder.CreateCast(
20047 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20048 V1 = Builder.CreateCast(
20049 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20050 }
20051 // Add V0 and V1 to later analysis to try to find and remove matching
20052 // instruction, if any.
20053 for (Value *V : {V0, V1}) {
20054 if (auto *I = dyn_cast<Instruction>(V)) {
20055 GatherShuffleExtractSeq.insert(I);
20056 CSEBlocks.insert(I->getParent());
20057 }
20058 }
20059
20060 // Create shuffle to take alternate operations from the vector.
20061 // Also, gather up main and alt scalar ops to propagate IR flags to
20062 // each vector operation.
20063 ValueList OpScalars, AltScalars;
20064 SmallVector<int> Mask;
20065 E->buildAltOpShuffleMask(
20066 [E, this](Instruction *I) {
20067 assert(E->getMatchingMainOpOrAltOp(I) &&
20068 "Unexpected main/alternate opcode");
20069 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20070 *TLI);
20071 },
20072 Mask, &OpScalars, &AltScalars);
20073
20074 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20075 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20076 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20077 // Drop nuw flags for abs(sub(commutative), true).
20078 if (auto *I = dyn_cast<Instruction>(Vec);
20079 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20080 any_of(E->Scalars, [](Value *V) {
20081 if (isa<PoisonValue>(V))
20082 return false;
20083 auto *IV = cast<Instruction>(V);
20084 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20085 }))
20086 I->setHasNoUnsignedWrap(/*b=*/false);
20087 };
20088 DropNuwFlag(V0, E->getOpcode());
20089 DropNuwFlag(V1, E->getAltOpcode());
20090
20091 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20092 assert(SLPReVec && "FixedVectorType is not expected.");
20093 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20094 }
20095 V = Builder.CreateShuffleVector(V0, V1, Mask);
20096 if (auto *I = dyn_cast<Instruction>(V)) {
20097 V = ::propagateMetadata(I, E->Scalars);
20098 GatherShuffleExtractSeq.insert(I);
20099 CSEBlocks.insert(I->getParent());
20100 }
20101 }
20102
20103 E->VectorizedValue = V;
20104 ++NumVectorInstructions;
20105
20106 return V;
20107 }
20108 default:
20109 llvm_unreachable("unknown inst");
20110 }
20111 return nullptr;
20112}
20113
20114 Value *BoUpSLP::vectorizeTree() {
20115 ExtraValueToDebugLocsMap ExternallyUsedValues;
20116 return vectorizeTree(ExternallyUsedValues);
20117}
20118
20119 Value *BoUpSLP::vectorizeTree(
20120 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20121 Instruction *ReductionRoot,
20122 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20123 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
20124 // need to rebuild it.
20125 EntryToLastInstruction.clear();
20126 // All blocks must be scheduled before any instructions are inserted.
20127 for (auto &BSIter : BlocksSchedules)
20128 scheduleBlock(*this, BSIter.second.get());
20129 // Cache last instructions for the nodes to avoid side effects, which may
20130 // appear during vectorization, like extra uses, etc.
20131 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20132 if (TE->isGather())
20133 continue;
20134 (void)getLastInstructionInBundle(TE.get());
20135 }
20136
20137 if (ReductionRoot)
20138 Builder.SetInsertPoint(ReductionRoot->getParent(),
20139 ReductionRoot->getIterator());
20140 else
20141 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20142
20143 // Vectorize gather operands of the nodes with the external uses only.
20144 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20145 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20146 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20147 TE->UserTreeIndex.UserTE->hasState() &&
20148 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20149 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20150 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20151 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20152 all_of(TE->UserTreeIndex.UserTE->Scalars,
20153 [](Value *V) { return isUsedOutsideBlock(V); })) {
20154 Instruction &LastInst =
20155 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20156 GatherEntries.emplace_back(TE.get(), &LastInst);
20157 }
20158 }
20159 for (auto &Entry : GatherEntries) {
20160 IRBuilderBase::InsertPointGuard Guard(Builder);
20161 Builder.SetInsertPoint(Entry.second);
20162 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20163 (void)vectorizeTree(Entry.first);
20164 }
20165 // Emit gathered loads first to emit better code for the users of those
20166 // gathered loads.
20167 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20168 if (GatheredLoadsEntriesFirst.has_value() &&
20169 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20170 (!TE->isGather() || TE->UserTreeIndex)) {
20171 assert((TE->UserTreeIndex ||
20172 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20173 "Expected gathered load node.");
20174 (void)vectorizeTree(TE.get());
20175 }
20176 }
20177 (void)vectorizeTree(VectorizableTree[0].get());
20178 // Run through the list of postponed gathers and emit them, replacing the
20179 // temporarily emitted allocas with actual vector instructions.
20180 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20181 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20182 for (const TreeEntry *E : PostponedNodes) {
20183 auto *TE = const_cast<TreeEntry *>(E);
20184 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20185 TE->VectorizedValue = nullptr;
20186 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20187 // If the user is a PHI node, its vector code has to be inserted right before
20188 // the block terminator. Since the node was delayed, there were unresolved
20189 // dependencies at the moment the stub instruction was emitted. If any of
20190 // these dependencies turn out to be an operand of another PHI coming from
20191 // this same block, the position of the stub instruction becomes invalid.
20192 // This is because the source vector that is supposed to feed this gather
20193 // node was inserted at the end of the block [after the stub instruction].
20194 // So we need to adjust the insertion point again to the end of the block.
20195 if (isa<PHINode>(UserI) ||
20196 (TE->UserTreeIndex.UserTE->hasState() &&
20197 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20198 // Insert before all users.
20199 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20200 for (User *U : PrevVec->users()) {
20201 if (U == UserI)
20202 continue;
20203 auto *UI = dyn_cast<Instruction>(U);
20204 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20205 continue;
20206 if (UI->comesBefore(InsertPt))
20207 InsertPt = UI;
20208 }
20209 Builder.SetInsertPoint(InsertPt);
20210 } else {
20211 Builder.SetInsertPoint(PrevVec);
20212 }
20213 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20214 Value *Vec = vectorizeTree(TE);
20215 if (auto *VecI = dyn_cast<Instruction>(Vec);
20216 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20217 Builder.GetInsertPoint()->comesBefore(VecI))
20218 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20219 Builder.GetInsertPoint());
20220 if (Vec->getType() != PrevVec->getType()) {
20221 assert(Vec->getType()->isIntOrIntVectorTy() &&
20222 PrevVec->getType()->isIntOrIntVectorTy() &&
20223 "Expected integer vector types only.");
20224 std::optional<bool> IsSigned;
20225 for (Value *V : TE->Scalars) {
20226 if (isVectorized(V)) {
20227 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20228 auto It = MinBWs.find(MNTE);
20229 if (It != MinBWs.end()) {
20230 IsSigned = IsSigned.value_or(false) || It->second.second;
20231 if (*IsSigned)
20232 break;
20233 }
20234 }
20235 if (IsSigned.value_or(false))
20236 break;
20237 // Scan through gather nodes.
20238 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20239 auto It = MinBWs.find(BVE);
20240 if (It != MinBWs.end()) {
20241 IsSigned = IsSigned.value_or(false) || It->second.second;
20242 if (*IsSigned)
20243 break;
20244 }
20245 }
20246 if (IsSigned.value_or(false))
20247 break;
20248 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20249 IsSigned =
20250 IsSigned.value_or(false) ||
20251 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20252 continue;
20253 }
20254 if (IsSigned.value_or(false))
20255 break;
20256 }
20257 }
20258 if (IsSigned.value_or(false)) {
20259 // Final attempt - check user node.
20260 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20261 if (It != MinBWs.end())
20262 IsSigned = It->second.second;
20263 }
20264 assert(IsSigned &&
20265 "Expected user node or perfect diamond match in MinBWs.");
20266 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20267 }
20268 PrevVec->replaceAllUsesWith(Vec);
20269 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20270 // Replace the stub vector node if it was already used for one of the
20271 // buildvector nodes.
20272 auto It = PostponedValues.find(PrevVec);
20273 if (It != PostponedValues.end()) {
20274 for (TreeEntry *VTE : It->getSecond())
20275 VTE->VectorizedValue = Vec;
20276 }
20277 eraseInstruction(PrevVec);
20278 }
20279
20280 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20281 << " values.\n");
20282
20283 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20284 // Maps vector instruction to original insertelement instruction
20285 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20286 // Maps extract Scalar to the corresponding extractelement instruction in the
20287 // basic block. Only one extractelement per block should be emitted.
20288 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20289 ScalarToEEs;
20290 SmallDenseSet<Value *, 4> UsedInserts;
20291 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20292 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20293 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20294 // Extract all of the elements with the external uses.
20295 for (const auto &ExternalUse : ExternalUses) {
20296 Value *Scalar = ExternalUse.Scalar;
20297 llvm::User *User = ExternalUse.User;
20298
20299 // Skip users that we already RAUW. This happens when one instruction
20300 // has multiple uses of the same value.
20301 if (User && !is_contained(Scalar->users(), User))
20302 continue;
20303 const TreeEntry *E = &ExternalUse.E;
20304 assert(E && "Invalid scalar");
20305 assert(!E->isGather() && "Extracting from a gather list");
20306 // Non-instruction pointers are not deleted, just skip them.
20307 if (E->getOpcode() == Instruction::GetElementPtr &&
20308 !isa<GetElementPtrInst>(Scalar))
20309 continue;
20310
20311 Value *Vec = E->VectorizedValue;
20312 assert(Vec && "Can't find vectorizable value");
20313
20314 Value *Lane = Builder.getInt32(ExternalUse.Lane);
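// Emits (or reuses) an extractelement for Scalar from its vectorized value
// and, if the scalar was narrowed via MinBWs, widens the extracted element
// back to the original scalar type.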
20315 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20316 if (Scalar->getType() != Vec->getType()) {
20317 Value *Ex = nullptr;
20318 Value *ExV = nullptr;
20319 auto *Inst = dyn_cast<Instruction>(Scalar);
20320 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20321 auto It = ScalarToEEs.find(Scalar);
20322 if (It != ScalarToEEs.end()) {
20323 // No need to emit many extracts, just move the only one in the
20324 // current block.
20325 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20326 : Builder.GetInsertBlock());
20327 if (EEIt != It->second.end()) {
20328 Value *PrevV = EEIt->second.first;
20329 if (auto *I = dyn_cast<Instruction>(PrevV);
20330 I && !ReplaceInst &&
20331 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20332 Builder.GetInsertPoint()->comesBefore(I)) {
20333 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20334 Builder.GetInsertPoint());
20335 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20336 CI->moveAfter(I);
20337 }
20338 Ex = PrevV;
20339 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20340 }
20341 }
20342 if (!Ex) {
20343 // "Reuse" the existing extract to improve final codegen.
20344 if (ReplaceInst) {
20346 // Leave the instruction as is if extracting is cheaper and all
20347 // operands are scalar.
20347 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20348 IgnoredExtracts.insert(EE);
20349 Ex = EE;
20350 } else {
20351 auto *CloneInst = Inst->clone();
20352 CloneInst->insertBefore(Inst->getIterator());
20353 if (Inst->hasName())
20354 CloneInst->takeName(Inst);
20355 Ex = CloneInst;
20356 }
20357 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20358 ES && isa<Instruction>(Vec)) {
20359 Value *V = ES->getVectorOperand();
20360 auto *IVec = cast<Instruction>(Vec);
20361 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20362 V = ETEs.front()->VectorizedValue;
20363 if (auto *IV = dyn_cast<Instruction>(V);
20364 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20365 IV->comesBefore(IVec))
20366 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20367 else
20368 Ex = Builder.CreateExtractElement(Vec, Lane);
20369 } else if (auto *VecTy =
20370 dyn_cast<FixedVectorType>(Scalar->getType())) {
20371 assert(SLPReVec && "FixedVectorType is not expected.");
20372 unsigned VecTyNumElements = VecTy->getNumElements();
20373 // When REVEC is enabled, we need to extract a vector.
20374 // Note: The element size of Scalar may be different from the
20375 // element size of Vec.
20376 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20377 ExternalUse.Lane * VecTyNumElements);
20378 } else {
20379 Ex = Builder.CreateExtractElement(Vec, Lane);
20380 }
20381 // If necessary, sign-extend or zero-extend ScalarRoot
20382 // to the larger type.
20383 ExV = Ex;
20384 if (Scalar->getType() != Ex->getType())
20385 ExV = Builder.CreateIntCast(
20386 Ex, Scalar->getType(),
20387 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20388 auto *I = dyn_cast<Instruction>(Ex);
20389 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20390 : &F->getEntryBlock(),
20391 std::make_pair(Ex, ExV));
20392 }
20393 // The then branch of the previous if may produce constants, since 0
20394 // operand might be a constant.
20395 if (auto *ExI = dyn_cast<Instruction>(Ex);
20396 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20397 GatherShuffleExtractSeq.insert(ExI);
20398 CSEBlocks.insert(ExI->getParent());
20399 }
20400 return ExV;
20401 }
20402 assert(isa<FixedVectorType>(Scalar->getType()) &&
20403 isa<InsertElementInst>(Scalar) &&
20404 "In-tree scalar of vector type is not insertelement?");
20405 auto *IE = cast<InsertElementInst>(Scalar);
20406 VectorToInsertElement.try_emplace(Vec, IE);
20407 return Vec;
20408 };
20409 // If User == nullptr, the Scalar remains as scalar in vectorized
20410 // instructions or is used as extra arg. Generate ExtractElement instruction
20411 // and update the record for this scalar in ExternallyUsedValues.
20412 if (!User) {
20413 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20414 continue;
20415 assert(
20416 (ExternallyUsedValues.count(Scalar) ||
20417 ExternalUsesWithNonUsers.count(Scalar) ||
20418 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20419 any_of(
20420 Scalar->users(),
20421 [&, TTI = TTI](llvm::User *U) {
20422 if (ExternalUsesAsOriginalScalar.contains(U))
20423 return true;
20424 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20425 return !UseEntries.empty() &&
20426 (E->State == TreeEntry::Vectorize ||
20427 E->State == TreeEntry::StridedVectorize ||
20428 E->State == TreeEntry::CompressVectorize) &&
20429 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20430 return (UseEntry->State == TreeEntry::Vectorize ||
20431 UseEntry->State ==
20432 TreeEntry::StridedVectorize ||
20433 UseEntry->State ==
20434 TreeEntry::CompressVectorize) &&
20435 doesInTreeUserNeedToExtract(
20436 Scalar, getRootEntryInstruction(*UseEntry),
20437 TLI, TTI);
20438 });
20439 })) &&
20440 "Scalar with nullptr User must be registered in "
20441 "ExternallyUsedValues map or remain as scalar in vectorized "
20442 "instructions");
20443 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20444 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20445 if (PHI->getParent()->isLandingPad())
20446 Builder.SetInsertPoint(
20447 PHI->getParent(),
20448 std::next(
20449 PHI->getParent()->getLandingPadInst()->getIterator()));
20450 else
20451 Builder.SetInsertPoint(PHI->getParent(),
20452 PHI->getParent()->getFirstNonPHIIt());
20453 } else {
20454 Builder.SetInsertPoint(VecI->getParent(),
20455 std::next(VecI->getIterator()));
20456 }
20457 } else {
20458 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20459 }
20460 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20461 // Required to update internally referenced instructions.
20462 if (Scalar != NewInst) {
20463 assert((!isa<ExtractElementInst>(Scalar) ||
20464 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20465 "Extractelements should not be replaced.");
20466 Scalar->replaceAllUsesWith(NewInst);
20467 }
20468 continue;
20469 }
20470
20471 if (auto *VU = dyn_cast<InsertElementInst>(User);
20472 VU && VU->getOperand(1) == Scalar) {
20473 // Skip if the scalar is another vector op or Vec is not an instruction.
20474 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20475 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20476 if (!UsedInserts.insert(VU).second)
20477 continue;
20478 // Need to use original vector, if the root is truncated.
20479 auto BWIt = MinBWs.find(E);
20480 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20481 auto *ScalarTy = FTy->getElementType();
20482 auto Key = std::make_pair(Vec, ScalarTy);
20483 auto VecIt = VectorCasts.find(Key);
20484 if (VecIt == VectorCasts.end()) {
20485 IRBuilderBase::InsertPointGuard Guard(Builder);
20486 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20487 if (IVec->getParent()->isLandingPad())
20488 Builder.SetInsertPoint(IVec->getParent(),
20489 std::next(IVec->getParent()
20490 ->getLandingPadInst()
20491 ->getIterator()));
20492 else
20493 Builder.SetInsertPoint(
20494 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20495 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20496 Builder.SetInsertPoint(IVec->getNextNode());
20497 }
20498 Vec = Builder.CreateIntCast(
20499 Vec,
20500 getWidenedType(
20501 ScalarTy,
20502 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20503 BWIt->second.second);
20504 VectorCasts.try_emplace(Key, Vec);
20505 } else {
20506 Vec = VecIt->second;
20507 }
20508 }
20509
20510 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20511 if (InsertIdx) {
20512 auto *It = find_if(
20513 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20514 // Checks if 2 insertelements are from the same buildvector.
20515 InsertElementInst *VecInsert = Data.InsertElements.front();
20516 return areTwoInsertFromSameBuildVector(
20517 VU, VecInsert,
20518 [](InsertElementInst *II) { return II->getOperand(0); });
20519 });
20520 unsigned Idx = *InsertIdx;
20521 if (It == ShuffledInserts.end()) {
20522 (void)ShuffledInserts.emplace_back();
20523 It = std::next(ShuffledInserts.begin(),
20524 ShuffledInserts.size() - 1);
20525 }
20526 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20527 if (Mask.empty())
20528 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20529 Mask[Idx] = ExternalUse.Lane;
20530 It->InsertElements.push_back(cast<InsertElementInst>(User));
20531 continue;
20532 }
20533 }
20534 }
20535 }
20536
20537 // Generate extracts for out-of-tree users.
20538 // Find the insertion point for the extractelement lane.
20539 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20540 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20541 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20542 if (PH->getIncomingValue(I) == Scalar) {
20543 Instruction *IncomingTerminator =
20544 PH->getIncomingBlock(I)->getTerminator();
20545 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20546 Builder.SetInsertPoint(VecI->getParent(),
20547 std::next(VecI->getIterator()));
20548 } else {
20549 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20550 }
20551 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20552 PH->setOperand(I, NewInst);
20553 }
20554 }
20555 } else {
20556 Builder.SetInsertPoint(cast<Instruction>(User));
20557 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20558 User->replaceUsesOfWith(Scalar, NewInst);
20559 }
20560 } else {
20561 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20562 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20563 User->replaceUsesOfWith(Scalar, NewInst);
20564 }
20565
20566 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20567 }
20568
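// Builds a shuffle of V1/V2 from a combined mask: indices below VF select
// from V1, indices of VF and above select from V2.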
20569 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20570 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20571 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20572 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20573 for (int I = 0, E = Mask.size(); I < E; ++I) {
20574 if (Mask[I] < VF)
20575 CombinedMask1[I] = Mask[I];
20576 else
20577 CombinedMask2[I] = Mask[I] - VF;
20578 }
20579 ShuffleInstructionBuilder ShuffleBuilder(
20580 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20581 ShuffleBuilder.add(V1, CombinedMask1);
20582 if (V2)
20583 ShuffleBuilder.add(V2, CombinedMask2);
20584 return ShuffleBuilder.finalize({}, {}, {});
20585 };
20586
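// Resizes Vec to the vector factor expected by Mask; the returned flag
// indicates that the mask has already been applied to the result.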
20587 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20588 bool ForSingleMask) {
20589 unsigned VF = Mask.size();
20590 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20591 if (VF != VecVF) {
20592 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20593 Vec = CreateShuffle(Vec, nullptr, Mask);
20594 return std::make_pair(Vec, true);
20595 }
20596 if (!ForSingleMask) {
20597 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20598 for (unsigned I = 0; I < VF; ++I) {
20599 if (Mask[I] != PoisonMaskElem)
20600 ResizeMask[Mask[I]] = Mask[I];
20601 }
20602 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20603 }
20604 }
20605
20606 return std::make_pair(Vec, false);
20607 };
20608 // Perform shuffling of the vectorized tree entries for better handling of
20609 // external extracts.
20610 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20611 // Find the first and the last instruction in the list of insertelements.
20612 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20613 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20614 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20615 Builder.SetInsertPoint(LastInsert);
20616 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20617 Value *NewInst = performExtractsShuffleAction<Value>(
20618 MutableArrayRef(Vector.data(), Vector.size()),
20619 FirstInsert->getOperand(0),
20620 [](Value *Vec) {
20621 return cast<VectorType>(Vec->getType())
20622 ->getElementCount()
20623 .getKnownMinValue();
20624 },
20625 ResizeToVF,
20626 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20627 ArrayRef<Value *> Vals) {
20628 assert((Vals.size() == 1 || Vals.size() == 2) &&
20629 "Expected exactly 1 or 2 input values.");
20630 if (Vals.size() == 1) {
20631 // Do not create shuffle if the mask is a simple identity
20632 // non-resizing mask.
20633 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20634 ->getNumElements() ||
20635 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20636 return CreateShuffle(Vals.front(), nullptr, Mask);
20637 return Vals.front();
20638 }
20639 return CreateShuffle(Vals.front() ? Vals.front()
20640 : FirstInsert->getOperand(0),
20641 Vals.back(), Mask);
20642 });
20643 auto It = ShuffledInserts[I].InsertElements.rbegin();
20644 // Rebuild buildvector chain.
20645 InsertElementInst *II = nullptr;
20646 if (It != ShuffledInserts[I].InsertElements.rend())
20647 II = *It;
20648 SmallVector<Instruction *> Inserts;
20649 while (It != ShuffledInserts[I].InsertElements.rend()) {
20650 assert(II && "Must be an insertelement instruction.");
20651 if (*It == II)
20652 ++It;
20653 else
20654 Inserts.push_back(cast<Instruction>(II));
20655 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20656 }
20657 for (Instruction *II : reverse(Inserts)) {
20658 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20659 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20660 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20661 II->moveAfter(NewI);
20662 NewInst = II;
20663 }
20664 LastInsert->replaceAllUsesWith(NewInst);
20665 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20666 IE->replaceUsesOfWith(IE->getOperand(0),
20667 PoisonValue::get(IE->getOperand(0)->getType()));
20668 IE->replaceUsesOfWith(IE->getOperand(1),
20669 PoisonValue::get(IE->getOperand(1)->getType()));
20670 eraseInstruction(IE);
20671 }
20672 CSEBlocks.insert(LastInsert->getParent());
20673 }
20674
20675 SmallVector<Instruction *> RemovedInsts;
20676 // For each vectorized value:
20677 for (auto &TEPtr : VectorizableTree) {
20678 TreeEntry *Entry = TEPtr.get();
20679
20680 // No need to handle users of gathered values.
20681 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20682 continue;
20683
20684 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20685
20686 // For each lane:
20687 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20688 Value *Scalar = Entry->Scalars[Lane];
20689
20690 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20691 !isa<GetElementPtrInst>(Scalar))
20692 continue;
20693 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20694 EE && IgnoredExtracts.contains(EE))
20695 continue;
20696 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20697 continue;
20698#ifndef NDEBUG
20699 Type *Ty = Scalar->getType();
20700 if (!Ty->isVoidTy()) {
20701 for (User *U : Scalar->users()) {
20702 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20703
20704 // It is legal to delete users in the ignorelist.
20705 assert((isVectorized(U) ||
20706 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20707 (isa_and_nonnull<Instruction>(U) &&
20708 isDeleted(cast<Instruction>(U)))) &&
20709 "Deleting out-of-tree value");
20710 }
20711 }
20712#endif
20713 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20714 auto *I = cast<Instruction>(Scalar);
20715 RemovedInsts.push_back(I);
20716 }
20717 }
20718
20719 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20720 // new vector instruction.
20721 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20722 V->mergeDIAssignID(RemovedInsts);
20723
20724 // Clear up reduction references, if any.
20725 if (UserIgnoreList) {
20726 for (Instruction *I : RemovedInsts) {
20727 const TreeEntry *IE = getTreeEntries(I).front();
20728 if (IE->Idx != 0 &&
20729 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20730 (ValueToGatherNodes.lookup(I).contains(
20731 VectorizableTree.front().get()) ||
20732 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20733 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20734 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20735 IE->UserTreeIndex &&
20736 is_contained(VectorizableTree.front()->Scalars, I)) &&
20737 !(GatheredLoadsEntriesFirst.has_value() &&
20738 IE->Idx >= *GatheredLoadsEntriesFirst &&
20739 VectorizableTree.front()->isGather() &&
20740 is_contained(VectorizableTree.front()->Scalars, I)) &&
20741 !(!VectorizableTree.front()->isGather() &&
20742 VectorizableTree.front()->isCopyableElement(I)))
20743 continue;
20744 SmallVector<SelectInst *> LogicalOpSelects;
20745 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20746 // Do not replace the condition of a logical op in the form select <cond>.
20747 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20748 (match(U.getUser(), m_LogicalAnd()) ||
20749 match(U.getUser(), m_LogicalOr())) &&
20750 U.getOperandNo() == 0;
20751 if (IsPoisoningLogicalOp) {
20752 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20753 return false;
20754 }
20755 return UserIgnoreList->contains(U.getUser());
20756 });
20757 // Replace conditions of the poisoning logical ops with the non-poison
20758 // constant value.
20759 for (SelectInst *SI : LogicalOpSelects)
20760 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
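// For instance (illustrative IR): if %cond was vectorized and removed, a user
// select i1 %cond, i1 %a, i1 false (a logical and) would be poisoned if its
// condition were replaced like other uses; resetting the condition to false
// keeps the select well defined.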
20761 }
20762 }
20763 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20764 // cache correctness.
20765 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
20766 // - instructions are not deleted until later.
20767 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20768
20769 Builder.ClearInsertionPoint();
20770 InstrElementSize.clear();
20771
20772 const TreeEntry &RootTE = *VectorizableTree.front();
20773 Value *Vec = RootTE.VectorizedValue;
20774 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20775 It != MinBWs.end() &&
20776 ReductionBitWidth != It->second.first) {
20777 IRBuilder<>::InsertPointGuard Guard(Builder);
20778 Builder.SetInsertPoint(ReductionRoot->getParent(),
20779 ReductionRoot->getIterator());
20780 Vec = Builder.CreateIntCast(
20781 Vec,
20782 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20783 cast<VectorType>(Vec->getType())->getElementCount()),
20784 It->second.second);
20785 }
20786 return Vec;
20787}
20788
20789 void BoUpSLP::optimizeGatherSequence() {
20790 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20791 << " gather sequences instructions.\n");
20792 // LICM InsertElementInst sequences.
20793 for (Instruction *I : GatherShuffleExtractSeq) {
20794 if (isDeleted(I))
20795 continue;
20796
20797 // Check if this block is inside a loop.
20798 Loop *L = LI->getLoopFor(I->getParent());
20799 if (!L)
20800 continue;
20801
20802 // Check if it has a preheader.
20803 BasicBlock *PreHeader = L->getLoopPreheader();
20804 if (!PreHeader)
20805 continue;
20806
20807 // If the vector or the element that we insert into it are
20808 // instructions that are defined inside this loop, then we can't
20809 // hoist this instruction.
20810 if (any_of(I->operands(), [L](Value *V) {
20811 auto *OpI = dyn_cast<Instruction>(V);
20812 return OpI && L->contains(OpI);
20813 }))
20814 continue;
20815
20816 // We can hoist this instruction. Move it to the pre-header.
20817 I->moveBefore(PreHeader->getTerminator()->getIterator());
20818 CSEBlocks.insert(PreHeader);
20819 }
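// For instance (illustrative IR): a gather instruction such as
//   %ie = insertelement <4 x i32> poison, i32 %x, i32 0
// that sits inside a loop but whose operands are all defined outside of it is
// moved up into the loop preheader by the code above.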
20820
20821 // Make a list of all reachable blocks in our CSE queue.
20822 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20823 CSEWorkList.reserve(CSEBlocks.size());
20824 for (BasicBlock *BB : CSEBlocks)
20825 if (DomTreeNode *N = DT->getNode(BB)) {
20826 assert(DT->isReachableFromEntry(N));
20827 CSEWorkList.push_back(N);
20828 }
20829
20830 // Sort blocks by domination. This ensures we visit a block after all blocks
20831 // dominating it are visited.
20832 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20833 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20834 "Different nodes should have different DFS numbers");
20835 return A->getDFSNumIn() < B->getDFSNumIn();
20836 });
20837
20838 // Less defined shuffles can be replaced by the more defined copies.
20839 // Of two shuffles with the same vector operands, one is less defined if
20840 // each of its mask indices either matches the corresponding index of the
20841 // other shuffle or is undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is
20842 // less defined than shuffle %0, poison, <0, 0, 0, 0>.
20843 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20844 Instruction *I2,
20845 SmallVectorImpl<int> &NewMask) {
20846 if (I1->getType() != I2->getType())
20847 return false;
20848 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20849 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20850 if (!SI1 || !SI2)
20851 return I1->isIdenticalTo(I2);
20852 if (SI1->isIdenticalTo(SI2))
20853 return true;
20854 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20855 if (SI1->getOperand(I) != SI2->getOperand(I))
20856 return false;
20857 // Check if the second instruction is more defined than the first one.
20858 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20859 ArrayRef<int> SM1 = SI1->getShuffleMask();
20860 // Count trailing undefs in the mask to check the final number of used
20861 // registers.
20862 unsigned LastUndefsCnt = 0;
20863 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20864 if (SM1[I] == PoisonMaskElem)
20865 ++LastUndefsCnt;
20866 else
20867 LastUndefsCnt = 0;
20868 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20869 NewMask[I] != SM1[I])
20870 return false;
20871 if (NewMask[I] == PoisonMaskElem)
20872 NewMask[I] = SM1[I];
20873 }
20874 // Check if the last undefs actually change the final number of used vector
20875 // registers.
20876 return SM1.size() - LastUndefsCnt > 1 &&
20877 ::getNumberOfParts(*TTI, SI1->getType()) ==
20878 ::getNumberOfParts(
20879 *TTI, getWidenedType(SI1->getType()->getElementType(),
20880 SM1.size() - LastUndefsCnt));
20881 };
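// Worked example (illustrative): with identical operands, SI1 with mask
// <0, poison, 2, poison> and SI2 with mask <0, 1, poison, poison> merge into
// NewMask <0, 1, 2, poison>, so the less defined shuffle can be replaced by
// the other one with the merged mask.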
20882 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20883 // instructions. TODO: We can further optimize this scan if we split the
20884 // instructions into different buckets based on the insert lane.
20885 SmallVector<Instruction *, 16> Visited;
20886 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20887 assert(*I &&
20888 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20889 "Worklist not sorted properly!");
20890 BasicBlock *BB = (*I)->getBlock();
20891 // For all instructions in blocks containing gather sequences:
20892 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20893 if (isDeleted(&In))
20894 continue;
20895 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20896 !GatherShuffleExtractSeq.contains(&In))
20897 continue;
20898
20899 // Check if we can replace this instruction with any of the
20900 // visited instructions.
20901 bool Replaced = false;
20902 for (Instruction *&V : Visited) {
20903 SmallVector<int> NewMask;
20904 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20905 DT->dominates(V->getParent(), In.getParent())) {
20906 In.replaceAllUsesWith(V);
20907 eraseInstruction(&In);
20908 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20909 if (!NewMask.empty())
20910 SI->setShuffleMask(NewMask);
20911 Replaced = true;
20912 break;
20913 }
20915 GatherShuffleExtractSeq.contains(V) &&
20916 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20917 DT->dominates(In.getParent(), V->getParent())) {
20918 In.moveAfter(V);
20919 V->replaceAllUsesWith(&In);
20920 eraseInstruction(V);
20921 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20922 if (!NewMask.empty())
20923 SI->setShuffleMask(NewMask);
20924 V = &In;
20925 Replaced = true;
20926 break;
20927 }
20928 }
20929 if (!Replaced) {
20930 assert(!is_contained(Visited, &In));
20931 Visited.push_back(&In);
20932 }
20933 }
20934 }
20935 CSEBlocks.clear();
20936 GatherShuffleExtractSeq.clear();
20937}
20938
20939BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20940 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20941 auto &BundlePtr =
20942 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20943 for (Value *V : VL) {
20944 if (S.isNonSchedulable(V))
20945 continue;
20946 auto *I = cast<Instruction>(V);
20947 if (S.isCopyableElement(V)) {
20948 // Add a copyable element model.
20949 ScheduleCopyableData &SD =
20950 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20951 // Group the instructions to a bundle.
20952 BundlePtr->add(&SD);
20953 continue;
20954 }
20955 ScheduleData *BundleMember = getScheduleData(V);
20956 assert(BundleMember && "no ScheduleData for bundle member "
20957 "(maybe not in same basic block)");
20958 // Group the instructions to a bundle.
20959 BundlePtr->add(BundleMember);
20960 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20961 BundlePtr.get());
20962 }
20963 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20964 return *BundlePtr;
20965}
20966
20967// Groups the instructions into a bundle (which then forms a single scheduling
20968// entity) and schedules instructions until the bundle gets ready.
20969std::optional<BoUpSLP::ScheduleBundle *>
20970BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20971 const InstructionsState &S,
20972 const EdgeInfo &EI) {
20973 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20974 // instructions.
20975 if (isa<PHINode>(S.getMainOp()) ||
20976 isVectorLikeInstWithConstOps(S.getMainOp()))
20977 return nullptr;
20978 // If the parent node is non-schedulable, the current node is copyable, and
20979 // any of the parent's instructions is used across several basic blocks or in
20980 // a bin-op node, cancel scheduling: it may cause wrong def-use deps in the
20981 // analysis, leading to a crash.
20982 // Non-scheduled nodes may not have an associated ScheduleData model, which
20983 // may lead to a skipped dep analysis.
20984 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
20985 EI.UserTE->doesNotNeedToSchedule() &&
20986 EI.UserTE->getOpcode() != Instruction::PHI &&
20987 any_of(EI.UserTE->Scalars, [](Value *V) {
20988 auto *I = dyn_cast<Instruction>(V);
20989 if (!I || I->hasOneUser())
20990 return false;
20991 for (User *U : I->users()) {
20992 auto *UI = cast<Instruction>(U);
20993 if (isa<BinaryOperator>(UI))
20994 return true;
20995 }
20996 return false;
20997 }))
20998 return std::nullopt;
20999 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21000 EI.UserTE->hasCopyableElements() &&
21001 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21002 all_of(VL, [&](Value *V) {
21003 if (S.isCopyableElement(V))
21004 return true;
21005 return isUsedOutsideBlock(V);
21006 }))
21007 return std::nullopt;
21008 bool HasCopyables = S.areInstructionsWithCopyableElements();
21009 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21010 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21011 // If all operands were replaced by copyables, the operands of this node
21012 // might not have been, so the dependencies of the schedule data that was
21013 // replaced by copyable schedule data need to be recalculated.
21014 SmallVector<ScheduleData *> ControlDependentMembers;
21015 for (Value *V : VL) {
21016 auto *I = dyn_cast<Instruction>(V);
21017 if (!I || (HasCopyables && S.isCopyableElement(V)))
21018 continue;
21019 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21020 for (const Use &U : I->operands()) {
21021 unsigned &NumOps =
21022 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21023 .first->getSecond();
21024 ++NumOps;
21025 if (auto *Op = dyn_cast<Instruction>(U.get());
21026 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21027 if (ScheduleData *OpSD = getScheduleData(Op);
21028 OpSD && OpSD->hasValidDependencies()) {
21029 OpSD->clearDirectDependencies();
21030 if (RegionHasStackSave ||
21032 ControlDependentMembers.push_back(OpSD);
21033 }
21034 }
21035 }
21036 }
21037 if (!ControlDependentMembers.empty()) {
21038 ScheduleBundle Invalid = ScheduleBundle::invalid();
21039 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
21040 ControlDependentMembers);
21041 }
21042 return nullptr;
21043 }
21044
21045 // Initialize the instruction bundle.
21046 Instruction *OldScheduleEnd = ScheduleEnd;
21047 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21048
21049 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21050 // Clear deps or recalculate the region, if the memory instruction is a
21051 // copyable. It may have memory deps, which must be recalculated.
21052 SmallVector<ScheduleData *> ControlDependentMembers;
21053 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21054 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21055 for (ScheduleEntity *SE : Bundle.getBundle()) {
21056 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21057 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21058 BundleMember && BundleMember->hasValidDependencies()) {
21059 BundleMember->clearDirectDependencies();
21060 if (RegionHasStackSave ||
21061 !isGuaranteedToTransferExecutionToSuccessor(
21062 BundleMember->getInst()))
21063 ControlDependentMembers.push_back(BundleMember);
21064 }
21065 continue;
21066 }
21067 auto *SD = cast<ScheduleData>(SE);
21068 if (SD->hasValidDependencies() &&
21069 (!S.areInstructionsWithCopyableElements() ||
21070 !S.isCopyableElement(SD->getInst())) &&
21071 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21072 EI.UserTE->hasState() &&
21073 (!EI.UserTE->hasCopyableElements() ||
21074 !EI.UserTE->isCopyableElement(SD->getInst())))
21075 SD->clearDirectDependencies();
21076 for (const Use &U : SD->getInst()->operands()) {
21077 unsigned &NumOps =
21078 UserOpToNumOps
21079 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21080 .first->getSecond();
21081 ++NumOps;
21082 if (auto *Op = dyn_cast<Instruction>(U.get());
21083 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21084 *SLP, NumOps)) {
21085 if (ScheduleData *OpSD = getScheduleData(Op);
21086 OpSD && OpSD->hasValidDependencies()) {
21087 OpSD->clearDirectDependencies();
21088 if (RegionHasStackSave ||
21090 ControlDependentMembers.push_back(OpSD);
21091 }
21092 }
21093 }
21094 }
21095 };
21096 // The scheduling region got new instructions at the lower end (or it is a
21097 // new region for the first bundle). This makes it necessary to
21098 // recalculate all dependencies.
21099 // It is seldom that this needs to be done a second time after adding the
21100 // initial bundle to the region.
21101 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21102 for_each(ScheduleDataMap, [&](auto &P) {
21103 if (BB != P.first->getParent())
21104 return;
21105 ScheduleData *SD = P.second;
21106 if (isInSchedulingRegion(*SD))
21107 SD->clearDependencies();
21108 });
21109 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21110 for_each(P.second, [&](ScheduleCopyableData *SD) {
21111 if (isInSchedulingRegion(*SD))
21112 SD->clearDependencies();
21113 });
21114 });
21115 ReSchedule = true;
21116 }
21117 // Check if the bundle data already has deps for copyable elements. In
21118 // this case the deps need to be reset and recalculated.
21119 if (Bundle && !Bundle.getBundle().empty()) {
21120 if (S.areInstructionsWithCopyableElements() ||
21121 !ScheduleCopyableDataMap.empty())
21122 CheckIfNeedToClearDeps(Bundle);
21123 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21124 << BB->getName() << "\n");
21125 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21126 ControlDependentMembers);
21127 } else if (!ControlDependentMembers.empty()) {
21128 ScheduleBundle Invalid = ScheduleBundle::invalid();
21129 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21130 ControlDependentMembers);
21131 }
21132
21133 if (ReSchedule) {
21134 resetSchedule();
21135 initialFillReadyList(ReadyInsts);
21136 }
21137
21138 // Now try to schedule the new bundle or (if no bundle) just calculate
21139 // dependencies. As soon as the bundle is "ready" it means that there are no
21140 // cyclic dependencies and we can schedule it. Note that it's important that
21141 // we don't actually "schedule" the bundle yet.
21142 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21143 !ReadyInsts.empty()) {
21144 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21145 assert(Picked->isReady() && "must be ready to schedule");
21146 schedule(*SLP, S, EI, Picked, ReadyInsts);
21147 if (Picked == &Bundle)
21148 break;
21149 }
21150 };
21151
21152 // Make sure that the scheduling region contains all
21153 // instructions of the bundle.
21154 for (Value *V : VL) {
21155 if (S.isNonSchedulable(V))
21156 continue;
21157 if (!extendSchedulingRegion(V, S)) {
21158 // If the scheduling region got new instructions at the lower end (or it
21159 // is a new region for the first bundle), it is necessary to
21160 // recalculate all dependencies.
21161 // Otherwise the compiler may crash trying to incorrectly calculate
21162 // dependencies and may emit instructions in the wrong order at the actual
21163 // scheduling.
21164 ScheduleBundle Invalid = ScheduleBundle::invalid();
21165 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21166 return std::nullopt;
21167 }
21168 }
21169
21170 bool ReSchedule = false;
21171 for (Value *V : VL) {
21172 if (S.isNonSchedulable(V))
21173 continue;
21174 SmallVector<ScheduleCopyableData *> CopyableData =
21175 getScheduleCopyableData(cast<Instruction>(V));
21176 if (!CopyableData.empty()) {
21177 for (ScheduleCopyableData *SD : CopyableData)
21178 ReadyInsts.remove(SD);
21179 }
21180 ScheduleData *BundleMember = getScheduleData(V);
21181 assert((BundleMember || S.isCopyableElement(V)) &&
21182 "no ScheduleData for bundle member (maybe not in same basic block)");
21183 if (!BundleMember)
21184 continue;
21185
21186 // Make sure we don't leave the pieces of the bundle in the ready list when
21187 // the whole bundle might not be ready.
21188 ReadyInsts.remove(BundleMember);
21189 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21190 !Bundles.empty()) {
21191 for (ScheduleBundle *B : Bundles)
21192 ReadyInsts.remove(B);
21193 }
21194
21195 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21196 continue;
21197 // A bundle member was scheduled as a single instruction before and now
21198 // needs to be scheduled as part of the bundle. We just get rid of the
21199 // existing schedule.
21200 // Likewise, if a bundle member had its deps calculated before it became a
21201 // copyable element, we need to reschedule.
21202 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21203 << " was already scheduled\n");
21204 ReSchedule = true;
21205 }
21206
21207 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21208 TryScheduleBundleImpl(ReSchedule, Bundle);
21209 if (!Bundle.isReady()) {
21210 for (ScheduleEntity *BD : Bundle.getBundle()) {
21211 // Copyable data scheduling is just removed.
21212 if (isa<ScheduleCopyableData>(BD))
21213 continue;
21214 if (BD->isReady()) {
21215 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21216 if (Bundles.empty()) {
21217 ReadyInsts.insert(BD);
21218 continue;
21219 }
21220 for (ScheduleBundle *B : Bundles)
21221 if (B->isReady())
21222 ReadyInsts.insert(B);
21223 }
21224 }
21225 ScheduledBundlesList.pop_back();
21226 SmallVector<ScheduleData *> ControlDependentMembers;
21227 SmallPtrSet<Instruction *, 4> Visited;
21228 for (Value *V : VL) {
21229 if (S.isNonSchedulable(V))
21230 continue;
21231 auto *I = cast<Instruction>(V);
21232 if (S.isCopyableElement(I)) {
21233 // Remove the copyable data from the scheduling region and restore
21234 // previous mappings.
21235 auto KV = std::make_pair(EI, I);
21236 assert(ScheduleCopyableDataMap.contains(KV) &&
21237 "no ScheduleCopyableData for copyable element");
21238 ScheduleCopyableData *SD =
21239 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21240 ScheduleCopyableDataMapByUsers[I].remove(SD);
21241 if (EI.UserTE) {
21242 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21243 const auto *It = find(Op, I);
21244 assert(It != Op.end() && "Lane not set");
21245 SmallPtrSet<Instruction *, 4> Visited;
21246 do {
21247 int Lane = std::distance(Op.begin(), It);
21248 assert(Lane >= 0 && "Lane not set");
21249 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21250 !EI.UserTE->ReorderIndices.empty())
21251 Lane = EI.UserTE->ReorderIndices[Lane];
21252 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21253 "Couldn't find extract lane");
21254 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21255 if (!Visited.insert(In).second) {
21256 It = find(make_range(std::next(It), Op.end()), I);
21257 break;
21258 }
21259 ScheduleCopyableDataMapByInstUser
21260 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21261 .pop_back();
21262 It = find(make_range(std::next(It), Op.end()), I);
21263 } while (It != Op.end());
21264 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21265 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21266 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21267 }
21268 if (ScheduleCopyableDataMapByUsers[I].empty())
21269 ScheduleCopyableDataMapByUsers.erase(I);
21270 ScheduleCopyableDataMap.erase(KV);
21271 // Need to recalculate dependencies for the actual schedule data.
21272 if (ScheduleData *OpSD = getScheduleData(I);
21273 OpSD && OpSD->hasValidDependencies()) {
21274 OpSD->clearDirectDependencies();
21275 if (RegionHasStackSave ||
21277 ControlDependentMembers.push_back(OpSD);
21278 }
21279 continue;
21280 }
21281 ScheduledBundles.find(I)->getSecond().pop_back();
21282 }
21283 if (!ControlDependentMembers.empty()) {
21284 ScheduleBundle Invalid = ScheduleBundle::invalid();
21285 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21286 ControlDependentMembers);
21287 }
21288 return std::nullopt;
21289 }
21290 return &Bundle;
21291}
21292
21293BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21294 // Allocate a new ScheduleData for the instruction.
21295 if (ChunkPos >= ChunkSize) {
21296 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21297 ChunkPos = 0;
21298 }
21299 return &(ScheduleDataChunks.back()[ChunkPos++]);
21300}
21301
21302bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21303 Value *V, const InstructionsState &S) {
21304 auto *I = dyn_cast<Instruction>(V);
21305 assert(I && "bundle member must be an instruction");
21306 if (getScheduleData(I))
21307 return true;
21308 if (!ScheduleStart) {
21309 // It's the first instruction in the new region.
21310 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21311 ScheduleStart = I;
21312 ScheduleEnd = I->getNextNode();
21313 assert(ScheduleEnd && "tried to vectorize a terminator?");
21314 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21315 return true;
21316 }
21317 // Search up and down at the same time, because we don't know if the new
21318 // instruction is above or below the existing scheduling region.
21319 // Ignore debug info (and other "AssumeLike" intrinsics) so that it is not
21320 // counted against the budget. Otherwise debug info could affect codegen.
21321 BasicBlock::reverse_iterator UpIter =
21322 ++ScheduleStart->getIterator().getReverse();
21323 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21324 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21325 BasicBlock::iterator LowerEnd = BB->end();
21326 auto IsAssumeLikeIntr = [](const Instruction &I) {
21327 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21328 return II->isAssumeLikeIntrinsic();
21329 return false;
21330 };
21331 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21332 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21333 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21334 &*DownIter != I) {
21335 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21336 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21337 return false;
21338 }
21339
21340 ++UpIter;
21341 ++DownIter;
21342
21343 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21344 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21345 }
21346 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21347 assert(I->getParent() == ScheduleStart->getParent() &&
21348 "Instruction is in wrong basic block.");
21349 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21350 ScheduleStart = I;
21351 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21352 << "\n");
21353 return true;
21354 }
21355 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21356 "Expected to reach top of the basic block or instruction down the "
21357 "lower end.");
21358 assert(I->getParent() == ScheduleEnd->getParent() &&
21359 "Instruction is in wrong basic block.");
21360 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21361 nullptr);
21362 ScheduleEnd = I->getNextNode();
21363 assert(ScheduleEnd && "tried to vectorize a terminator?");
21364 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21365 return true;
21366}
21367
21368void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21369 Instruction *ToI,
21370 ScheduleData *PrevLoadStore,
21371 ScheduleData *NextLoadStore) {
21372 ScheduleData *CurrentLoadStore = PrevLoadStore;
21373 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21374 // No need to allocate data for non-schedulable instructions.
21375 if (isa<PHINode>(I))
21376 continue;
21377 ScheduleData *SD = ScheduleDataMap.lookup(I);
21378 if (!SD) {
21379 SD = allocateScheduleDataChunks();
21380 ScheduleDataMap[I] = SD;
21381 }
21382 assert(!isInSchedulingRegion(*SD) &&
21383 "new ScheduleData already in scheduling region");
21384 SD->init(SchedulingRegionID, I);
21385
21386 if (I->mayReadOrWriteMemory() &&
21387 (!isa<IntrinsicInst>(I) ||
21388 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21389 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21390 Intrinsic::pseudoprobe))) {
21391 // Update the linked list of memory accessing instructions.
21392 if (CurrentLoadStore) {
21393 CurrentLoadStore->setNextLoadStore(SD);
21394 } else {
21395 FirstLoadStoreInRegion = SD;
21396 }
21397 CurrentLoadStore = SD;
21398 }
21399
21400 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21401 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21402 RegionHasStackSave = true;
21403 }
21404 if (NextLoadStore) {
21405 if (CurrentLoadStore)
21406 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21407 } else {
21408 LastLoadStoreInRegion = CurrentLoadStore;
21409 }
21410}
21411
21412void BoUpSLP::BlockScheduling::calculateDependencies(
21413 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21414 ArrayRef<ScheduleData *> ControlDeps) {
21415 SmallVector<ScheduleEntity *> WorkList;
21416 auto ProcessNode = [&](ScheduleEntity *SE) {
21417 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21418 if (CD->hasValidDependencies())
21419 return;
21420 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21421 CD->initDependencies();
21422 CD->resetUnscheduledDeps();
21423 const EdgeInfo &EI = CD->getEdgeInfo();
21424 if (EI.UserTE) {
21425 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21426 const auto *It = find(Op, CD->getInst());
21427 assert(It != Op.end() && "Lane not set");
21428 SmallPtrSet<Instruction *, 4> Visited;
21429 do {
21430 int Lane = std::distance(Op.begin(), It);
21431 assert(Lane >= 0 && "Lane not set");
21432 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21433 !EI.UserTE->ReorderIndices.empty())
21434 Lane = EI.UserTE->ReorderIndices[Lane];
21435 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21436 "Couldn't find extract lane");
21437 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21438 if (EI.UserTE->isCopyableElement(In)) {
21439 // We may not have related copyable scheduling data if the
21440 // instruction is non-schedulable.
21441 if (ScheduleCopyableData *UseSD =
21442 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21443 CD->incDependencies();
21444 if (!UseSD->isScheduled())
21445 CD->incrementUnscheduledDeps(1);
21446 if (!UseSD->hasValidDependencies() ||
21447 (InsertInReadyList && UseSD->isReady()))
21448 WorkList.push_back(UseSD);
21449 }
21450 } else if (Visited.insert(In).second) {
21451 if (ScheduleData *UseSD = getScheduleData(In)) {
21452 CD->incDependencies();
21453 if (!UseSD->isScheduled())
21454 CD->incrementUnscheduledDeps(1);
21455 if (!UseSD->hasValidDependencies() ||
21456 (InsertInReadyList && UseSD->isReady()))
21457 WorkList.push_back(UseSD);
21458 }
21459 }
21460 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21461 } while (It != Op.end());
21462 if (CD->isReady() && CD->getDependencies() == 0 &&
21463 (EI.UserTE->hasState() &&
21464 (EI.UserTE->getMainOp()->getParent() !=
21465 CD->getInst()->getParent() ||
21466 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21467 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21468 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21469 auto *IU = dyn_cast<Instruction>(U);
21470 if (!IU)
21471 return true;
21472 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21473 })))))) {
21474 // If there are no uses in the block, mark the node as having a
21475 // pseudo-use, which cannot be scheduled.
21476 // This prevents incorrect def-use tracking between an external user
21477 // and the actual instruction.
21478 CD->incDependencies();
21479 CD->incrementUnscheduledDeps(1);
21480 }
21481 }
21482 return;
21483 }
21484 auto *BundleMember = cast<ScheduleData>(SE);
21485 if (BundleMember->hasValidDependencies())
21486 return;
21487 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21488 BundleMember->initDependencies();
21489 BundleMember->resetUnscheduledDeps();
21490 // Handle def-use chain dependencies.
21491 SmallDenseMap<Value *, unsigned> UserToNumOps;
21492 for (User *U : BundleMember->getInst()->users()) {
21493 if (isa<PHINode>(U))
21494 continue;
21495 if (ScheduleData *UseSD = getScheduleData(U)) {
21496 // The operand is a copyable element - skip.
21497 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21498 ++NumOps;
21499 if (areAllOperandsReplacedByCopyableData(
21500 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21501 continue;
21502 BundleMember->incDependencies();
21503 if (!UseSD->isScheduled())
21504 BundleMember->incrementUnscheduledDeps(1);
21505 if (!UseSD->hasValidDependencies() ||
21506 (InsertInReadyList && UseSD->isReady()))
21507 WorkList.push_back(UseSD);
21508 }
21509 }
21510 for (ScheduleCopyableData *UseSD :
21511 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21512 BundleMember->incDependencies();
21513 if (!UseSD->isScheduled())
21514 BundleMember->incrementUnscheduledDeps(1);
21515 if (!UseSD->hasValidDependencies() ||
21516 (InsertInReadyList && UseSD->isReady()))
21517 WorkList.push_back(UseSD);
21518 }
21519
21520 SmallPtrSet<const Instruction *, 4> Visited;
21521 auto MakeControlDependent = [&](Instruction *I) {
21522 // Do not mark control dependent twice.
21523 if (!Visited.insert(I).second)
21524 return;
21525 auto *DepDest = getScheduleData(I);
21526 assert(DepDest && "must be in schedule window");
21527 DepDest->addControlDependency(BundleMember);
21528 BundleMember->incDependencies();
21529 if (!DepDest->isScheduled())
21530 BundleMember->incrementUnscheduledDeps(1);
21531 if (!DepDest->hasValidDependencies() ||
21532 (InsertInReadyList && DepDest->isReady()))
21533 WorkList.push_back(DepDest);
21534 };
21535
21536 // Any instruction which isn't safe to speculate at the beginning of the
21537 // block is control dependent on any early exit or non-willreturn call
21538 // which precedes it.
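// For instance, if this bundle member is a call that might not return, a
// store that follows it in the block is not safe to speculate and must not be
// reordered above the call; the loop below records that as a control
// dependency.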
21539 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21540 for (Instruction *I = BundleMember->getInst()->getNextNode();
21541 I != ScheduleEnd; I = I->getNextNode()) {
21542 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21543 continue;
21544
21545 // Add the dependency
21546 MakeControlDependent(I);
21547
21549 // Everything past here must be control dependent on I.
21550 break;
21551 }
21552 }
21553
21554 if (RegionHasStackSave) {
21555 // If we have an inalloca alloca instruction, it needs to be scheduled
21556 // after any preceding stacksave. We also need to prevent any alloca
21557 // from reordering above a preceding stackrestore.
21558 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21559 match(BundleMember->getInst(),
21560 m_Intrinsic<Intrinsic::stackrestore>())) {
21561 for (Instruction *I = BundleMember->getInst()->getNextNode();
21562 I != ScheduleEnd; I = I->getNextNode()) {
21563 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21564 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21565 // Any allocas past here must be control dependent on I, and I
21566 // must be memory dependent on BundleMember->Inst.
21567 break;
21568
21569 if (!isa<AllocaInst>(I))
21570 continue;
21571
21572 // Add the dependency
21573 MakeControlDependent(I);
21574 }
21575 }
21576
21577 // In addition to the cases handled just above, we need to prevent
21578 // allocas and loads/stores from moving below a stacksave or a
21579 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21580 // thought to be conservative. Moving loads/stores below a stackrestore
21581 // can lead to incorrect code.
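// For instance, a load placed before a stackrestore must stay above it, since
// the stackrestore may release the alloca the load reads from; the loop below
// makes any later stacksave/stackrestore control dependent on this
// instruction to preserve that order.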
21582 if (isa<AllocaInst>(BundleMember->getInst()) ||
21583 BundleMember->getInst()->mayReadOrWriteMemory()) {
21584 for (Instruction *I = BundleMember->getInst()->getNextNode();
21585 I != ScheduleEnd; I = I->getNextNode()) {
21586 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21587 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21588 continue;
21589
21590 // Add the dependency
21591 MakeControlDependent(I);
21592 break;
21593 }
21594 }
21595 }
21596
21597 // Handle the memory dependencies (if any).
21598 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21599 if (!NextLoadStore)
21600 return;
21601 Instruction *SrcInst = BundleMember->getInst();
21602 assert(SrcInst->mayReadOrWriteMemory() &&
21603 "NextLoadStore list for non-memory-affecting bundle?");
21604 MemoryLocation SrcLoc = getLocation(SrcInst);
21605 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21606 unsigned NumAliased = 0;
21607 unsigned DistToSrc = 1;
21608 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21609
21610 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21611 DepDest = DepDest->getNextLoadStore()) {
21612 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21613
21614 // We have two limits to reduce the complexity:
21615 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21616 // SLP->isAliased (which is the expensive part in this loop).
21617 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21618 // the whole loop (even if the loop is fast, it's quadratic).
21619 // It's important for the loop break condition (see below) to
21620 // check this limit even between two read-only instructions.
21621 if (DistToSrc >= MaxMemDepDistance ||
21622 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21623 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21624 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21625
21626 // We increment the counter only if the locations are aliased
21627 // (instead of counting all alias checks). This gives a better
21628 // balance between reduced runtime and accurate dependencies.
21629 NumAliased++;
21630
21631 DepDest->addMemoryDependency(BundleMember);
21632 BundleMember->incDependencies();
21633 if (!DepDest->isScheduled())
21634 BundleMember->incrementUnscheduledDeps(1);
21635 if (!DepDest->hasValidDependencies() ||
21636 (InsertInReadyList && DepDest->isReady()))
21637 WorkList.push_back(DepDest);
21638 }
21639
21640 // Example, explaining the loop break condition: Let's assume our
21641 // starting instruction is i0 and MaxMemDepDistance = 3.
21642 //
21643 // +--------v--v--v
21644 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21645 // +--------^--^--^
21646 //
21647 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21648 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21649 // Previously we already added dependencies from i3 to i6,i7,i8
21650 // (because of MaxMemDepDistance). As we added a dependency from
21651 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21652 // and we can abort this loop at i6.
21653 if (DistToSrc >= 2 * MaxMemDepDistance)
21654 break;
21655 DistToSrc++;
21656 }
21657 };
21658
21659 assert((Bundle || !ControlDeps.empty()) &&
21660 "expected at least one instruction to schedule");
21661 if (Bundle)
21662 WorkList.push_back(Bundle.getBundle().front());
21663 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21664 SmallPtrSet<ScheduleBundle *, 16> Visited;
21665 while (!WorkList.empty()) {
21666 ScheduleEntity *SD = WorkList.pop_back_val();
21667 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21668 ArrayRef<ScheduleBundle *> Bundles;
21669 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21670 CopyableBundle.push_back(&CD->getBundle());
21671 Bundles = CopyableBundle;
21672 } else {
21673 Bundles = getScheduleBundles(SD->getInst());
21674 }
21675 if (Bundles.empty()) {
21676 if (!SD->hasValidDependencies())
21677 ProcessNode(SD);
21678 if (InsertInReadyList && SD->isReady()) {
21679 ReadyInsts.insert(SD);
21680 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21681 }
21682 continue;
21683 }
21684 for (ScheduleBundle *Bundle : Bundles) {
21685 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21686 continue;
21687 assert(isInSchedulingRegion(*Bundle) &&
21688 "ScheduleData not in scheduling region");
21689 for_each(Bundle->getBundle(), ProcessNode);
21690 }
21691 if (InsertInReadyList && SD->isReady()) {
21692 for (ScheduleBundle *Bundle : Bundles) {
21693 assert(isInSchedulingRegion(*Bundle) &&
21694 "ScheduleData not in scheduling region");
21695 if (!Bundle->isReady())
21696 continue;
21697 ReadyInsts.insert(Bundle);
21698 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21699 << "\n");
21700 }
21701 }
21702 }
21703}
21704
21705void BoUpSLP::BlockScheduling::resetSchedule() {
21706 assert(ScheduleStart &&
21707 "tried to reset schedule on block which has not been scheduled");
21708 for_each(ScheduleDataMap, [&](auto &P) {
21709 if (BB != P.first->getParent())
21710 return;
21711 ScheduleData *SD = P.second;
21712 if (isInSchedulingRegion(*SD)) {
21713 SD->setScheduled(/*Scheduled=*/false);
21714 SD->resetUnscheduledDeps();
21715 }
21716 });
21717 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21718 for_each(P.second, [&](ScheduleCopyableData *SD) {
21719 if (isInSchedulingRegion(*SD)) {
21720 SD->setScheduled(/*Scheduled=*/false);
21721 SD->resetUnscheduledDeps();
21722 }
21723 });
21724 });
21725 for_each(ScheduledBundles, [&](auto &P) {
21726 for_each(P.second, [&](ScheduleBundle *Bundle) {
21727 if (isInSchedulingRegion(*Bundle))
21728 Bundle->setScheduled(/*Scheduled=*/false);
21729 });
21730 });
21731 // Reset schedule data for copyable elements.
21732 for (auto &P : ScheduleCopyableDataMap) {
21733 if (isInSchedulingRegion(*P.second)) {
21734 P.second->setScheduled(/*Scheduled=*/false);
21735 P.second->resetUnscheduledDeps();
21736 }
21737 }
21738 ReadyInsts.clear();
21739}
21740
21741void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21742 if (!BS->ScheduleStart)
21743 return;
21744
21745 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21746
21747 // A key point - if we got here, pre-scheduling was able to find a valid
21748 // scheduling of the sub-graph of the scheduling window which consists
21749 // of all vector bundles and their transitive users. As such, we do not
21750 // need to reschedule anything *outside of* that subgraph.
21751
21752 BS->resetSchedule();
21753
21754 // For the real scheduling we use a more sophisticated ready-list: it is
21755 // sorted by the original instruction location. This lets the final schedule
21756 // be as close as possible to the original instruction order.
21757 // WARNING: If changing this order causes a correctness issue, that means
21758 // there is some missing dependence edge in the schedule data graph.
21759 struct ScheduleDataCompare {
21760 bool operator()(const ScheduleEntity *SD1,
21761 const ScheduleEntity *SD2) const {
21762 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21763 }
21764 };
21765 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21766
21767 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21768 // and fill the ready-list with initial instructions.
21769 int Idx = 0;
21770 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21771 I = I->getNextNode()) {
21772 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21773 if (!Bundles.empty()) {
21774 for (ScheduleBundle *Bundle : Bundles) {
21775 Bundle->setSchedulingPriority(Idx++);
21776 if (!Bundle->hasValidDependencies())
21777 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21778 }
21779 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21780 for (ScheduleCopyableData *SD : reverse(SDs)) {
21781 ScheduleBundle &Bundle = SD->getBundle();
21782 Bundle.setSchedulingPriority(Idx++);
21783 if (!Bundle.hasValidDependencies())
21784 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21785 }
21786 continue;
21787 }
21788 SmallVector<ScheduleCopyableData *> CopyableData =
21789 BS->getScheduleCopyableDataUsers(I);
21790 if (ScheduleData *SD = BS->getScheduleData(I)) {
21791 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21792 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21793 SDTEs.front()->doesNotNeedToSchedule() ||
21795 "scheduler and vectorizer bundle mismatch");
21796 SD->setSchedulingPriority(Idx++);
21797 if (!SD->hasValidDependencies() &&
21798 (!CopyableData.empty() ||
21799 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21800 assert(TE->isGather() && "expected gather node");
21801 return TE->hasState() && TE->hasCopyableElements() &&
21802 TE->isCopyableElement(I);
21803 }))) {
21804 // Need to calculate deps for these nodes to correctly handle copyable
21805 // dependencies, even if they were cancelled.
21806 // If a copyable bundle was cancelled, the deps were cleared and need
21807 // to be recalculated.
21808 ScheduleBundle Bundle;
21809 Bundle.add(SD);
21810 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21811 }
21812 }
21813 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21814 ScheduleBundle &Bundle = SD->getBundle();
21815 Bundle.setSchedulingPriority(Idx++);
21816 if (!Bundle.hasValidDependencies())
21817 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21818 }
21819 }
21820 BS->initialFillReadyList(ReadyInsts);
21821
21822 Instruction *LastScheduledInst = BS->ScheduleEnd;
21823
21824 // Do the "real" scheduling.
21825 SmallPtrSet<Instruction *, 16> Scheduled;
21826 while (!ReadyInsts.empty()) {
21827 auto *Picked = *ReadyInsts.begin();
21828 ReadyInsts.erase(ReadyInsts.begin());
21829
21830 // Move the scheduled instruction(s) to their dedicated places, if not
21831 // there yet.
21832 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21833 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21834 Instruction *PickedInst = BundleMember->getInst();
21835 // If a copyable must be scheduled as part of something else, skip it.
21836 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21837 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21838 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21839 continue;
21840 if (PickedInst->getNextNode() != LastScheduledInst)
21841 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21842 LastScheduledInst = PickedInst;
21843 }
21844 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21845 LastScheduledInst);
21846 } else {
21847 auto *SD = cast<ScheduleData>(Picked);
21848 Instruction *PickedInst = SD->getInst();
21849 if (PickedInst->getNextNode() != LastScheduledInst)
21850 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21851 LastScheduledInst = PickedInst;
21852 }
21853 auto Invalid = InstructionsState::invalid();
21854 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21855 }
21856
21857 // Check that we didn't break any of our invariants.
21858#ifdef EXPENSIVE_CHECKS
21859 BS->verify();
21860#endif
21861
21862#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21863 // Check that all schedulable entities got scheduled
21864 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21865 I = I->getNextNode()) {
21866 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21867 assert(all_of(Bundles,
21868 [](const ScheduleBundle *Bundle) {
21869 return Bundle->isScheduled();
21870 }) &&
21871 "must be scheduled at this point");
21872 }
21873#endif
21874
21875 // Avoid duplicate scheduling of the block.
21876 BS->ScheduleStart = nullptr;
21877}
21878
21879 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21880 // If V is a store, just return the width of the stored value (or value
21881 // truncated just before storing) without traversing the expression tree.
21882 // This is the common case.
21883 if (auto *Store = dyn_cast<StoreInst>(V))
21884 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21885
21886 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21887 return getVectorElementSize(IEI->getOperand(1));
21888
21889 auto E = InstrElementSize.find(V);
21890 if (E != InstrElementSize.end())
21891 return E->second;
21892
21893 // If V is not a store, we can traverse the expression tree to find loads
21894 // that feed it. The type of the loaded value may indicate a more suitable
21895 // width than V's type. We want to base the vector element size on the width
21896 // of memory operations where possible.
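// For instance (illustrative), if a 32-bit add is ultimately fed by a 16-bit
// load, the element size reported for the add is 16 rather than 32, because
// the loaded width bounds the precision that is actually needed.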
21897 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21898 SmallPtrSet<Instruction *, 16> Visited;
21899 if (auto *I = dyn_cast<Instruction>(V)) {
21900 Worklist.emplace_back(I, I->getParent(), 0);
21901 Visited.insert(I);
21902 }
21903
21904 // Traverse the expression tree in bottom-up order looking for loads. If we
21905 // encounter an instruction we don't yet handle, we give up.
21906 auto Width = 0u;
21907 Value *FirstNonBool = nullptr;
21908 while (!Worklist.empty()) {
21909 auto [I, Parent, Level] = Worklist.pop_back_val();
21910
21911 // We should only be looking at scalar instructions here. If the current
21912 // instruction has a vector type, skip.
21913 auto *Ty = I->getType();
21914 if (isa<VectorType>(Ty))
21915 continue;
21916 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21917 FirstNonBool = I;
21918 if (Level > RecursionMaxDepth)
21919 continue;
21920
21921 // If the current instruction is a load, update MaxWidth to reflect the
21922 // width of the loaded value.
21924 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21925
21926 // Otherwise, we need to visit the operands of the instruction. We only
21927 // handle the interesting cases from buildTree here. If an operand is an
21928 // instruction we haven't yet visited and from the same basic block as the
21929 // user or the use is a PHI node, we add it to the worklist.
21932 for (Use &U : I->operands()) {
21933 if (auto *J = dyn_cast<Instruction>(U.get()))
21934 if (Visited.insert(J).second &&
21935 (isa<PHINode>(I) || J->getParent() == Parent)) {
21936 Worklist.emplace_back(J, J->getParent(), Level + 1);
21937 continue;
21938 }
21939 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21940 FirstNonBool = U.get();
21941 }
21942 } else {
21943 break;
21944 }
21945 }
21946
21947 // If we didn't encounter a memory access in the expression tree, or if we
21948 // gave up for some reason, just return the width of V. Otherwise, return the
21949 // maximum width we found.
21950 if (!Width) {
21951 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21952 V = FirstNonBool;
21953 Width = DL->getTypeSizeInBits(V->getType());
21954 }
21955
21956 for (Instruction *I : Visited)
21957 InstrElementSize[I] = Width;
21958
21959 return Width;
21960}
21961
21962bool BoUpSLP::collectValuesToDemote(
21963 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21964 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21965 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21966 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21967 // We can always demote constants.
21968 if (all_of(E.Scalars, IsaPred<Constant>))
21969 return true;
21970
21971 unsigned OrigBitWidth =
21972 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21973 if (OrigBitWidth == BitWidth) {
21974 MaxDepthLevel = 1;
21975 return true;
21976 }
21977
21978 // Check if the node was analyzed already and must keep its original bitwidth.
21979 if (NodesToKeepBWs.contains(E.Idx))
21980 return false;
21981
21982 // If the value is not a vectorized instruction in the expression and not used
21983 // by the insertelement instruction and not used in multiple vector nodes, it
21984 // cannot be demoted.
21985 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21986 if (isa<PoisonValue>(R))
21987 return false;
21988 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21989 });
21990 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21991 if (isa<PoisonValue>(V))
21992 return true;
21993 if (getTreeEntries(V).size() > 1)
21994 return false;
21995 // For the last shuffle of sext/zext with many uses, we need to check the
21996 // extra bit for unsigned values; otherwise we may have incorrect casting
21997 // for reused scalars.
21998 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21999 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22000 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22001 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22002 return true;
22003 }
22004 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22005 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22006 if (IsSignedNode)
22007 ++BitWidth1;
22008 if (auto *I = dyn_cast<Instruction>(V)) {
22009 APInt Mask = DB->getDemandedBits(I);
22010 unsigned BitWidth2 =
22011 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22012 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22013 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22014 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22015 break;
22016 BitWidth2 *= 2;
22017 }
22018 BitWidth1 = std::min(BitWidth1, BitWidth2);
22019 }
22020 BitWidth = std::max(BitWidth, BitWidth1);
22021 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22022 };
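// For instance, an i32 value with 25 known sign bits needs only 32 - 25 = 7
// bits (8 if the node is signed), so demoting it to i16 or i8 satisfies the
// OrigBitWidth >= BitWidth * 2 requirement above.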
22023 auto FinalAnalysis = [&, TTI = TTI]() {
22024 if (!IsProfitableToDemote)
22025 return false;
22026 bool Res = all_of(
22027 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22028 // Demote gathers.
22029 if (Res && E.isGather()) {
22030 if (E.hasState()) {
22031 if (const TreeEntry *SameTE =
22032 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22033 SameTE)
22034 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22035 ToDemote, Visited, NodesToKeepBWs,
22036 MaxDepthLevel, IsProfitableToDemote,
22037 IsTruncRoot)) {
22038 ToDemote.push_back(E.Idx);
22039 return true;
22040 }
22041 }
22042 // Check the bases of possible extractelement instructions and the final
22043 // vector length.
22044 SmallPtrSet<Value *, 4> UniqueBases;
22045 for (Value *V : E.Scalars) {
22046 auto *EE = dyn_cast<ExtractElementInst>(V);
22047 if (!EE)
22048 continue;
22049 UniqueBases.insert(EE->getVectorOperand());
22050 }
22051 const unsigned VF = E.Scalars.size();
22052 Type *OrigScalarTy = E.Scalars.front()->getType();
22053 if (UniqueBases.size() <= 2 ||
22054 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22056 *TTI,
22058 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22059 VF))) {
22060 ToDemote.push_back(E.Idx);
22061 return true;
22062 }
22063 }
22064 return Res;
22065 };
22066 if (E.isGather() || !Visited.insert(&E).second ||
22067 any_of(E.Scalars, [&](Value *V) {
22068 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22069 return isa<InsertElementInst>(U) && !isVectorized(U);
22070 });
22071 }))
22072 return FinalAnalysis();
22073
22074 if (any_of(E.Scalars, [&](Value *V) {
22075 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22076 return isVectorized(U) ||
22077 (E.Idx == 0 && UserIgnoreList &&
22078 UserIgnoreList->contains(U)) ||
22079 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22080 !U->getType()->isScalableTy() &&
22081 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22082 }) && !IsPotentiallyTruncated(V, BitWidth);
22083 }))
22084 return false;
22085
22086 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22087 bool &NeedToExit) {
22088 NeedToExit = false;
22089 unsigned InitLevel = MaxDepthLevel;
22090 for (const TreeEntry *Op : Operands) {
22091 unsigned Level = InitLevel;
22092 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22093 ToDemote, Visited, NodesToKeepBWs, Level,
22094 IsProfitableToDemote, IsTruncRoot)) {
22095 if (!IsProfitableToDemote)
22096 return false;
22097 NeedToExit = true;
22098 if (!FinalAnalysis())
22099 return false;
22100 continue;
22101 }
22102 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22103 }
22104 return true;
22105 };
22106 auto AttemptCheckBitwidth =
22107 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22108 // Try all bitwidth < OrigBitWidth.
22109 NeedToExit = false;
22110 unsigned BestFailBitwidth = 0;
22111 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22112 if (Checker(BitWidth, OrigBitWidth))
22113 return true;
22114 if (BestFailBitwidth == 0 && FinalAnalysis())
22115 BestFailBitwidth = BitWidth;
22116 }
22117 if (BitWidth >= OrigBitWidth) {
22118 if (BestFailBitwidth == 0) {
22119 BitWidth = OrigBitWidth;
22120 return false;
22121 }
22122 MaxDepthLevel = 1;
22123 BitWidth = BestFailBitwidth;
22124 NeedToExit = true;
22125 return true;
22126 }
22127 return false;
22128 };
22129 auto TryProcessInstruction =
22130 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22131 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22132 if (Operands.empty()) {
22133 if (!IsTruncRoot)
22134 MaxDepthLevel = 1;
22135 for (Value *V : E.Scalars)
22136 (void)IsPotentiallyTruncated(V, BitWidth);
22137 } else {
22138 // Several vectorized uses? Check if we can truncate them; otherwise,
22139 // exit.
22140 if (any_of(E.Scalars, [&](Value *V) {
22141 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22142 }))
22143 return false;
22144 bool NeedToExit = false;
22145 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22146 return false;
22147 if (NeedToExit)
22148 return true;
22149 if (!ProcessOperands(Operands, NeedToExit))
22150 return false;
22151 if (NeedToExit)
22152 return true;
22153 }
22154
22155 ++MaxDepthLevel;
22156 // Record the entry that we can demote.
22157 ToDemote.push_back(E.Idx);
22158 return IsProfitableToDemote;
22159 };
22160
22161 if (E.State == TreeEntry::SplitVectorize)
22162 return TryProcessInstruction(
22163 BitWidth,
22164 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22165 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22166
22167 if (E.isAltShuffle()) {
22168 // Combining these opcodes may lead to incorrect analysis, skip for now.
22169 auto IsDangerousOpcode = [](unsigned Opcode) {
22170 switch (Opcode) {
22171 case Instruction::Shl:
22172 case Instruction::AShr:
22173 case Instruction::LShr:
22174 case Instruction::UDiv:
22175 case Instruction::SDiv:
22176 case Instruction::URem:
22177 case Instruction::SRem:
22178 return true;
22179 default:
22180 break;
22181 }
22182 return false;
22183 };
22184 if (IsDangerousOpcode(E.getAltOpcode()))
22185 return FinalAnalysis();
22186 }
22187
22188 switch (E.getOpcode()) {
22189
22190 // We can always demote truncations and extensions. Since truncations can
22191 // seed additional demotion, we save the truncated value.
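// For instance, if the root of the expression is a trunc from i32 to i16, the
// arithmetic feeding that trunc can be performed directly on i16 vectors, so
// the truncated value is worth remembering as a seed for further demotion.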
22192 case Instruction::Trunc:
22193 if (IsProfitableToDemoteRoot)
22194 IsProfitableToDemote = true;
22195 return TryProcessInstruction(BitWidth);
22196 case Instruction::ZExt:
22197 case Instruction::SExt:
22198 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22199 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22200 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22201 return false;
22202 IsProfitableToDemote = true;
22203 return TryProcessInstruction(BitWidth);
22204
22205 // We can demote certain binary operations if we can demote both of their
22206 // operands.
22207 case Instruction::Add:
22208 case Instruction::Sub:
22209 case Instruction::Mul:
22210 case Instruction::And:
22211 case Instruction::Or:
22212 case Instruction::Xor: {
22213 return TryProcessInstruction(
22214 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22215 }
22216 case Instruction::Freeze:
22217 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22218 case Instruction::Shl: {
22219 // If we are truncating the result of this SHL, and if it's a shift of an
22220 // in-range amount, we can always perform a SHL in a smaller type.
22221 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22222 return all_of(E.Scalars, [&](Value *V) {
22223 if (isa<PoisonValue>(V))
22224 return true;
22225 if (E.isCopyableElement(V))
22226 return true;
22227 auto *I = cast<Instruction>(V);
22228 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22229 return AmtKnownBits.getMaxValue().ult(BitWidth);
22230 });
22231 };
22232 return TryProcessInstruction(
22233 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22234 }
22235 case Instruction::LShr: {
22236 // If this is a truncate of a logical shr, we can truncate it to a smaller
22237 // lshr iff we know that the bits we would otherwise be shifting in are
22238 // already zeros.
22239 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22240 return all_of(E.Scalars, [&](Value *V) {
22241 if (isa<PoisonValue>(V))
22242 return true;
22243 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22244 if (E.isCopyableElement(V))
22245 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22246 auto *I = cast<Instruction>(V);
22247 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22248 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22249 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22250 SimplifyQuery(*DL));
22251 });
22252 };
22253 return TryProcessInstruction(
22254 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22255 LShrChecker);
22256 }
22257 case Instruction::AShr: {
22258 // If this is a truncate of an arithmetic shr, we can truncate it to a
22259 // smaller ashr iff we know that all the bits from the sign bit of the
22260 // original type and the sign bit of the truncate type are similar.
22261 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22262 return all_of(E.Scalars, [&](Value *V) {
22263 if (isa<PoisonValue>(V))
22264 return true;
22265 auto *I = cast<Instruction>(V);
22266 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22267 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22268 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22269 ShiftedBits <
22270 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22271 });
22272 };
22273 return TryProcessInstruction(
22274 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22275 AShrChecker);
22276 }
22277 case Instruction::UDiv:
22278 case Instruction::URem: {
22279 // UDiv and URem can be truncated if all the truncated bits are zero.
22280 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22281 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22282 return all_of(E.Scalars, [&](Value *V) {
22283 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22284 if (E.hasCopyableElements() && E.isCopyableElement(V))
22285 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22286 auto *I = cast<Instruction>(V);
22287 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22288 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22289 });
22290 };
22291 return TryProcessInstruction(
22292 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22293 }
22294
22295 // We can demote selects if we can demote their true and false values.
22296 case Instruction::Select: {
22297 return TryProcessInstruction(
22298 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22299 }
22300
22301 // We can demote phis if we can demote all their incoming operands.
22302 case Instruction::PHI: {
22303 const unsigned NumOps = E.getNumOperands();
22304 SmallVector<const TreeEntry *> Ops(NumOps);
22305 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22306 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22307
22308 return TryProcessInstruction(BitWidth, Ops);
22309 }
22310
22311 case Instruction::Call: {
22312 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22313 if (!IC)
22314 break;
22315 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22316 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22317 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22318 break;
22319 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22320 function_ref<bool(unsigned, unsigned)> CallChecker;
22321 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22322 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22323 return all_of(E.Scalars, [&](Value *V) {
22324 auto *I = cast<Instruction>(V);
22325 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22326 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22327 return MaskedValueIsZero(I->getOperand(0), Mask,
22328 SimplifyQuery(*DL)) &&
22329 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22330 }
22331 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22332 "Expected min/max intrinsics only.");
22333 unsigned SignBits = OrigBitWidth - BitWidth;
22334 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22335 unsigned Op0SignBits =
22336 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22337 unsigned Op1SignBits =
22338 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22339 return SignBits <= Op0SignBits &&
22340 ((SignBits != Op0SignBits &&
22341 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22342 MaskedValueIsZero(I->getOperand(0), Mask,
22343 SimplifyQuery(*DL))) &&
22344 SignBits <= Op1SignBits &&
22345 ((SignBits != Op1SignBits &&
22346 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22347 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22348 });
22349 };
22350 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22351 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22352 return all_of(E.Scalars, [&](Value *V) {
22353 auto *I = cast<Instruction>(V);
22354 unsigned SignBits = OrigBitWidth - BitWidth;
22355 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22356 unsigned Op0SignBits =
22357 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22358 return SignBits <= Op0SignBits &&
22359 ((SignBits != Op0SignBits &&
22360 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22361 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22362 });
22363 };
22364 if (ID != Intrinsic::abs) {
22365 Operands.push_back(getOperandEntry(&E, 1));
22366 CallChecker = CompChecker;
22367 } else {
22368 CallChecker = AbsChecker;
22369 }
22370 InstructionCost BestCost =
22371 std::numeric_limits<InstructionCost::CostType>::max();
22372 unsigned BestBitWidth = BitWidth;
22373 unsigned VF = E.Scalars.size();
22374 // Choose the best bitwidth based on cost estimations.
22375 auto Checker = [&](unsigned BitWidth, unsigned) {
22376 unsigned MinBW = PowerOf2Ceil(BitWidth);
22377 SmallVector<Type *> ArgTys =
22378 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22379 auto VecCallCosts = getVectorCallCosts(
22380 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22381 TTI, TLI, ArgTys);
22382 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22383 if (Cost < BestCost) {
22384 BestCost = Cost;
22385 BestBitWidth = BitWidth;
22386 }
22387 return false;
22388 };
22389 [[maybe_unused]] bool NeedToExit;
22390 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22391 BitWidth = BestBitWidth;
22392 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22393 }
22394
22395 // Otherwise, conservatively give up.
22396 default:
22397 break;
22398 }
22399 MaxDepthLevel = 1;
22400 return FinalAnalysis();
22401}
22402
22403static RecurKind getRdxKind(Value *V);
22404
22405void BoUpSLP::computeMinimumValueSizes() {
22406 // We only attempt to truncate integer expressions.
22407 bool IsStoreOrInsertElt =
22408 VectorizableTree.front()->hasState() &&
22409 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22410 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22411 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22412 ExtraBitWidthNodes.size() <= 1 &&
22413 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22414 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22415 return;
22416
22417 unsigned NodeIdx = 0;
22418 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22419 NodeIdx = 1;
22420
22421 // Ensure the roots of the vectorizable tree don't form a cycle.
22422 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22423 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22424 "Unexpected tree is graph.");
22425
22426 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
22427 // resize to the final type.
22428 bool IsTruncRoot = false;
22429 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22430 SmallVector<unsigned> RootDemotes;
22431 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22432 if (NodeIdx != 0 &&
22433 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22434 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22435 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22436 IsTruncRoot = true;
22437 RootDemotes.push_back(NodeIdx);
22438 IsProfitableToDemoteRoot = true;
22439 ++NodeIdx;
22440 }
22441
22442 // The reduction was already analyzed and found not profitable - exit.
22443 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22444 return;
22445
22446 SmallVector<unsigned> ToDemote;
22447 auto ComputeMaxBitWidth =
22448 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22449 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22450 ToDemote.clear();
22451 // Check if the root is trunc and the next node is gather/buildvector, then
22452 // keep trunc in scalars, which is free in most cases.
22453 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22454 !NodesToKeepBWs.contains(E.Idx) &&
22455 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22456 all_of(E.Scalars, [&](Value *V) {
22457 return V->hasOneUse() || isa<Constant>(V) ||
22458 (!V->hasNUsesOrMore(UsesLimit) &&
22459 none_of(V->users(), [&](User *U) {
22460 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22461 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22462 if (TEs.empty() || is_contained(TEs, UserTE))
22463 return false;
22464 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22465 SelectInst>(U) ||
22466 isa<SIToFPInst, UIToFPInst>(U) ||
22467 (UserTE->hasState() &&
22468 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22469 SelectInst>(UserTE->getMainOp()) ||
22470 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22471 return true;
22472 unsigned UserTESz = DL->getTypeSizeInBits(
22473 UserTE->Scalars.front()->getType());
22474 if (all_of(TEs, [&](const TreeEntry *TE) {
22475 auto It = MinBWs.find(TE);
22476 return It != MinBWs.end() &&
22477 It->second.first > UserTESz;
22478 }))
22479 return true;
22480 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22481 }));
22482 })) {
22483 ToDemote.push_back(E.Idx);
22484 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22485 auto It = MinBWs.find(UserTE);
22486 if (It != MinBWs.end())
22487 return It->second.first;
22488 unsigned MaxBitWidth =
22489 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22490 MaxBitWidth = bit_ceil(MaxBitWidth);
22491 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22492 MaxBitWidth = 8;
22493 return MaxBitWidth;
22494 }
22495
22496 if (!E.hasState())
22497 return 0u;
22498
22499 unsigned VF = E.getVectorFactor();
22500 Type *ScalarTy = E.Scalars.front()->getType();
22501 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22502 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22503 if (!TreeRootIT)
22504 return 0u;
22505
22506 if (any_of(E.Scalars,
22507 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22508 return 0u;
22509
22510 unsigned NumParts = ::getNumberOfParts(
22511 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22512
22513 // The maximum bit width required to represent all the values that can be
22514 // demoted without loss of precision. It would be safe to truncate the roots
22515 // of the expression to this width.
22516 unsigned MaxBitWidth = 1u;
22517
22518 // True if the roots can be zero-extended back to their original type,
22519 // rather than sign-extended. We know that if the leading bits are not
22520 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22521 // True.
22522 // Determine if the sign bit of all the roots is known to be zero. If not,
22523 // IsKnownPositive is set to False.
22524 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22525 if (isa<PoisonValue>(R))
22526 return true;
22527 KnownBits Known = computeKnownBits(R, *DL);
22528 return Known.isNonNegative();
22529 });
22530
22531 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22532 E.UserTreeIndex.UserTE->hasState() &&
22533 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22534 MaxBitWidth =
22535 std::min(DL->getTypeSizeInBits(
22536 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22537 DL->getTypeSizeInBits(ScalarTy));
22538
22539 // We first check if all the bits of the roots are demanded. If they're not,
22540 // we can truncate the roots to this narrower type.
22541 for (Value *Root : E.Scalars) {
22542 if (isa<PoisonValue>(Root))
22543 continue;
22544 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22545 TypeSize NumTypeBits =
22546 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22547 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22548 // If we can't prove that the sign bit is zero, we must add one to the
22549 // maximum bit width to account for the unknown sign bit. This preserves
22550 // the existing sign bit so we can safely sign-extend the root back to the
22551 // original type. Otherwise, if we know the sign bit is zero, we will
22552 // zero-extend the root instead.
22553 //
22554 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22555 // one to the maximum bit width will yield a larger-than-necessary
22556 // type. In general, we need to add an extra bit only if we can't
22557 // prove that the upper bit of the original type is equal to the
22558 // upper bit of the proposed smaller type. If these two bits are
22559 // the same (either zero or one) we know that sign-extending from
22560 // the smaller type will result in the same value. Here, since we
22561 // can't yet prove this, we are just making the proposed smaller
22562 // type larger to ensure correctness.
22563 if (!IsKnownPositive)
22564 ++BitWidth1;
22565
22566 auto *I = dyn_cast<Instruction>(Root);
22567 if (!I) {
22568 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22569 continue;
22570 }
22571 APInt Mask = DB->getDemandedBits(I);
22572 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22573 MaxBitWidth =
22574 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22575 }
22576
22577 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22578 MaxBitWidth = 8;
22579
22580 // If the original type is large, but the reduced type does not improve
22581 // register usage - ignore it.
22582 if (NumParts > 1 &&
22583 NumParts ==
22584 ::getNumberOfParts(
22585 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22586 bit_ceil(MaxBitWidth)),
22587 VF)))
22588 return 0u;
22589
22590 unsigned Opcode = E.getOpcode();
22591 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22592 Opcode == Instruction::SExt ||
22593 Opcode == Instruction::ZExt || NumParts > 1;
22594 // Conservatively determine if we can actually truncate the roots of the
22595 // expression. Collect the values that can be demoted in ToDemote and
22596 // additional roots that require investigating in Roots.
22597 DenseSet<const TreeEntry *> Visited;
22598 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22599 bool NeedToDemote = IsProfitableToDemote;
22600
22601 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22602 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22603 NeedToDemote, IsTruncRoot) ||
22604 (MaxDepthLevel <= Limit &&
22605 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22606 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22607 DL->getTypeSizeInBits(TreeRootIT) /
22608 DL->getTypeSizeInBits(
22609 E.getMainOp()->getOperand(0)->getType()) >
22610 2)))))
22611 return 0u;
22612 // Round MaxBitWidth up to the next power-of-two.
22613 MaxBitWidth = bit_ceil(MaxBitWidth);
22614
22615 return MaxBitWidth;
22616 };
22617
22618 // If we can truncate the root, we must collect additional values that might
22619 // be demoted as a result. That is, those seeded by truncations we will
22620 // modify.
22621 // Add reduction ops sizes, if any.
22622 if (UserIgnoreList &&
22623 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22624 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22625 // x i1> to in)).
22626 if (all_of(*UserIgnoreList,
22627 [](Value *V) {
22628 return isa<PoisonValue>(V) ||
22629 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22630 }) &&
22631 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22632 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22633 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22634 Builder.getInt1Ty()) {
22635 ReductionBitWidth = 1;
22636 } else {
22637 for (Value *V : *UserIgnoreList) {
22638 if (isa<PoisonValue>(V))
22639 continue;
22640 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22641 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22642 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22643 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22644 ++BitWidth1;
22645 unsigned BitWidth2 = BitWidth1;
22646 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
22647 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22648 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22649 }
22650 ReductionBitWidth =
22651 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22652 }
22653 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22654 ReductionBitWidth = 8;
22655
22656 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22657 }
22658 }
22659 bool IsTopRoot = NodeIdx == 0;
22660 while (NodeIdx < VectorizableTree.size() &&
22661 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22662 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22663 RootDemotes.push_back(NodeIdx);
22664 ++NodeIdx;
22665 IsTruncRoot = true;
22666 }
22667 bool IsSignedCmp = false;
22668 if (UserIgnoreList &&
22669 all_of(*UserIgnoreList,
22671 m_SMax(m_Value(), m_Value())))))
22672 IsSignedCmp = true;
22673 while (NodeIdx < VectorizableTree.size()) {
22674 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22675 unsigned Limit = 2;
22676 if (IsTopRoot &&
22677 ReductionBitWidth ==
22678 DL->getTypeSizeInBits(
22679 VectorizableTree.front()->Scalars.front()->getType()))
22680 Limit = 3;
22681 unsigned MaxBitWidth = ComputeMaxBitWidth(
22682 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22683 IsTruncRoot, IsSignedCmp);
22684 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22685 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22686 ReductionBitWidth = bit_ceil(MaxBitWidth);
22687 else if (MaxBitWidth == 0)
22688 ReductionBitWidth = 0;
22689 }
22690
22691 for (unsigned Idx : RootDemotes) {
22692 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22693 uint32_t OrigBitWidth =
22694 DL->getTypeSizeInBits(V->getType()->getScalarType());
22695 if (OrigBitWidth > MaxBitWidth) {
22696 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22697 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22698 }
22699 return false;
22700 }))
22701 ToDemote.push_back(Idx);
22702 }
22703 RootDemotes.clear();
22704 IsTopRoot = false;
22705 IsProfitableToDemoteRoot = true;
22706
22707 if (ExtraBitWidthNodes.empty()) {
22708 NodeIdx = VectorizableTree.size();
22709 } else {
22710 unsigned NewIdx = 0;
22711 do {
22712 NewIdx = *ExtraBitWidthNodes.begin();
22713 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22714 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22715 NodeIdx = NewIdx;
22716 IsTruncRoot =
22717 NodeIdx < VectorizableTree.size() &&
22718 VectorizableTree[NodeIdx]->UserTreeIndex &&
22719 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22720 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22721 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22722 Instruction::Trunc &&
22723 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22724 IsSignedCmp =
22725 NodeIdx < VectorizableTree.size() &&
22726 VectorizableTree[NodeIdx]->UserTreeIndex &&
22727 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22728 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22729 Instruction::ICmp &&
22730 any_of(
22731 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22732 [&](Value *V) {
22733 auto *IC = dyn_cast<ICmpInst>(V);
22734 return IC && (IC->isSigned() ||
22735 !isKnownNonNegative(IC->getOperand(0),
22736 SimplifyQuery(*DL)) ||
22737 !isKnownNonNegative(IC->getOperand(1),
22738 SimplifyQuery(*DL)));
22739 });
22740 }
22741
22742 // If the maximum bit width we compute is less than the width of the roots'
22743 // type, we can proceed with the narrowing. Otherwise, do nothing.
22744 if (MaxBitWidth == 0 ||
22745 MaxBitWidth >=
22746 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22747 ->getBitWidth()) {
22748 if (UserIgnoreList)
22749 AnalyzedMinBWVals.insert_range(TreeRoot);
22750 NodesToKeepBWs.insert_range(ToDemote);
22751 continue;
22752 }
22753
22754 // Finally, map the values we can demote to the maximum bit with we
22755 // computed.
22756 for (unsigned Idx : ToDemote) {
22757 TreeEntry *TE = VectorizableTree[Idx].get();
22758 if (MinBWs.contains(TE))
22759 continue;
22760 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22761 if (isa<PoisonValue>(R))
22762 return false;
22763 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22764 });
22765 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22766 }
22767 }
22768}
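// For illustration only (not part of the pass source): consider a store-rooted
// tree whose value node is
//   %a = zext i8 %x to i32
//   %b = zext i8 %y to i32
//   %s = add i32 %a, %b
//   store i32 %s, ptr %p
// The add of two zero-extended i8 values needs at most 9 significant bits, so
// the analysis above can record bit_ceil(9) == 16 in MinBWs for the add entry.
// When that is profitable for the target, codegen then performs the vector add
// on <N x i16> and extends the result once before the store, instead of
// widening every lane to i32 up front.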
22769
22770PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22771 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22772 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22773 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22774 auto *AA = &AM.getResult<AAManager>(F);
22775 auto *LI = &AM.getResult<LoopAnalysis>(F);
22776 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22777 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22778 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22779 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22780
22781 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22782 if (!Changed)
22783 return PreservedAnalyses::all();
22784
22785 PreservedAnalyses PA;
22786 PA.preserveSet<CFGAnalyses>();
22787 return PA;
22788}
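// For illustration only (not part of the pass source): the pass can be run in
// isolation with the new pass manager, e.g.
//   opt -passes=slp-vectorizer -S input.ll
// or added programmatically to a function pipeline:
//   llvm::FunctionPassManager FPM;
//   FPM.addPass(llvm::SLPVectorizerPass());
// When it makes a change, run() reports that only CFG analyses survive, since
// vectorization rewrites instructions but never alters control flow.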
22789
22790bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22791 TargetTransformInfo *TTI_,
22792 TargetLibraryInfo *TLI_, AAResults *AA_,
22793 LoopInfo *LI_, DominatorTree *DT_,
22794 AssumptionCache *AC_, DemandedBits *DB_,
22795 OptimizationRemarkEmitter *ORE_) {
22796 if (!RunSLPVectorization)
22797 return false;
22798 SE = SE_;
22799 TTI = TTI_;
22800 TLI = TLI_;
22801 AA = AA_;
22802 LI = LI_;
22803 DT = DT_;
22804 AC = AC_;
22805 DB = DB_;
22806 DL = &F.getDataLayout();
22807
22808 Stores.clear();
22809 GEPs.clear();
22810 bool Changed = false;
22811
22812 // If the target claims to have no vector registers don't attempt
22813 // vectorization.
22814 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22815 LLVM_DEBUG(
22816 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22817 return false;
22818 }
22819
22820 // Don't vectorize when the attribute NoImplicitFloat is used.
22821 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22822 return false;
22823
22824 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22825
22826 // Use the bottom up slp vectorizer to construct chains that start with
22827 // store instructions.
22828 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22829
22830 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22831 // delete instructions.
22832
22833 // Update DFS numbers now so that we can use them for ordering.
22834 DT->updateDFSNumbers();
22835
22836 // Scan the blocks in the function in post order.
22837 for (auto *BB : post_order(&F.getEntryBlock())) {
22838 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
22839 continue;
22840
22841 // Start new block - clear the list of reduction roots.
22842 R.clearReductionData();
22843 collectSeedInstructions(BB);
22844
22845 // Vectorize trees that end at stores.
22846 if (!Stores.empty()) {
22847 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22848 << " underlying objects.\n");
22849 Changed |= vectorizeStoreChains(R);
22850 }
22851
22852 // Vectorize trees that end at reductions.
22853 Changed |= vectorizeChainsInBlock(BB, R);
22854
22855 // Vectorize the index computations of getelementptr instructions. This
22856 // is primarily intended to catch gather-like idioms ending at
22857 // non-consecutive loads.
22858 if (!GEPs.empty()) {
22859 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22860 << " underlying objects.\n");
22861 Changed |= vectorizeGEPIndices(BB, R);
22862 }
22863 }
22864
22865 if (Changed) {
22866 R.optimizeGatherSequence();
22867 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22868 }
22869 return Changed;
22870}
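// For illustration only (not part of the pass source): runImpl() walks the
// blocks in post order and seeds vectorization in three independent ways per
// block - chains of related stores (vectorizeStoreChains), horizontal
// reductions and other instruction lists rooted in the block
// (vectorizeChainsInBlock), and the index computations feeding getelementptrs
// (vectorizeGEPIndices), which mostly catches gather-like address math for
// non-consecutive loads. Any successful seed sets Changed, which triggers
// optimizeGatherSequence() to clean up the shuffles emitted for gathered
// operands.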
22871
22872std::optional<bool>
22873SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22874 unsigned Idx, unsigned MinVF,
22875 unsigned &Size) {
22876 Size = 0;
22877 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22878 << "\n");
22879 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22880 unsigned VF = Chain.size();
22881
22882 if (!has_single_bit(Sz) ||
22883 !hasFullVectorsOrPowerOf2(
22884 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22885 VF) ||
22886 VF < 2 || VF < MinVF) {
22887 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22888 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22889 // all vector lanes are used.
22890 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22891 return false;
22892 }
22893
22894 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22895 << "\n");
22896
22897 SetVector<Value *> ValOps;
22898 for (Value *V : Chain)
22899 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22900 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
22901 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22902 InstructionsState S = Analysis.buildInstructionsState(
22903 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22904 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22905 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22906 bool IsAllowedSize =
22907 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22908 ValOps.size()) ||
22909 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22910 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22911 (!S.getMainOp()->isSafeToRemove() ||
22912 any_of(ValOps.getArrayRef(),
22913 [&](Value *V) {
22914 return !isa<ExtractElementInst>(V) &&
22915 (V->getNumUses() > Chain.size() ||
22916 any_of(V->users(), [&](User *U) {
22917 return !Stores.contains(U);
22918 }));
22919 }))) ||
22920 (ValOps.size() > Chain.size() / 2 && !S)) {
22921 Size = (!IsAllowedSize && S) ? 1 : 2;
22922 return false;
22923 }
22924 }
22925 if (R.isLoadCombineCandidate(Chain))
22926 return true;
22927 R.buildTree(Chain);
22928 // Check if the tree is tiny and the store itself or its value is not vectorized.
22929 if (R.isTreeTinyAndNotFullyVectorizable()) {
22930 if (R.isGathered(Chain.front()) ||
22931 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22932 return std::nullopt;
22933 Size = R.getCanonicalGraphSize();
22934 return false;
22935 }
22936 if (R.isProfitableToReorder()) {
22937 R.reorderTopToBottom();
22938 R.reorderBottomToTop();
22939 }
22940 R.transformNodes();
22941 R.buildExternalUses();
22942
22943 R.computeMinimumValueSizes();
22944
22945 Size = R.getCanonicalGraphSize();
22946 if (S && S.getOpcode() == Instruction::Load)
22947 Size = 2; // cut off masked gather small trees
22948 InstructionCost Cost = R.getTreeCost();
22949
22950 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22951 if (Cost < -SLPCostThreshold) {
22952 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22953
22954 using namespace ore;
22955
22956 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22957 cast<StoreInst>(Chain[0]))
22958 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22959 << " and with tree size "
22960 << NV("TreeSize", R.getTreeSize()));
22961
22962 R.vectorizeTree();
22963 return true;
22964 }
22965
22966 return false;
22967}
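// For illustration only (not part of the pass source): given a chain of four
// consecutive stores
//   store i32 %a, ptr %p
//   store i32 %b, ptr %p+4
//   store i32 %c, ptr %p+8
//   store i32 %d, ptr %p+12
// vectorizeStoreChain() builds a tree rooted at the stores and, if
// getTreeCost() beats the cost threshold (Cost < -SLPCostThreshold), may
// replace them with a single
//   store <4 x i32> %vec, ptr %p
// A std::nullopt return tells the caller that the slice could not be scheduled
// (or its root was gathered), so vectorizeStores() records the VF as
// non-schedulable for this slice rather than merely unprofitable.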
22968
22969/// Checks if the quadratic mean deviation is less than 90% of the mean size.
22970static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22971 bool First) {
22972 unsigned Num = 0;
22973 uint64_t Sum = std::accumulate(
22974 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22975 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22976 unsigned Size = First ? Val.first : Val.second;
22977 if (Size == 1)
22978 return V;
22979 ++Num;
22980 return V + Size;
22981 });
22982 if (Num == 0)
22983 return true;
22984 uint64_t Mean = Sum / Num;
22985 if (Mean == 0)
22986 return true;
22987 uint64_t Dev = std::accumulate(
22988 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22989 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22990 unsigned P = First ? Val.first : Val.second;
22991 if (P == 1)
22992 return V;
22993 return V + (P - Mean) * (P - Mean);
22994 }) /
22995 Num;
22996 return Dev * 96 / (Mean * Mean) == 0;
22997}
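// Worked example (for illustration only): the integer check above accepts a
// slice only when Dev * 96 / (Mean * Mean) truncates to 0, i.e. when the
// recorded tree sizes (ignoring entries equal to 1) are nearly uniform.
// For sizes {4, 4, 4, 4}: Mean = 4, Dev = 0, so the slice is accepted.
// For sizes {2, 8}: Mean = 5, Dev = ((2-5)^2 + (8-5)^2) / 2 = 9, and
// 9 * 96 / 25 = 34 != 0, so the slice is rejected as too heterogeneous to
// rebuild with a single VF.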
22998
22999namespace {
23000
23001/// A group of stores that we'll try to bundle together using vector ops.
23002/// They are ordered using the signed distance of their address operand to the
23003/// address of this group's BaseInstr.
23004class RelatedStoreInsts {
23005public:
23006 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23007 : AllStores(AllStores) {
23008 reset(BaseInstrIdx);
23009 }
23010
23011 void reset(unsigned NewBaseInstr) {
23012 assert(NewBaseInstr < AllStores.size() &&
23013 "Instruction index out of bounds");
23014 BaseInstrIdx = NewBaseInstr;
23015 Instrs.clear();
23016 insertOrLookup(NewBaseInstr, 0);
23017 }
23018
23019 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23020 /// \p PtrDist.
23021 /// Does nothing if there is already a store with that \p PtrDist.
23022 /// \returns The previously associated Instruction index, or std::nullopt
23023 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23024 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23025 return Inserted ? std::nullopt : std::make_optional(It->second);
23026 }
23027
23028 using DistToInstMap = std::map<int64_t, unsigned>;
23029 const DistToInstMap &getStores() const { return Instrs; }
23030
23031 /// If \p SI is related to this group of stores, return the distance of its
23032 /// pointer operand to the one the group's BaseInstr.
23033 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23034 ScalarEvolution &SE) const {
23035 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23036 return getPointersDiff(
23037 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23038 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23039 /*StrictCheck=*/true);
23040 }
23041
23042 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23043 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23044 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23045 int64_t DistFromCurBase) {
23046 DistToInstMap PrevSet = std::move(Instrs);
23047 reset(NewBaseInstIdx);
23048
23049 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23050 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23051 // reference.
23052 for (auto [Dist, InstIdx] : PrevSet) {
23053 if (InstIdx >= MinSafeIdx)
23054 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23055 }
23056 }
23057
23058 /// Remove all stores that have been vectorized from this group.
23059 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23060 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23061 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23062 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23063 });
23064
23065 // Get a forward iterator pointing after the last vectorized store and erase
23066 // all stores before it so we don't try to vectorize them again.
23067 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23068 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23069 }
23070
23071private:
23072 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23073 unsigned BaseInstrIdx;
23074
23075 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23076 DistToInstMap Instrs;
23077
23078 /// Reference to all the stores in the BB being analyzed.
23079 ArrayRef<StoreInst *> AllStores;
23080};
23081
23082} // end anonymous namespace
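// Worked example (for illustration only): with AllStores = {store %a -> %p,
// store %b -> %p+1, store %c -> %p+2} and a group based at index 0,
// getStores() yields the distance-to-index map {0: 0, 1: 1, 2: 2}. A later
// call insertOrLookup(3, 1) for another store that also lands at distance 1
// returns 1 instead of inserting, which signals the caller to flush the
// current group to TryToVectorize and then rebase() it on the new store.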
23083
23084bool SLPVectorizerPass::vectorizeStores(
23085 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23086 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23087 &Visited) {
23088 // We may run into multiple chains that merge into a single chain. We mark the
23089 // stores that we vectorized so that we don't visit the same store twice.
23090 BoUpSLP::ValueSet VectorizedStores;
23091 bool Changed = false;
23092
23093 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23094 int64_t PrevDist = -1;
23095 BoUpSLP::ValueList Operands;
23096 // Collect the chain into a list.
23097 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23098 auto &[Dist, InstIdx] = Data;
23099 if (Operands.empty() || Dist - PrevDist == 1) {
23100 Operands.push_back(Stores[InstIdx]);
23101 PrevDist = Dist;
23102 if (Idx != StoreSeq.size() - 1)
23103 continue;
23104 }
23105 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
23106 Operands.clear();
23107 Operands.push_back(Stores[InstIdx]);
23108 PrevDist = Dist;
23109 });
23110
23111 if (Operands.size() <= 1 ||
23112 !Visited
23113 .insert({Operands.front(),
23114 cast<StoreInst>(Operands.front())->getValueOperand(),
23115 Operands.back(),
23116 cast<StoreInst>(Operands.back())->getValueOperand(),
23117 Operands.size()})
23118 .second)
23119 continue;
23120
23121 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23122 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23123 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23124
23125 unsigned MaxVF =
23126 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23127 auto *Store = cast<StoreInst>(Operands[0]);
23128 Type *StoreTy = Store->getValueOperand()->getType();
23129 Type *ValueTy = StoreTy;
23130 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23131 ValueTy = Trunc->getSrcTy();
23132 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23133 // getStoreMinimumVF only support scalar type as arguments. As a result,
23134 // we need to use the element type of StoreTy and ValueTy to retrieve the
23135 // VF and then transform it back.
23136 // Remember: VF is defined as the number we want to vectorize, not the
23137 // number of elements in the final vector.
23138 Type *StoreScalarTy = StoreTy->getScalarType();
23139 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23140 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23141 ValueTy->getScalarType()));
23142 MinVF /= getNumElements(StoreTy);
23143 MinVF = std::max<unsigned>(2, MinVF);
23144
23145 if (MaxVF < MinVF) {
23146 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23147 << ") < "
23148 << "MinVF (" << MinVF << ")\n");
23149 continue;
23150 }
23151
23152 unsigned NonPowerOf2VF = 0;
23153 if (VectorizeNonPowerOf2) {
23154 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23155 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23156 // lanes are used.
23157 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23158 if (has_single_bit(CandVF + 1)) {
23159 NonPowerOf2VF = CandVF;
23160 assert(NonPowerOf2VF != MaxVF &&
23161 "Non-power-of-2 VF should not be equal to MaxVF");
23162 }
23163 }
23164
23165 // MaxRegVF represents the number of instructions (scalar, or vector in
23166 // case of revec) that can be vectorized to naturally fit in a vector
23167 // register.
23168 unsigned MaxRegVF = MaxVF;
23169
23170 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23171 if (MaxVF < MinVF) {
23172 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23173 << ") < "
23174 << "MinVF (" << MinVF << ")\n");
23175 continue;
23176 }
23177
23178 SmallVector<unsigned> CandidateVFs;
23179 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23180 VF = divideCeil(VF, 2))
23181 CandidateVFs.push_back(VF);
23182
23183 unsigned End = Operands.size();
23184 unsigned Repeat = 0;
23185 constexpr unsigned MaxAttempts = 4;
23186 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23187 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23188 P.first = P.second = 1;
23189 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23190 auto IsNotVectorized = [](bool First,
23191 const std::pair<unsigned, unsigned> &P) {
23192 return First ? P.first > 0 : P.second > 0;
23193 };
23194 auto IsVectorized = [](bool First,
23195 const std::pair<unsigned, unsigned> &P) {
23196 return First ? P.first == 0 : P.second == 0;
23197 };
23198 auto VFIsProfitable = [](bool First, unsigned Size,
23199 const std::pair<unsigned, unsigned> &P) {
23200 return First ? Size >= P.first : Size >= P.second;
23201 };
23202 auto FirstSizeSame = [](unsigned Size,
23203 const std::pair<unsigned, unsigned> &P) {
23204 return Size == P.first;
23205 };
23206 while (true) {
23207 ++Repeat;
23208 bool RepeatChanged = false;
23209 bool AnyProfitableGraph = false;
23210 for (unsigned VF : CandidateVFs) {
23211 AnyProfitableGraph = false;
23212 unsigned FirstUnvecStore =
23213 std::distance(RangeSizes.begin(),
23214 find_if(RangeSizes, std::bind(IsNotVectorized,
23215 VF >= MaxRegVF, _1)));
23216
23217 // Form slices of size VF starting from FirstUnvecStore and try to
23218 // vectorize them.
23219 while (FirstUnvecStore < End) {
23220 unsigned FirstVecStore = std::distance(
23221 RangeSizes.begin(),
23222 find_if(RangeSizes.drop_front(FirstUnvecStore),
23223 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23224 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23225 for (unsigned SliceStartIdx = FirstUnvecStore;
23226 SliceStartIdx + VF <= MaxSliceEnd;) {
23227 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23228 VF >= MaxRegVF)) {
23229 ++SliceStartIdx;
23230 continue;
23231 }
23232 ArrayRef<Value *> Slice =
23233 ArrayRef(Operands).slice(SliceStartIdx, VF);
23234 assert(all_of(Slice,
23235 [&](Value *V) {
23236 return cast<StoreInst>(V)
23237 ->getValueOperand()
23238 ->getType() ==
23239 cast<StoreInst>(Slice.front())
23240 ->getValueOperand()
23241 ->getType();
23242 }) &&
23243 "Expected all operands of same type.");
23244 if (!NonSchedulable.empty()) {
23245 auto [NonSchedSizeMax, NonSchedSizeMin] =
23246 NonSchedulable.lookup(Slice.front());
23247 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23248 // VF is too ambitious. Try to vectorize another slice before
23249 // trying a smaller VF.
23250 SliceStartIdx += NonSchedSizeMax;
23251 continue;
23252 }
23253 }
23254 unsigned TreeSize;
23255 std::optional<bool> Res =
23256 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23257 if (!Res) {
23258 // Update the range of non schedulable VFs for slices starting
23259 // at SliceStartIdx.
23260 NonSchedulable
23261 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23262 .first->getSecond()
23263 .second = VF;
23264 } else if (*Res) {
23265 // Mark the vectorized stores so that we don't vectorize them
23266 // again.
23267 VectorizedStores.insert_range(Slice);
23270 AnyProfitableGraph = RepeatChanged = Changed = true;
23271 // If we vectorized initial block, no need to try to vectorize
23272 // it again.
23273 for (std::pair<unsigned, unsigned> &P :
23274 RangeSizes.slice(SliceStartIdx, VF))
23275 P.first = P.second = 0;
23276 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23277 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23278 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23279 P.first = P.second = 0;
23280 FirstUnvecStore = SliceStartIdx + VF;
23281 }
23282 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23283 for (std::pair<unsigned, unsigned> &P :
23284 RangeSizes.slice(SliceStartIdx + VF,
23285 MaxSliceEnd - (SliceStartIdx + VF)))
23286 P.first = P.second = 0;
23287 if (MaxSliceEnd == End)
23288 End = SliceStartIdx;
23289 MaxSliceEnd = SliceStartIdx;
23290 }
23291 SliceStartIdx += VF;
23292 continue;
23293 }
23294 if (VF > 2 && Res &&
23295 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23296 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23297 _1))) {
23298 SliceStartIdx += VF;
23299 continue;
23300 }
23301 // Check for the very big VFs that we're not rebuilding same
23302 // trees, just with larger number of elements.
23303 if (VF > MaxRegVF && TreeSize > 1 &&
23304 all_of(RangeSizes.slice(SliceStartIdx, VF),
23305 std::bind(FirstSizeSame, TreeSize, _1))) {
23306 SliceStartIdx += VF;
23307 while (SliceStartIdx != MaxSliceEnd &&
23308 RangeSizes[SliceStartIdx].first == TreeSize)
23309 ++SliceStartIdx;
23310 continue;
23311 }
23312 if (TreeSize > 1) {
23313 for (std::pair<unsigned, unsigned> &P :
23314 RangeSizes.slice(SliceStartIdx, VF)) {
23315 if (VF >= MaxRegVF)
23316 P.second = std::max(P.second, TreeSize);
23317 else
23318 P.first = std::max(P.first, TreeSize);
23319 }
23320 }
23321 ++SliceStartIdx;
23322 AnyProfitableGraph = true;
23323 }
23324 if (FirstUnvecStore >= End)
23325 break;
23326 if (MaxSliceEnd - FirstUnvecStore < VF &&
23327 MaxSliceEnd - FirstUnvecStore >= MinVF)
23328 AnyProfitableGraph = true;
23329 FirstUnvecStore = std::distance(
23330 RangeSizes.begin(),
23331 find_if(RangeSizes.drop_front(MaxSliceEnd),
23332 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23333 }
23334 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23335 break;
23336 }
23337 // All values vectorized - exit.
23338 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23339 return P.first == 0 && P.second == 0;
23340 }))
23341 break;
23342 // Check if tried all attempts or no need for the last attempts at all.
23343 if (Repeat >= MaxAttempts ||
23344 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23345 break;
23346 constexpr unsigned StoresLimit = 64;
23347 const unsigned MaxTotalNum = std::min<unsigned>(
23348 Operands.size(),
23349 static_cast<unsigned>(
23350 End -
23351 std::distance(
23352 RangeSizes.begin(),
23353 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23354 1));
23355 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23356 unsigned Limit =
23357 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23358 CandidateVFs.clear();
23359 if (bit_floor(Limit) == VF)
23360 CandidateVFs.push_back(Limit);
23361 if (VF > MaxTotalNum || VF >= StoresLimit)
23362 break;
23363 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23364 if (P.first != 0)
23365 P.first = std::max(P.second, P.first);
23366 }
23367 // Last attempt to vectorize max number of elements, if all previous
23368 // attempts were unsuccessful because of the cost issues.
23369 CandidateVFs.push_back(VF);
23370 }
23371 }
23372 };
23373
23374 /// Groups of stores to vectorize
23375 SmallVector<RelatedStoreInsts> SortedStores;
23376
23377 // Inserts the specified store SI with the given index Idx to the set of the
23378 // stores. If the store with the same distance is found already - stop
23379 // insertion, try to vectorize already found stores. If some stores from this
23380 // sequence were not vectorized - try to vectorize them with the new store
23381 // later. But this logic is applied only to the stores, that come before the
23382 // previous store with the same distance.
23383 // Example:
23384 // 1. store x, %p
23385 // 2. store y, %p+1
23386 // 3. store z, %p+2
23387 // 4. store a, %p
23388 // 5. store b, %p+3
23389 // - Scan this from the last to first store. The very first bunch of stores is
23390 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23391 // vector).
23392 // - The next store in the list - #1 - has the same distance from store #5 as
23393 // the store #4.
23394 // - Try to vectorize sequence of stores 4,2,3,5.
23395 // - If all these stores are vectorized - just drop them.
23396 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23397 // - Start new stores sequence.
23398 // The new bunch of stores is {1, {1, 0}}.
23399 // - Add the stores from previous sequence, that were not vectorized.
23400 // Here we consider the stores in reversed order, rather than the order in
23401 // which they appear in the IR (Stores are already reversed, see vectorizeStoreChains()).
23402 // Store #3 can be added -> comes after store #4 with the same distance as
23403 // store #1.
23404 // Store #5 cannot be added - comes before store #4.
23405 // This logic improves compile time: we assume that the stores coming after
23406 // a previous store with the same distance most likely have memory
23407 // dependencies, so trying to vectorize them would only waste compile time.
23408 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23409 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23410 std::optional<int64_t> PtrDist;
23411 auto *RelatedStores = find_if(
23412 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23413 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23414 return PtrDist.has_value();
23415 });
23416
23417 // We did not find a comparable store, start a new group.
23418 if (RelatedStores == SortedStores.end()) {
23419 SortedStores.emplace_back(Idx, Stores);
23420 return;
23421 }
23422
23423 // If there is already a store in the group with the same PtrDiff, try to
23424 // vectorize the existing instructions before adding the current store.
23425 // Otherwise, insert this store and keep collecting.
23426 if (std::optional<unsigned> PrevInst =
23427 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23428 TryToVectorize(RelatedStores->getStores());
23429 RelatedStores->clearVectorizedStores(VectorizedStores);
23430 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23431 /*NewBaseInstIdx=*/Idx,
23432 /*DistFromCurBase=*/*PtrDist);
23433 }
23434 };
23435 Type *PrevValTy = nullptr;
23436 for (auto [I, SI] : enumerate(Stores)) {
23437 if (R.isDeleted(SI))
23438 continue;
23439 if (!PrevValTy)
23440 PrevValTy = SI->getValueOperand()->getType();
23441 // Check that we do not try to vectorize stores of different types.
23442 if (PrevValTy != SI->getValueOperand()->getType()) {
23443 for (RelatedStoreInsts &StoreSeq : SortedStores)
23444 TryToVectorize(StoreSeq.getStores());
23445 SortedStores.clear();
23446 PrevValTy = SI->getValueOperand()->getType();
23447 }
23448 FillStoresSet(I, SI);
23449 }
23450
23451 // Final vectorization attempt.
23452 for (RelatedStoreInsts &StoreSeq : SortedStores)
23453 TryToVectorize(StoreSeq.getStores());
23454
23455 return Changed;
23456}
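// Worked example (for illustration only): for a contiguous run of 7 i32 stores
// on a target with 256-bit vector registers (and assuming the target does not
// further limit the store VF), MaxRegVF is 8 and MinVF is typically 2. With
// non-power-of-2 vectorization enabled, CandVF = 7 qualifies because 7 + 1 is
// a power of two, so TryToVectorize works through candidate VFs {7, 4, 2}.
// Each VF slides a window over the stores that are still unvectorized, and
// RangeSizes remembers the tree size seen per store so that later, smaller VFs
// can skip windows that would just rebuild the same graph.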
23457
23458void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23459 // Initialize the collections. We will make a single pass over the block.
23460 Stores.clear();
23461 GEPs.clear();
23462
23463 // Visit the store and getelementptr instructions in BB and organize them in
23464 // Stores and GEPs according to the underlying objects of their pointer
23465 // operands.
23466 for (Instruction &I : *BB) {
23467 // Ignore store instructions that are volatile or have a pointer operand
23468 // that doesn't point to a scalar type.
23469 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23470 if (!SI->isSimple())
23471 continue;
23472 if (!isValidElementType(SI->getValueOperand()->getType()))
23473 continue;
23474 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23475 }
23476
23477 // Ignore getelementptr instructions that have more than one index, a
23478 // constant index, or a pointer operand that doesn't point to a scalar
23479 // type.
23480 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23481 if (GEP->getNumIndices() != 1)
23482 continue;
23483 Value *Idx = GEP->idx_begin()->get();
23484 if (isa<Constant>(Idx))
23485 continue;
23486 if (!isValidElementType(Idx->getType()))
23487 continue;
23488 if (GEP->getType()->isVectorTy())
23489 continue;
23490 GEPs[GEP->getPointerOperand()].push_back(GEP);
23491 }
23492 }
23493}
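// For illustration only (not part of the pass source): seeds are bucketed by
// the underlying object of the store's pointer operand, so
//   store i32 %a, ptr %base
//   store i32 %b, ptr %base.off4   ; GEP into %base
//   store float %f, ptr %other
// yields Stores[%base] = {store %a, store %b} and Stores[%other] = {store %f};
// only stores within one bucket are candidates for a chain. GEPs are bucketed
// by their pointer operand, keeping only single-index, non-constant-index,
// scalar GEPs.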
23494
23495bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23496 bool MaxVFOnly) {
23497 if (VL.size() < 2)
23498 return false;
23499
23500 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23501 << VL.size() << ".\n");
23502
23503 // Check that all of the parts are instructions of the same type,
23504 // we permit an alternate opcode via InstructionsState.
23505 InstructionsState S = getSameOpcode(VL, *TLI);
23506 if (!S)
23507 return false;
23508
23509 Instruction *I0 = S.getMainOp();
23510 // Make sure invalid types (including vector type) are rejected before
23511 // determining vectorization factor for scalar instructions.
23512 for (Value *V : VL) {
23513 Type *Ty = V->getType();
23514 if (!isValidElementType(Ty)) {
23515 // NOTE: the following will give the user an internal LLVM type name,
23516 // which may not be useful.
23517 R.getORE()->emit([&]() {
23518 std::string TypeStr;
23519 llvm::raw_string_ostream OS(TypeStr);
23520 Ty->print(OS);
23521 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23522 << "Cannot SLP vectorize list: type "
23523 << TypeStr + " is unsupported by vectorizer";
23524 });
23525 return false;
23526 }
23527 }
23528
23529 Type *ScalarTy = getValueType(VL[0]);
23530 unsigned Sz = R.getVectorElementSize(I0);
23531 unsigned MinVF = R.getMinVF(Sz);
23532 unsigned MaxVF = std::max<unsigned>(
23533 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23534 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23535 if (MaxVF < 2) {
23536 R.getORE()->emit([&]() {
23537 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23538 << "Cannot SLP vectorize list: vectorization factor "
23539 << "less than 2 is not supported";
23540 });
23541 return false;
23542 }
23543
23544 bool Changed = false;
23545 bool CandidateFound = false;
23546 InstructionCost MinCost = SLPCostThreshold.getValue();
23547
23548 unsigned NextInst = 0, MaxInst = VL.size();
23549 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23550 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23551 // No actual vectorization should happen, if number of parts is the same as
23552 // provided vectorization factor (i.e. the scalar type is used for vector
23553 // code during codegen).
23554 auto *VecTy = getWidenedType(ScalarTy, VF);
23555 if (TTI->getNumberOfParts(VecTy) == VF)
23556 continue;
23557 for (unsigned I = NextInst; I < MaxInst; ++I) {
23558 unsigned ActualVF = std::min(MaxInst - I, VF);
23559
23560 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23561 continue;
23562
23563 if (MaxVFOnly && ActualVF < MaxVF)
23564 break;
23565 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23566 break;
23567
23568 SmallVector<Value *> Ops(ActualVF, nullptr);
23569 unsigned Idx = 0;
23570 for (Value *V : VL.drop_front(I)) {
23571 // Check that a previous iteration of this loop did not delete the
23572 // Value.
23573 if (auto *Inst = dyn_cast<Instruction>(V);
23574 !Inst || !R.isDeleted(Inst)) {
23575 Ops[Idx] = V;
23576 ++Idx;
23577 if (Idx == ActualVF)
23578 break;
23579 }
23580 }
23581 // Not enough vectorizable instructions - exit.
23582 if (Idx != ActualVF)
23583 break;
23584
23585 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23586 << "\n");
23587
23588 R.buildTree(Ops);
23589 if (R.isTreeTinyAndNotFullyVectorizable())
23590 continue;
23591 if (R.isProfitableToReorder()) {
23592 R.reorderTopToBottom();
23593 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23594 }
23595 R.transformNodes();
23596 R.buildExternalUses();
23597
23598 R.computeMinimumValueSizes();
23599 InstructionCost Cost = R.getTreeCost();
23600 CandidateFound = true;
23601 MinCost = std::min(MinCost, Cost);
23602
23603 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23604 << " for VF=" << ActualVF << "\n");
23605 if (Cost < -SLPCostThreshold) {
23606 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23607 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23609 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23610 << " and with tree size "
23611 << ore::NV("TreeSize", R.getTreeSize()));
23612
23613 R.vectorizeTree();
23614 // Move to the next bundle.
23615 I += VF - 1;
23616 NextInst = I + 1;
23617 Changed = true;
23618 }
23619 }
23620 }
23621
23622 if (!Changed && CandidateFound) {
23623 R.getORE()->emit([&]() {
23624 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23625 << "List vectorization was possible but not beneficial with cost "
23626 << ore::NV("Cost", MinCost) << " >= "
23627 << ore::NV("Treshold", -SLPCostThreshold);
23628 });
23629 } else if (!Changed) {
23630 R.getORE()->emit([&]() {
23631 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23632 << "Cannot SLP vectorize list: vectorization was impossible"
23633 << " with available vectorization factors";
23634 });
23635 }
23636 return Changed;
23637}
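// Worked example (for illustration only): for a list of six compatible i32
// adds with MinVF = 2 and 128-bit vectors, the outer loop first tries VF = 4
// on the window [0..3]; on success it skips ahead, leaving [4..5] for the next
// iteration, which retries with VF = 2 (unless MaxVFOnly is set, in which case
// only the maximum VF is attempted and leftovers are handled by a later call).
// Instructions deleted by an earlier bundle are skipped when Ops is refilled,
// so a window never reuses a consumed value.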
23638
23639namespace {
23640
23641/// Model horizontal reductions.
23642///
23643/// A horizontal reduction is a tree of reduction instructions that has values
23644/// that can be put into a vector as its leaves. For example:
23645///
23646/// mul mul mul mul
23647/// \ / \ /
23648/// + +
23649/// \ /
23650/// +
23651/// This tree has "mul" as its leaf values and "+" as its reduction
23652/// instructions. A reduction can feed into a store or a binary operation
23653/// feeding a phi.
23654/// ...
23655/// \ /
23656/// +
23657/// |
23658/// phi +=
23659///
23660/// Or:
23661/// ...
23662/// \ /
23663/// +
23664/// |
23665/// *p =
23666///
23667class HorizontalReduction {
23668 using ReductionOpsType = SmallVector<Value *, 16>;
23669 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23670 ReductionOpsListType ReductionOps;
23671 /// List of possibly reduced values.
23672 SmallVector<SmallVector<Value *>> ReducedVals;
23673 /// Maps reduced value to the corresponding reduction operation.
23674 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23675 WeakTrackingVH ReductionRoot;
23676 /// The type of reduction operation.
23677 RecurKind RdxKind;
23678 /// Checks if the optimization of original scalar identity operations on
23679 /// matched horizontal reductions is enabled and allowed.
23680 bool IsSupportedHorRdxIdentityOp = false;
23681 /// The minimum number of the reduced values.
23682 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23683 /// Contains vector values for reduction including their scale factor and
23684 /// signedness.
23685 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23686
23687 static bool isCmpSelMinMax(Instruction *I) {
23688 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23689 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
23690 }
23691
23692 // And/or are potentially poison-safe logical patterns like:
23693 // select x, y, false
23694 // select x, true, y
23695 static bool isBoolLogicOp(Instruction *I) {
23696 return isa<SelectInst>(I) &&
23697 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23698 }
23699
23700 /// Checks if instruction is associative and can be vectorized.
23701 static bool isVectorizable(RecurKind Kind, Instruction *I,
23702 bool TwoElementReduction = false) {
23703 if (Kind == RecurKind::None)
23704 return false;
23705
23706 // Integer ops that map to select instructions or intrinsics are fine.
23707 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23708 isBoolLogicOp(I))
23709 return true;
23710
23711 // No need to check for associativity if there are only 2 reduced values.
23712 if (TwoElementReduction)
23713 return true;
23714
23715 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23716 // FP min/max are associative except for NaN and -0.0. We do not
23717 // have to rule out -0.0 here because the intrinsic semantics do not
23718 // specify a fixed result for it.
23719 return I->getFastMathFlags().noNaNs();
23720 }
23721
23722 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23723 return true;
23724
23725 return I->isAssociative();
23726 }
23727
23728 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23729 // Poison-safe 'or' takes the form: select X, true, Y
23730 // To make that work with the normal operand processing, we skip the
23731 // true value operand.
23732 // TODO: Change the code and data structures to handle this without a hack.
23733 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23734 return I->getOperand(2);
23735 return I->getOperand(Index);
23736 }
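// Illustration (hypothetical IR) of the special case above: for a
// poison-safe 'or' written as
//   %or = select i1 %x, i1 true, i1 %y
// the reduction operands are %x (operand 0) and %y (operand 2), so
// getRdxOperand(%or, 1) returns operand 2 rather than the constant true.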
23737
23738 /// Creates reduction operation with the current opcode.
23739 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23740 Value *RHS, const Twine &Name, bool UseSelect) {
23741 Type *OpTy = LHS->getType();
23742 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23743 switch (Kind) {
23744 case RecurKind::Or: {
23745 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23746 return Builder.CreateSelectWithUnknownProfile(
23747 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23748 RHS, DEBUG_TYPE, Name);
23749 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23750 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23751 Name);
23752 }
23753 case RecurKind::And: {
23754 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23755 return Builder.CreateSelectWithUnknownProfile(
23756 LHS, RHS,
23757 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
23758 DEBUG_TYPE, Name);
23759 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23760 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23761 Name);
23762 }
23763 case RecurKind::Add:
23764 case RecurKind::Mul:
23765 case RecurKind::Xor:
23766 case RecurKind::FAdd:
23767 case RecurKind::FMul: {
23768 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23769 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23770 Name);
23771 }
23772 case RecurKind::SMax:
23773 case RecurKind::SMin:
23774 case RecurKind::UMax:
23775 case RecurKind::UMin:
23776 if (UseSelect) {
23777 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23778 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23779 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
23780 Name);
23781 }
23782 [[fallthrough]];
23783 case RecurKind::FMax:
23784 case RecurKind::FMin:
23785 case RecurKind::FMaximum:
23786 case RecurKind::FMinimum:
23787 case RecurKind::FMaximumNum:
23788 case RecurKind::FMinimumNum: {
23789 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
23790 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23791 }
23792 default:
23793 llvm_unreachable("Unknown reduction operation.");
23794 }
23795 }
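// Illustration (hypothetical IR) of the select-based lowering above for
// RecurKind::UMax when UseSelect is set:
//   %cmp = icmp ugt i32 %lhs, %rhs
//   %res = select i1 %cmp, i32 %lhs, i32 %rhs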
23796
23797 /// Creates reduction operation with the current opcode with the IR flags
23798 /// from \p ReductionOps, dropping nuw/nsw flags.
23799 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23800 Value *RHS, const Twine &Name,
23801 const ReductionOpsListType &ReductionOps) {
23802 bool UseSelect = ReductionOps.size() == 2 ||
23803 // Logical or/and.
23804 (ReductionOps.size() == 1 &&
23805 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23806 assert((!UseSelect || ReductionOps.size() != 2 ||
23807 isa<SelectInst>(ReductionOps[1][0])) &&
23808 "Expected cmp + select pairs for reduction");
23809 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23810 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23811 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23812 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23813 /*IncludeWrapFlags=*/false);
23814 propagateIRFlags(Op, ReductionOps[1], nullptr,
23815 /*IncludeWrapFlags=*/false);
23816 return Op;
23817 }
23818 }
23819 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23820 return Op;
23821 }
23822
23823public:
23824 static RecurKind getRdxKind(Value *V) {
23825 auto *I = dyn_cast<Instruction>(V);
23826 if (!I)
23827 return RecurKind::None;
23828 if (match(I, m_Add(m_Value(), m_Value())))
23829 return RecurKind::Add;
23830 if (match(I, m_Mul(m_Value(), m_Value())))
23831 return RecurKind::Mul;
23832 if (match(I, m_And(m_Value(), m_Value())) ||
23833 match(I, m_LogicalAnd(m_Value(), m_Value())))
23834 return RecurKind::And;
23835 if (match(I, m_Or(m_Value(), m_Value())) ||
23836 match(I, m_LogicalOr(m_Value(), m_Value())))
23837 return RecurKind::Or;
23838 if (match(I, m_Xor(m_Value(), m_Value())))
23839 return RecurKind::Xor;
23840 if (match(I, m_FAdd(m_Value(), m_Value())))
23841 return RecurKind::FAdd;
23842 if (match(I, m_FMul(m_Value(), m_Value())))
23843 return RecurKind::FMul;
23844
23845 if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
23846 return RecurKind::FMax;
23847 if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
23848 return RecurKind::FMin;
23849
23850 if (match(I, m_FMaximum(m_Value(), m_Value())))
23851 return RecurKind::FMaximum;
23852 if (match(I, m_FMinimum(m_Value(), m_Value())))
23853 return RecurKind::FMinimum;
23854 // This matches either cmp+select or intrinsics. SLP is expected to handle
23855 // either form.
23856 // TODO: If we are canonicalizing to intrinsics, we can remove several
23857 // special-case paths that deal with selects.
23858 if (match(I, m_SMax(m_Value(), m_Value())))
23859 return RecurKind::SMax;
23860 if (match(I, m_SMin(m_Value(), m_Value())))
23861 return RecurKind::SMin;
23862 if (match(I, m_UMax(m_Value(), m_Value())))
23863 return RecurKind::UMax;
23864 if (match(I, m_UMin(m_Value(), m_Value())))
23865 return RecurKind::UMin;
23866
23867 if (auto *Select = dyn_cast<SelectInst>(I)) {
23868 // Try harder: look for min/max pattern based on instructions producing
23869 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23870 // During the intermediate stages of SLP, it's very common to have
23871 // pattern like this (since optimizeGatherSequence is run only once
23872 // at the end):
23873 // %1 = extractelement <2 x i32> %a, i32 0
23874 // %2 = extractelement <2 x i32> %a, i32 1
23875 // %cond = icmp sgt i32 %1, %2
23876 // %3 = extractelement <2 x i32> %a, i32 0
23877 // %4 = extractelement <2 x i32> %a, i32 1
23878 // %select = select i1 %cond, i32 %3, i32 %4
23879 CmpPredicate Pred;
23880 Instruction *L1;
23881 Instruction *L2;
23882
23883 Value *LHS = Select->getTrueValue();
23884 Value *RHS = Select->getFalseValue();
23885 Value *Cond = Select->getCondition();
23886
23887 // TODO: Support inverse predicates.
23888 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23889 if (!isa<ExtractElementInst>(RHS) ||
23890 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23891 return RecurKind::None;
23892 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23893 if (!isa<ExtractElementInst>(LHS) ||
23894 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23895 return RecurKind::None;
23896 } else {
23897 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23898 return RecurKind::None;
23899 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23900 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23901 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23902 return RecurKind::None;
23903 }
23904
23905 switch (Pred) {
23906 default:
23907 return RecurKind::None;
23908 case CmpInst::ICMP_SGT:
23909 case CmpInst::ICMP_SGE:
23910 return RecurKind::SMax;
23911 case CmpInst::ICMP_SLT:
23912 case CmpInst::ICMP_SLE:
23913 return RecurKind::SMin;
23914 case CmpInst::ICMP_UGT:
23915 case CmpInst::ICMP_UGE:
23916 return RecurKind::UMax;
23917 case CmpInst::ICMP_ULT:
23918 case CmpInst::ICMP_ULE:
23919 return RecurKind::UMin;
23920 }
23921 }
23922 return RecurKind::None;
23923 }
23924
23925 /// Get the index of the first operand.
23926 static unsigned getFirstOperandIndex(Instruction *I) {
23927 return isCmpSelMinMax(I) ? 1 : 0;
23928 }
23929
23930private:
23931 /// Total number of operands in the reduction operation.
23932 static unsigned getNumberOfOperands(Instruction *I) {
23933 return isCmpSelMinMax(I) ? 3 : 2;
23934 }
23935
23936 /// Checks if the instruction is in basic block \p BB.
23937 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23938 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23939 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23940 auto *Sel = cast<SelectInst>(I);
23941 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23942 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23943 }
23944 return I->getParent() == BB;
23945 }
23946
23947 /// Expected number of uses for reduction operations/reduced values.
23948 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23949 if (IsCmpSelMinMax) {
23950 // The SelectInst must be used twice, while the condition op must have a
23951 // single use only.
23952 if (auto *Sel = dyn_cast<SelectInst>(I))
23953 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23954 return I->hasNUses(2);
23955 }
23956
23957 // Arithmetic reduction operation must be used once only.
23958 return I->hasOneUse();
23959 }
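// Illustration (hypothetical IR) of the use counts checked above for a
// cmp+select min/max chain:
//   %c1 = icmp sgt i32 %a, %b       ; condition: single use (%s1)
//   %s1 = select i1 %c1, i32 %a, i32 %b
//   %c2 = icmp sgt i32 %s1, %d      ; %s1 has exactly 2 uses: %c2 and %s2
//   %s2 = select i1 %c2, i32 %s1, i32 %d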
23960
23961 /// Initializes the list of reduction operations.
23962 void initReductionOps(Instruction *I) {
23963 if (isCmpSelMinMax(I))
23964 ReductionOps.assign(2, ReductionOpsType());
23965 else
23966 ReductionOps.assign(1, ReductionOpsType());
23967 }
23968
23969 /// Add all reduction operations for the reduction instruction \p I.
23970 void addReductionOps(Instruction *I) {
23971 if (isCmpSelMinMax(I)) {
23972 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23973 ReductionOps[1].emplace_back(I);
23974 } else {
23975 ReductionOps[0].emplace_back(I);
23976 }
23977 }
23978
23979 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23980 int Sz = Data.size();
23981 auto *I = dyn_cast<Instruction>(Data.front());
23982 return Sz > 1 || isConstant(Data.front()) ||
23983 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23984 }
23985
23986public:
23987 HorizontalReduction() = default;
23988 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23989 : ReductionRoot(I), ReductionLimit(2) {
23990 RdxKind = HorizontalReduction::getRdxKind(I);
23991 ReductionOps.emplace_back().push_back(I);
23992 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23993 for (Value *V : Ops)
23994 ReducedValsToOps[V].push_back(I);
23995 }
23996
23997 bool matchReductionForOperands() const {
23998 // Analyze "regular" integer/FP types for reductions - no target-specific
23999 // types or pointers.
24000 assert(ReductionRoot && "Reduction root is not set!");
24001 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
24002 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24003 return Ops.size() == 2;
24004 })))
24005 return false;
24006
24007 return true;
24008 }
24009
24010 /// Try to find a reduction tree.
24011 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24012 ScalarEvolution &SE, const DataLayout &DL,
24013 const TargetLibraryInfo &TLI) {
24014 RdxKind = HorizontalReduction::getRdxKind(Root);
24015 if (!isVectorizable(RdxKind, Root))
24016 return false;
24017
24018 // Analyze "regular" integer/FP types for reductions - no target-specific
24019 // types or pointers.
24020 Type *Ty = Root->getType();
24021 if (!isValidElementType(Ty) || Ty->isPointerTy())
24022 return false;
24023
24024 // Though the ultimate reduction may have multiple uses, its condition must
24025 // have only a single use.
24026 if (auto *Sel = dyn_cast<SelectInst>(Root))
24027 if (!Sel->getCondition()->hasOneUse())
24028 return false;
24029
24030 ReductionRoot = Root;
24031
24032 // Iterate through all the operands of the possible reduction tree and
24033 // gather all the reduced values, sorting them by their value id.
24034 BasicBlock *BB = Root->getParent();
24035 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24036 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24037 1, std::make_pair(Root, 0));
24038 // Checks if the operands of the \p TreeN instruction are also reduction
24039 // operations or should be treated as reduced values or an extra argument,
24040 // which is not part of the reduction.
24041 auto CheckOperands = [&](Instruction *TreeN,
24042 SmallVectorImpl<Value *> &PossibleReducedVals,
24043 SmallVectorImpl<Instruction *> &ReductionOps,
24044 unsigned Level) {
24045 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24046 getNumberOfOperands(TreeN)))) {
24047 Value *EdgeVal = getRdxOperand(TreeN, I);
24048 ReducedValsToOps[EdgeVal].push_back(TreeN);
24049 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24050 // If the edge is not an instruction, or it is different from the main
24051 // reduction opcode, or it has too many uses, treat it as a possible
24052 // reduced value. Also, do not try to reduce constant values if the
24053 // operation is not foldable.
24054 if (!EdgeInst || Level > RecursionMaxDepth ||
24055 getRdxKind(EdgeInst) != RdxKind ||
24056 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24057 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24058 !isVectorizable(RdxKind, EdgeInst) ||
24059 (R.isAnalyzedReductionRoot(EdgeInst) &&
24060 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24061 PossibleReducedVals.push_back(EdgeVal);
24062 continue;
24063 }
24064 ReductionOps.push_back(EdgeInst);
24065 }
24066 };
24067 // Try to regroup the reduced values so that it becomes more profitable to
24068 // reduce them. Values are grouped by their value ids, instructions by their
24069 // opcode and/or alternate opcode, plus extra analysis is done for
24070 // loads (grouping them by the distance between pointers) and cmp
24071 // instructions (grouping them by the predicate).
24072 SmallMapVector<
24073 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24074 8>
24075 PossibleReducedVals;
24076 initReductionOps(Root);
24077 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24078 SmallSet<size_t, 2> LoadKeyUsed;
24079
24080 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24081 Key = hash_combine(hash_value(LI->getParent()), Key);
24082 Value *Ptr =
24083 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
24084 if (!LoadKeyUsed.insert(Key).second) {
24085 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24086 if (LIt != LoadsMap.end()) {
24087 for (LoadInst *RLI : LIt->second) {
24088 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24089 LI->getType(), LI->getPointerOperand(), DL, SE,
24090 /*StrictCheck=*/true))
24091 return hash_value(RLI->getPointerOperand());
24092 }
24093 for (LoadInst *RLI : LIt->second) {
24094 if (arePointersCompatible(RLI->getPointerOperand(),
24095 LI->getPointerOperand(), TLI)) {
24096 hash_code SubKey = hash_value(RLI->getPointerOperand());
24097 return SubKey;
24098 }
24099 }
24100 if (LIt->second.size() > 2) {
24101 hash_code SubKey =
24102 hash_value(LIt->second.back()->getPointerOperand());
24103 return SubKey;
24104 }
24105 }
24106 }
24107 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24108 .first->second.push_back(LI);
24109 return hash_value(LI->getPointerOperand());
24110 };
24111
24112 while (!Worklist.empty()) {
24113 auto [TreeN, Level] = Worklist.pop_back_val();
24114 SmallVector<Value *> PossibleRedVals;
24115 SmallVector<Instruction *> PossibleReductionOps;
24116 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24117 addReductionOps(TreeN);
24118 // Add reduction values. The values are sorted for better vectorization
24119 // results.
24120 for (Value *V : PossibleRedVals) {
24121 size_t Key, Idx;
24122 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24123 /*AllowAlternate=*/false);
24124 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24125 }
24126 for (Instruction *I : reverse(PossibleReductionOps))
24127 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24128 }
24129 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24130 // Sort values by the total number of value kinds to start the reduction
24131 // from the longest possible sequences of reduced values.
24132 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24133 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24134 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24135 for (auto &Slice : PossibleRedVals) {
24136 PossibleRedValsVect.emplace_back();
24137 auto RedValsVect = Slice.second.takeVector();
24138 stable_sort(RedValsVect, llvm::less_second());
24139 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24140 PossibleRedValsVect.back().append(Data.second, Data.first);
24141 }
24142 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24143 return P1.size() > P2.size();
24144 });
24145 bool First = true;
24146 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24147 if (First) {
24148 First = false;
24149 ReducedVals.emplace_back();
24150 } else if (!isGoodForReduction(Data)) {
24151 auto *LI = dyn_cast<LoadInst>(Data.front());
24152 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24153 if (!LI || !LastLI ||
24154 getUnderlyingObject(LI->getPointerOperand()) !=
24155 getUnderlyingObject(LastLI->getPointerOperand()))
24156 ReducedVals.emplace_back();
24157 }
24158 ReducedVals.back().append(Data.rbegin(), Data.rend());
24159 }
24160 }
24161 // Sort the groups of reduced values by the number of values with the
24162 // same/alternate opcode and/or pointer operand.
24163 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24164 return P1.size() > P2.size();
24165 });
24166 return true;
24167 }
24168
24169 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24170 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24171 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24172 DominatorTree &DT) {
24173 constexpr unsigned RegMaxNumber = 4;
24174 constexpr unsigned RedValsMaxNumber = 128;
24175 // If there are a sufficient number of reduction values, reduce
24176 // to a nearby power-of-2. We can safely generate oversized
24177 // vectors and rely on the backend to split them to legal sizes.
24178 if (unsigned NumReducedVals = std::accumulate(
24179 ReducedVals.begin(), ReducedVals.end(), 0,
24180 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24181 if (!isGoodForReduction(Vals))
24182 return Num;
24183 return Num + Vals.size();
24184 });
24185 NumReducedVals < ReductionLimit &&
24186 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24187 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24188 })) {
24189 for (ReductionOpsType &RdxOps : ReductionOps)
24190 for (Value *RdxOp : RdxOps)
24191 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24192 return nullptr;
24193 }
24194
24195 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24196 TargetFolder(DL));
24197 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24198
24199 // Track the reduced values in case they are replaced by extractelement
24200 // because of the vectorization.
24201 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24202 ReducedVals.front().size());
24203
24204 // The compare instruction of a min/max is the insertion point for new
24205 // instructions and may be replaced with a new compare instruction.
24206 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24207 assert(isa<SelectInst>(RdxRootInst) &&
24208 "Expected min/max reduction to have select root instruction");
24209 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24210 assert(isa<Instruction>(ScalarCond) &&
24211 "Expected min/max reduction to have compare condition");
24212 return cast<Instruction>(ScalarCond);
24213 };
24214
24215 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24216 return isBoolLogicOp(cast<Instruction>(V));
24217 });
24218 // Return new VectorizedTree, based on previous value.
24219 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24220 if (VectorizedTree) {
24221 // Update the final value in the reduction.
24222 Builder.SetCurrentDebugLocation(
24223 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24224 if (AnyBoolLogicOp) {
24225 auto It = ReducedValsToOps.find(VectorizedTree);
24226 auto It1 = ReducedValsToOps.find(Res);
24227 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24228 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24229 (It != ReducedValsToOps.end() &&
24230 any_of(It->getSecond(), [&](Instruction *I) {
24231 return isBoolLogicOp(I) &&
24232 getRdxOperand(I, 0) == VectorizedTree;
24233 }))) {
24234 ;
24235 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24236 (It1 != ReducedValsToOps.end() &&
24237 any_of(It1->getSecond(), [&](Instruction *I) {
24238 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24239 }))) {
24240 std::swap(VectorizedTree, Res);
24241 } else {
24242 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24243 }
24244 }
24245
24246 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24247 ReductionOps);
24248 }
24249 // Initialize the final value in the reduction.
24250 return Res;
24251 };
24252 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24253 ReductionOps.front().size());
24254 for (ReductionOpsType &RdxOps : ReductionOps)
24255 for (Value *RdxOp : RdxOps) {
24256 if (!RdxOp)
24257 continue;
24258 IgnoreList.insert(RdxOp);
24259 }
24260 // Intersect the fast-math-flags from all reduction operations.
24261 FastMathFlags RdxFMF;
24262 RdxFMF.set();
24263 for (Value *U : IgnoreList)
24264 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24265 RdxFMF &= FPMO->getFastMathFlags();
24266 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24267
24268 // Need to track reduced vals, they may be changed during vectorization of
24269 // subvectors.
24270 for (ArrayRef<Value *> Candidates : ReducedVals)
24271 for (Value *V : Candidates)
24272 TrackedVals.try_emplace(V, V);
24273
24274 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24275 Value *V) -> unsigned & {
24276 auto *It = MV.find(V);
24277 assert(It != MV.end() && "Unable to find given key.");
24278 return It->second;
24279 };
24280
24281 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24282 // List of the values that were reduced in other trees as part of gather
24283 // nodes and thus require an extract if fully vectorized in other trees.
24284 SmallPtrSet<Value *, 4> RequiredExtract;
24285 WeakTrackingVH VectorizedTree = nullptr;
24286 bool CheckForReusedReductionOps = false;
24287 // Try to vectorize elements based on their type.
24288 SmallVector<InstructionsState> States;
24289 for (ArrayRef<Value *> RV : ReducedVals)
24290 States.push_back(getSameOpcode(RV, TLI));
24291 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24292 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24293 InstructionsState S = States[I];
24294 SmallVector<Value *> Candidates;
24295 Candidates.reserve(2 * OrigReducedVals.size());
24296 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24297 for (Value *ReducedVal : OrigReducedVals) {
24298 Value *RdxVal = TrackedVals.at(ReducedVal);
24299 // Check if the reduction value was not overridden by the extractelement
24300 // instruction because of the vectorization and exclude it, if it is not
24301 // compatible with other values.
24302 // Also check if the instruction was folded to constant/other value.
24303 auto *Inst = dyn_cast<Instruction>(RdxVal);
24304 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24305 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24306 (S && !Inst))
24307 continue;
24308 Candidates.push_back(RdxVal);
24309 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24310 }
24311 bool ShuffledExtracts = false;
24312 // Try to handle shuffled extractelements.
24313 if (S && S.getOpcode() == Instruction::ExtractElement &&
24314 !S.isAltShuffle() && I + 1 < E) {
24315 SmallVector<Value *> CommonCandidates(Candidates);
24316 for (Value *RV : ReducedVals[I + 1]) {
24317 Value *RdxVal = TrackedVals.at(RV);
24318 // Check if the reduction value was not overridden by the
24319 // extractelement instruction because of the vectorization and
24320 // exclude it, if it is not compatible with other values.
24321 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24322 if (!Inst)
24323 continue;
24324 CommonCandidates.push_back(RdxVal);
24325 TrackedToOrig.try_emplace(RdxVal, RV);
24326 }
24327 SmallVector<int> Mask;
24328 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24329 ++I;
24330 Candidates.swap(CommonCandidates);
24331 ShuffledExtracts = true;
24332 }
24333 }
24334
24335 // Emit code for constant values.
24336 if (Candidates.size() > 1 && allConstant(Candidates)) {
24337 Value *Res = Candidates.front();
24338 Value *OrigV = TrackedToOrig.at(Candidates.front());
24339 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24340 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24341 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24342 Value *OrigV = TrackedToOrig.at(VC);
24343 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24344 if (auto *ResI = dyn_cast<Instruction>(Res))
24345 V.analyzedReductionRoot(ResI);
24346 }
24347 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24348 continue;
24349 }
24350
24351 unsigned NumReducedVals = Candidates.size();
24352 if (NumReducedVals < ReductionLimit &&
24353 (NumReducedVals < 2 || !isSplat(Candidates)))
24354 continue;
24355
24356 // Check if we support repeated scalar values processing (optimization of
24357 // original scalar identity operations on matched horizontal reductions).
24358 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24359 RdxKind != RecurKind::FMul &&
24360 RdxKind != RecurKind::FMulAdd;
24361 // Gather same values.
24362 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24363 if (IsSupportedHorRdxIdentityOp)
24364 for (Value *V : Candidates) {
24365 Value *OrigV = TrackedToOrig.at(V);
24366 ++SameValuesCounter.try_emplace(OrigV).first->second;
24367 }
24368 // Used to check if the reduced values are used the same number of times. In
24369 // this case the compiler may produce better code. E.g. if the reduced values
24370 // are aabbccdd (8 x values), then the first node of the tree will have a node
24371 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24372 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24373 // Instead, the compiler may build the <4 x abcd> tree immediately and then
24374 // multiply the (4 x abcd) reduction result by 2.
24375 // Currently this only handles add/fadd/xor; and/or/min/max do not require
24376 // this analysis, while other operations may require an extra estimation of
24377 // the profitability.
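// Worked example (illustrative): for an add reduction of aabbccdd both
// strategies compute the same value, since
//   (a + a) + (b + b) + (c + c) + (d + d) == 2 * (a + b + c + d),
// but the second form only needs a 4-wide tree plus one scalar multiply.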
24378 bool SameScaleFactor = false;
24379 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24380 SameValuesCounter.size() != Candidates.size();
24381 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24382 if (OptReusedScalars) {
24383 SameScaleFactor =
24384 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24385 RdxKind == RecurKind::Xor) &&
24386 all_of(drop_begin(SameValuesCounter),
24387 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24388 return P.second == SameValuesCounter.front().second;
24389 });
24390 Candidates.resize(SameValuesCounter.size());
24391 transform(SameValuesCounter, Candidates.begin(),
24392 [&](const auto &P) { return TrackedVals.at(P.first); });
24393 NumReducedVals = Candidates.size();
24394 // Have a reduction of the same element.
24395 if (NumReducedVals == 1) {
24396 Value *OrigV = TrackedToOrig.at(Candidates.front());
24397 unsigned Cnt = At(SameValuesCounter, OrigV);
24398 Value *RedVal =
24399 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24400 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24401 VectorizedVals.try_emplace(OrigV, Cnt);
24402 ExternallyUsedValues.insert(OrigV);
24403 continue;
24404 }
24405 }
24406
24407 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24408 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24409 const unsigned MaxElts = std::clamp<unsigned>(
24410 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24411 RegMaxNumber * RedValsMaxNumber);
24412
24413 unsigned ReduxWidth = NumReducedVals;
24414 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24415 unsigned NumParts, NumRegs;
24416 Type *ScalarTy = Candidates.front()->getType();
24417 ReduxWidth =
24418 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24419 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24420 NumParts = ::getNumberOfParts(TTI, Tp);
24421 NumRegs =
24422 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24423 while (NumParts > NumRegs) {
24424 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24425 ReduxWidth = bit_floor(ReduxWidth - 1);
24426 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24427 NumParts = ::getNumberOfParts(TTI, Tp);
24428 NumRegs =
24429 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24430 }
24431 if (NumParts > NumRegs / 2)
24432 ReduxWidth = bit_floor(ReduxWidth);
24433 return ReduxWidth;
24434 };
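// Illustrative (hypothetical) walk-through of the lambda above: for 24
// reduced values it starts from the widest "full" vector count not above
// 24; while that vector would need more parts than the target has vector
// registers, the width shrinks to bit_floor(width - 1) (e.g. 24 -> 16 -> 8)
// and is re-checked; a final check rounds a width that would still occupy
// more than half of the registers down to a power of two.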
24435 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24436 ReduxWidth = GetVectorFactor(ReduxWidth);
24437 ReduxWidth = std::min(ReduxWidth, MaxElts);
24438
24439 unsigned Start = 0;
24440 unsigned Pos = Start;
24441 // Restarts vectorization attempt with lower vector factor.
24442 unsigned PrevReduxWidth = ReduxWidth;
24443 bool CheckForReusedReductionOpsLocal = false;
24444 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24445 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24446 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24447 // Check if any of the reduction ops are gathered. If so, it is worth
24448 // trying again with a smaller number of reduction ops.
24449 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24450 }
24451 ++Pos;
24452 if (Pos < NumReducedVals - ReduxWidth + 1)
24453 return IsAnyRedOpGathered;
24454 Pos = Start;
24455 --ReduxWidth;
24456 if (ReduxWidth > 1)
24457 ReduxWidth = GetVectorFactor(ReduxWidth);
24458 return IsAnyRedOpGathered;
24459 };
24460 bool AnyVectorized = false;
24461 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24462 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24463 ReduxWidth >= ReductionLimit) {
24464 // Dependency in tree of the reduction ops - drop this attempt, try
24465 // later.
24466 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24467 Start == 0) {
24468 CheckForReusedReductionOps = true;
24469 break;
24470 }
24471 PrevReduxWidth = ReduxWidth;
24472 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24473 // Been analyzed already - skip.
24474 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24475 (!has_single_bit(ReduxWidth) &&
24476 (IgnoredCandidates.contains(
24477 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24478 IgnoredCandidates.contains(
24479 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24480 bit_floor(ReduxWidth))))) ||
24481 V.areAnalyzedReductionVals(VL)) {
24482 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24483 continue;
24484 }
24485 // Early exit if any of the reduction values were deleted during
24486 // previous vectorization attempts.
24487 if (any_of(VL, [&V](Value *RedVal) {
24488 auto *RedValI = dyn_cast<Instruction>(RedVal);
24489 return RedValI && V.isDeleted(RedValI);
24490 }))
24491 break;
24492 V.buildTree(VL, IgnoreList);
24493 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24494 if (!AdjustReducedVals())
24495 V.analyzedReductionVals(VL);
24496 continue;
24497 }
24498 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24499 if (!AdjustReducedVals())
24500 V.analyzedReductionVals(VL);
24501 continue;
24502 }
24503 V.reorderTopToBottom();
24504 // No need to reorder the root node at all for reassociative reduction.
24505 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24506 VL.front()->getType()->isIntOrIntVectorTy() ||
24507 ReductionLimit > 2);
24508 // Keep extracted other reduction values, if they are used in the
24509 // vectorization trees.
24510 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24511 ExternallyUsedValues);
24512 // The reduction root is used as the insertion point for new
24513 // instructions, so set it as externally used to prevent it from being
24514 // deleted.
24515 LocalExternallyUsedValues.insert(ReductionRoot);
24516 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24517 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24518 continue;
24519 for (Value *V : ReducedVals[Cnt])
24520 if (isa<Instruction>(V))
24521 LocalExternallyUsedValues.insert(TrackedVals[V]);
24522 }
24523 if (!IsSupportedHorRdxIdentityOp) {
24524 // Number of uses of the candidates in the vector of values.
24525 assert(SameValuesCounter.empty() &&
24526 "Reused values counter map is not empty");
24527 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24528 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24529 continue;
24530 Value *V = Candidates[Cnt];
24531 Value *OrigV = TrackedToOrig.at(V);
24532 ++SameValuesCounter.try_emplace(OrigV).first->second;
24533 }
24534 }
24535 V.transformNodes();
24536 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24537 // Gather externally used values.
24538 SmallPtrSet<Value *, 4> Visited;
24539 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24540 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24541 continue;
24542 Value *RdxVal = Candidates[Cnt];
24543 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24544 RdxVal = It->second;
24545 if (!Visited.insert(RdxVal).second)
24546 continue;
24547 // Check if the scalar was vectorized as part of the vectorization
24548 // tree but not the top node.
24549 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24550 LocalExternallyUsedValues.insert(RdxVal);
24551 continue;
24552 }
24553 Value *OrigV = TrackedToOrig.at(RdxVal);
24554 unsigned NumOps =
24555 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24556 if (NumOps != ReducedValsToOps.at(OrigV).size())
24557 LocalExternallyUsedValues.insert(RdxVal);
24558 }
24559 // Do not need the list of reused scalars in regular mode anymore.
24560 if (!IsSupportedHorRdxIdentityOp)
24561 SameValuesCounter.clear();
24562 for (Value *RdxVal : VL)
24563 if (RequiredExtract.contains(RdxVal))
24564 LocalExternallyUsedValues.insert(RdxVal);
24565 V.buildExternalUses(LocalExternallyUsedValues);
24566
24567 V.computeMinimumValueSizes();
24568
24569 // Estimate cost.
24570 InstructionCost ReductionCost =
24571 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24572 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24573 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24574 << " for reduction\n");
24575 if (!Cost.isValid())
24576 break;
24577 if (Cost >= -SLPCostThreshold) {
24578 V.getORE()->emit([&]() {
24579 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24580 ReducedValsToOps.at(VL[0]).front())
24581 << "Vectorizing horizontal reduction is possible "
24582 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24583 << " and threshold "
24584 << ore::NV("Threshold", -SLPCostThreshold);
24585 });
24586 if (!AdjustReducedVals()) {
24587 V.analyzedReductionVals(VL);
24588 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24589 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24590 // Add subvectors of VL to the list of the analyzed values.
24591 for (unsigned VF = getFloorFullVectorNumberOfElements(
24592 *TTI, VL.front()->getType(), ReduxWidth - 1);
24593 VF >= ReductionLimit;
24594 VF = getFloorFullVectorNumberOfElements(
24595 *TTI, VL.front()->getType(), VF - 1)) {
24596 if (has_single_bit(VF) &&
24597 V.getCanonicalGraphSize() != V.getTreeSize())
24598 continue;
24599 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24600 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24601 }
24602 }
24603 }
24604 continue;
24605 }
24606
24607 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24608 << Cost << ". (HorRdx)\n");
24609 V.getORE()->emit([&]() {
24610 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24611 ReducedValsToOps.at(VL[0]).front())
24612 << "Vectorized horizontal reduction with cost "
24613 << ore::NV("Cost", Cost) << " and with tree size "
24614 << ore::NV("TreeSize", V.getTreeSize());
24615 });
24616
24617 Builder.setFastMathFlags(RdxFMF);
24618
24619 // Emit a reduction. If the root is a select (min/max idiom), the insert
24620 // point is the compare condition of that select.
24621 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24622 Instruction *InsertPt = RdxRootInst;
24623 if (IsCmpSelMinMax)
24624 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24625
24626 // Vectorize a tree.
24627 Value *VectorizedRoot = V.vectorizeTree(
24628 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24629 // Update TrackedToOrig mapping, since the tracked values might be
24630 // updated.
24631 for (Value *RdxVal : Candidates) {
24632 Value *OrigVal = TrackedToOrig.at(RdxVal);
24633 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24634 if (TransformedRdxVal != RdxVal)
24635 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24636 }
24637
24638 Builder.SetInsertPoint(InsertPt);
24639
24640 // To prevent poison from leaking across what used to be sequential,
24641 // safe, scalar boolean logic operations, the reduction operand must be
24642 // frozen.
24643 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24644 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24645
24646 // Emit code to correctly handle reused reduced values, if required.
24647 if (OptReusedScalars && !SameScaleFactor) {
24648 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24649 SameValuesCounter, TrackedToOrig);
24650 }
24651
24652 Type *ScalarTy = VL.front()->getType();
24653 Type *VecTy = VectorizedRoot->getType();
24654 Type *RedScalarTy = VecTy->getScalarType();
24655 VectorValuesAndScales.emplace_back(
24656 VectorizedRoot,
24657 OptReusedScalars && SameScaleFactor
24658 ? SameValuesCounter.front().second
24659 : 1,
24660 RedScalarTy != ScalarTy->getScalarType()
24661 ? V.isSignedMinBitwidthRootNode()
24662 : true);
24663
24664 // Count vectorized reduced values to exclude them from final reduction.
24665 for (Value *RdxVal : VL) {
24666 Value *OrigV = TrackedToOrig.at(RdxVal);
24667 if (IsSupportedHorRdxIdentityOp) {
24668 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24669 continue;
24670 }
24671 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24672 if (!V.isVectorized(RdxVal))
24673 RequiredExtract.insert(RdxVal);
24674 }
24675 Pos += ReduxWidth;
24676 Start = Pos;
24677 ReduxWidth = NumReducedVals - Pos;
24678 if (ReduxWidth > 1)
24679 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24680 AnyVectorized = true;
24681 }
24682 if (OptReusedScalars && !AnyVectorized) {
24683 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24684 Value *RdxVal = TrackedVals.at(P.first);
24685 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24686 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24687 VectorizedVals.try_emplace(P.first, P.second);
24688 }
24689 continue;
24690 }
24691 }
24692 if (!VectorValuesAndScales.empty())
24693 VectorizedTree = GetNewVectorizedTree(
24694 VectorizedTree,
24695 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24696
24697 if (!VectorizedTree) {
24698 if (!CheckForReusedReductionOps) {
24699 for (ReductionOpsType &RdxOps : ReductionOps)
24700 for (Value *RdxOp : RdxOps)
24701 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24702 }
24703 return nullptr;
24704 }
24705
24706 // Reorder the operands of a bool logical op into the natural order to avoid
24707 // possible problems with poison propagation. If it is not possible to
24708 // reorder (both operands are originally RHS), emit an extra freeze
24709 // instruction for the LHS operand.
24710 // I.e., if we have original code like this:
24711 // RedOp1 = select i1 ?, i1 LHS, i1 false
24712 // RedOp2 = select i1 RHS, i1 ?, i1 false
24713
24714 // Then, we swap LHS/RHS to create a new op that matches the poison
24715 // semantics of the original code.
24716
24717 // If we have original code like this and both values could be poison:
24718 // RedOp1 = select i1 ?, i1 LHS, i1 false
24719 // RedOp2 = select i1 ?, i1 RHS, i1 false
24720
24721 // Then, we must freeze LHS in the new op.
24722 auto FixBoolLogicalOps =
24723 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24724 Instruction *RedOp2, bool InitStep) {
24725 if (!AnyBoolLogicOp)
24726 return;
24727 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24728 getRdxOperand(RedOp1, 0) == LHS ||
24729 isGuaranteedNotToBePoison(LHS, AC)))
24730 return;
24731 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24732 getRdxOperand(RedOp2, 0) == RHS ||
24733 isGuaranteedNotToBePoison(RHS, AC))) {
24734 std::swap(LHS, RHS);
24735 return;
24736 }
24737 if (LHS != VectorizedTree)
24738 LHS = Builder.CreateFreeze(LHS);
24739 };
24740 // Finish the reduction.
24741 // Need to add the extra arguments and the possible reduction values that were not vectorized.
24742 // Try to avoid dependencies between the scalar remainders after reductions.
24743 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24744 bool InitStep) {
24745 unsigned Sz = InstVals.size();
24746 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24747 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24748 Instruction *RedOp = InstVals[I + 1].first;
24749 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24750 Value *RdxVal1 = InstVals[I].second;
24751 Value *StableRdxVal1 = RdxVal1;
24752 auto It1 = TrackedVals.find(RdxVal1);
24753 if (It1 != TrackedVals.end())
24754 StableRdxVal1 = It1->second;
24755 Value *RdxVal2 = InstVals[I + 1].second;
24756 Value *StableRdxVal2 = RdxVal2;
24757 auto It2 = TrackedVals.find(RdxVal2);
24758 if (It2 != TrackedVals.end())
24759 StableRdxVal2 = It2->second;
24760 // To prevent poison from leaking across what used to be sequential,
24761 // safe, scalar boolean logic operations, the reduction operand must be
24762 // frozen.
24763 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24764 RedOp, InitStep);
24765 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24766 StableRdxVal2, "op.rdx", ReductionOps);
24767 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24768 }
24769 if (Sz % 2 == 1)
24770 ExtraReds[Sz / 2] = InstVals.back();
24771 return ExtraReds;
24772 };
24773 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24774 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24775 VectorizedTree);
24776 SmallPtrSet<Value *, 8> Visited;
24777 for (ArrayRef<Value *> Candidates : ReducedVals) {
24778 for (Value *RdxVal : Candidates) {
24779 if (!Visited.insert(RdxVal).second)
24780 continue;
24781 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24782 for (Instruction *RedOp :
24783 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24784 ExtraReductions.emplace_back(RedOp, RdxVal);
24785 }
24786 }
24787 // Iterate through all not-vectorized reduction values/extra arguments.
24788 bool InitStep = true;
24789 while (ExtraReductions.size() > 1) {
24790 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24791 FinalGen(ExtraReductions, InitStep);
24792 ExtraReductions.swap(NewReds);
24793 InitStep = false;
24794 }
24795 VectorizedTree = ExtraReductions.front().second;
24796
24797 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24798
24799 // The original scalar reduction is expected to have no remaining
24800 // uses outside the reduction tree itself. Assert that we got this
24801 // correct, replace internal uses with poison, and mark for eventual
24802 // deletion.
24803#ifndef NDEBUG
24804 SmallPtrSet<Value *, 4> IgnoreSet;
24805 for (ArrayRef<Value *> RdxOps : ReductionOps)
24806 IgnoreSet.insert_range(RdxOps);
24807#endif
24808 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24809 for (Value *Ignore : RdxOps) {
24810 if (!Ignore)
24811 continue;
24812#ifndef NDEBUG
24813 for (auto *U : Ignore->users()) {
24814 assert(IgnoreSet.count(U) &&
24815 "All users must be in the reduction ops list.");
24816 }
24817#endif
24818 if (!Ignore->use_empty()) {
24819 Value *P = PoisonValue::get(Ignore->getType());
24820 Ignore->replaceAllUsesWith(P);
24821 }
24822 }
24823 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24824 }
24825 return VectorizedTree;
24826 }
24827
24828private:
24829 /// Creates the reduction from the given \p Vec vector value with the given
24830 /// scale \p Scale and signedness \p IsSigned.
24831 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24832 Value *Vec, unsigned Scale, bool IsSigned,
24833 Type *DestTy) {
24834 Value *Rdx;
24835 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24836 unsigned DestTyNumElements = getNumElements(VecTy);
24837 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24838 Rdx = PoisonValue::get(
24839 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24840 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24841 // Do reduction for each lane.
24842 // e.g., do reduce add for
24843 // VL[0] = <4 x Ty> <a, b, c, d>
24844 // VL[1] = <4 x Ty> <e, f, g, h>
24845 // Lane[0] = <2 x Ty> <a, e>
24846 // Lane[1] = <2 x Ty> <b, f>
24847 // Lane[2] = <2 x Ty> <c, g>
24848 // Lane[3] = <2 x Ty> <d, h>
24849 // result[0] = reduce add Lane[0]
24850 // result[1] = reduce add Lane[1]
24851 // result[2] = reduce add Lane[2]
24852 // result[3] = reduce add Lane[3]
24853 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24854 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24855 Rdx = Builder.CreateInsertElement(
24856 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24857 }
24858 } else {
24859 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24860 }
24861 if (Rdx->getType() != DestTy)
24862 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24863 // Improved analysis for add/fadd/xor reductions with same scale
24864 // factor for all operands of reductions. We can emit scalar ops for
24865 // them instead.
24866 if (Scale > 1)
24867 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24868 return Rdx;
24869 }
24870
24871 /// Calculate the cost of a reduction.
24872 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24873 ArrayRef<Value *> ReducedVals,
24874 bool IsCmpSelMinMax, FastMathFlags FMF,
24875 const BoUpSLP &R, DominatorTree &DT,
24876 const DataLayout &DL,
24877 const TargetLibraryInfo &TLI) {
24878 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24879 Type *ScalarTy = ReducedVals.front()->getType();
24880 unsigned ReduxWidth = ReducedVals.size();
24881 FixedVectorType *VectorTy = R.getReductionType();
24882 InstructionCost VectorCost = 0, ScalarCost;
24883 // If all of the reduced values are constant, the vector cost is 0, since
24884 // the reduction value can be calculated at compile time.
24885 bool AllConsts = allConstant(ReducedVals);
24886 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24887 InstructionCost Cost = 0;
24888 // Scalar cost is repeated for N-1 elements.
24889 int Cnt = ReducedVals.size();
24890 for (Value *RdxVal : ReducedVals) {
24891 if (Cnt == 1)
24892 break;
24893 --Cnt;
24894 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24895 Cost += GenCostFn();
24896 continue;
24897 }
24898 InstructionCost ScalarCost = 0;
24899 for (User *U : RdxVal->users()) {
24900 auto *RdxOp = cast<Instruction>(U);
24901 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24902 if (RdxKind == RecurKind::FAdd) {
24903 InstructionCost FMACost = canConvertToFMA(
24904 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24905 if (FMACost.isValid()) {
24906 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24907 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24908 // Also, exclude scalar fmul cost.
24909 InstructionCost FMulCost =
24910 TTI->getInstructionCost(I, CostKind);
24911 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24912 FMACost -= FMulCost;
24913 }
24914 ScalarCost += FMACost;
24915 continue;
24916 }
24917 }
24918 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24919 continue;
24920 }
24921 ScalarCost = InstructionCost::getInvalid();
24922 break;
24923 }
24924 if (ScalarCost.isValid())
24925 Cost += ScalarCost;
24926 else
24927 Cost += GenCostFn();
24928 }
24929 return Cost;
24930 };
24931 // Require the reduction cost if:
24932 // 1. This type is not a full register type and there are no other vectors
24933 // with the same type in the storage (first vector with a small type).
24934 // 2. The storage does not have any vector with full vector use (first
24935 // vector with full register use).
24936 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24937 switch (RdxKind) {
24938 case RecurKind::Add:
24939 case RecurKind::Mul:
24940 case RecurKind::Or:
24941 case RecurKind::And:
24942 case RecurKind::Xor:
24943 case RecurKind::FAdd:
24944 case RecurKind::FMul: {
24945 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24946 if (!AllConsts) {
24947 if (DoesRequireReductionOp) {
24948 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24949 assert(SLPReVec && "FixedVectorType is not expected.");
24950 unsigned ScalarTyNumElements = VecTy->getNumElements();
24951 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24952 VectorCost += TTI->getShuffleCost(
24955 ReducedVals.size()),
24956 VectorTy,
24957 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24958 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24959 FMF, CostKind);
24960 }
24961 VectorCost += TTI->getScalarizationOverhead(
24962 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24963 /*Extract*/ false, TTI::TCK_RecipThroughput);
24964 } else {
24965 Type *RedTy = VectorTy->getElementType();
24966 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24967 std::make_pair(RedTy, true));
24968 if (RType == RedTy) {
24969 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24970 FMF, CostKind);
24971 } else {
24972 VectorCost = TTI->getExtendedReductionCost(
24973 RdxOpcode, !IsSigned, RedTy,
24974 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24975 }
24976 }
24977 } else {
24978 Type *RedTy = VectorTy->getElementType();
24979 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24980 std::make_pair(RedTy, true));
24981 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24982 InstructionCost FMACost = InstructionCost::getInvalid();
24983 if (RdxKind == RecurKind::FAdd) {
24984 // Check if the reduction operands can be converted to FMA.
24985 SmallVector<Value *> Ops;
24986 FastMathFlags FMF;
24987 FMF.set();
24988 for (Value *RdxVal : ReducedVals) {
24989 if (!RdxVal->hasOneUse()) {
24990 Ops.clear();
24991 break;
24992 }
24993 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24994 FMF &= FPCI->getFastMathFlags();
24995 Ops.push_back(RdxVal->user_back());
24996 }
24997 if (!Ops.empty()) {
24998 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24999 *TTI, TLI);
25000 if (FMACost.isValid()) {
25001 // Calculate actual FMAD cost.
25002 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25003 {RVecTy, RVecTy, RVecTy}, FMF);
25004 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
25005
25006 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
25007 // Also, exclude vector fmul cost.
25008 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
25009 Instruction::FMul, RVecTy, CostKind);
25010 LLVM_DEBUG(dbgs()
25011 << "Minus vector FMul cost: " << FMulCost << "\n");
25012 FMACost -= FMulCost;
25013 }
25014 }
25015 }
25016 if (FMACost.isValid())
25017 VectorCost += FMACost;
25018 else
25019 VectorCost +=
25020 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
25021 if (RType != RedTy) {
25022 unsigned Opcode = Instruction::Trunc;
25023 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25024 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25025 VectorCost += TTI->getCastInstrCost(
25026 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25027 }
25028 }
25029 }
25030 ScalarCost = EvaluateScalarCost([&]() {
25031 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
25032 });
25033 break;
25034 }
25035 case RecurKind::FMax:
25036 case RecurKind::FMin:
25037 case RecurKind::FMaximum:
25038 case RecurKind::FMinimum:
25039 case RecurKind::SMax:
25040 case RecurKind::SMin:
25041 case RecurKind::UMax:
25042 case RecurKind::UMin: {
25043 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
25044 if (!AllConsts) {
25045 if (DoesRequireReductionOp) {
25046 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
25047 } else {
25048 // Check if the previous reduction already exists and account for it as a
25049 // series of operations + single reduction.
25050 Type *RedTy = VectorTy->getElementType();
25051 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25052 std::make_pair(RedTy, true));
25053 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25054 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25055 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
25056 if (RType != RedTy) {
25057 unsigned Opcode = Instruction::Trunc;
25058 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25059 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25060 VectorCost += TTI->getCastInstrCost(
25061 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25062 }
25063 }
25064 }
25065 ScalarCost = EvaluateScalarCost([&]() {
25066 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25067 return TTI->getIntrinsicInstrCost(ICA, CostKind);
25068 });
25069 break;
25070 }
25071 default:
25072 llvm_unreachable("Expected arithmetic or min/max reduction operation");
25073 }
25074
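// Note (illustrative): the value returned below is a delta, e.g. a vector
// cost of 3 against a scalar cost of 7 yields -4; the caller treats the
// reduction as profitable only if the total tree cost ends up below
// -SLPCostThreshold.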
25075 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25076 << " for reduction of " << shortBundleName(ReducedVals)
25077 << " (It is a splitting reduction)\n");
25078 return VectorCost - ScalarCost;
25079 }
25080
25081 /// Splits the values, stored in VectorValuesAndScales, into registers/free
25082 /// sub-registers, combines them with the given reduction operation as a
25083 /// vector operation and then performs single (small enough) reduction.
25084 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25085 Type *DestTy) {
25086 Value *ReducedSubTree = nullptr;
25087 // Creates reduction and combines with the previous reduction.
25088 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25089 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25090 if (ReducedSubTree)
25091 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25092 "op.rdx", ReductionOps);
25093 else
25094 ReducedSubTree = Rdx;
25095 };
25096 if (VectorValuesAndScales.size() == 1) {
25097 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25098 CreateSingleOp(Vec, Scale, IsSigned);
25099 return ReducedSubTree;
25100 }
25101 // Scales Vec using the given Cnt scale factor and then performs a vector
25102 // combine with the previous value of VecRes.
25103 Value *VecRes = nullptr;
25104 bool VecResSignedness = false;
25105 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25106 Type *ScalarTy = Vec->getType()->getScalarType();
25107 // Scale Vec using given Cnt scale factor.
25108 if (Cnt > 1) {
25109 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25110 switch (RdxKind) {
25111 case RecurKind::Add: {
25112 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25113 unsigned VF = getNumElements(Vec->getType());
25114 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
25115 << ". (HorRdx)\n");
25116 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25117 for (unsigned I : seq<unsigned>(Cnt))
25118 std::iota(std::next(Mask.begin(), VF * I),
25119 std::next(Mask.begin(), VF * (I + 1)), 0);
25120 ++NumVectorInstructions;
25121 Vec = Builder.CreateShuffleVector(Vec, Mask);
25122 break;
25123 }
25124 // res = mul vv, n
25125 if (ScalarTy != DestTy->getScalarType())
25126 Vec = Builder.CreateIntCast(
25127 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25128 IsSigned);
25129 Value *Scale = ConstantVector::getSplat(
25130 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
25131 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25132 << ". (HorRdx)\n");
25133 ++NumVectorInstructions;
25134 Vec = Builder.CreateMul(Vec, Scale);
25135 break;
25136 }
25137 case RecurKind::Xor: {
25138 // res = n % 2 ? 0 : vv
25139 LLVM_DEBUG(dbgs()
25140 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25141 if (Cnt % 2 == 0)
25142 Vec = Constant::getNullValue(Vec->getType());
25143 break;
25144 }
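// (Illustrative note: xor-ing a value with itself an even number of times
// yields 0, while an odd count leaves the value unchanged, which is the
// n % 2 rule applied above.)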
25145 case RecurKind::FAdd: {
25146 // res = fmul v, n
25147 Value *Scale =
25148 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
25149 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
25150 << ". (HorRdx)\n");
25151 ++NumVectorInstructions;
25152 Vec = Builder.CreateFMul(Vec, Scale);
25153 break;
25154 }
25155 case RecurKind::And:
25156 case RecurKind::Or:
25157 case RecurKind::SMax:
25158 case RecurKind::SMin:
25159 case RecurKind::UMax:
25160 case RecurKind::UMin:
25161 case RecurKind::FMax:
25162 case RecurKind::FMin:
25163 case RecurKind::FMaximum:
25164 case RecurKind::FMinimum:
25165 // res = vv
25166 break;
25167 case RecurKind::Sub:
25168 case RecurKind::AddChainWithSubs:
25169 case RecurKind::Mul:
25170 case RecurKind::FMul:
25171 case RecurKind::FMulAdd:
25172 case RecurKind::AnyOf:
25173 case RecurKind::FindFirstIVSMin:
25174 case RecurKind::FindFirstIVUMin:
25175 case RecurKind::FindLastIVSMax:
25176 case RecurKind::FindLastIVUMax:
25177 case RecurKind::FMaxNum:
25178 case RecurKind::FMinNum:
25179 case RecurKind::FMaximumNum:
25180 case RecurKind::FMinimumNum:
25181 case RecurKind::None:
25182 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25183 }
25184 }
25185 // Combine Vec with the previous VecOp.
25186 if (!VecRes) {
25187 VecRes = Vec;
25188 VecResSignedness = IsSigned;
25189 } else {
25190 ++NumVectorInstructions;
25191 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25192 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
25193 // Handle ctpop.
25194 unsigned VecResVF = getNumElements(VecRes->getType());
25195 unsigned VecVF = getNumElements(Vec->getType());
25196 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
25197 std::iota(Mask.begin(), Mask.end(), 0);
25198 // Ensure that VecRes is always at least as wide as Vec.
25199 if (VecResVF < VecVF) {
25200 std::swap(VecRes, Vec);
25201 std::swap(VecResVF, VecVF);
25202 }
25203 if (VecResVF != VecVF) {
25204 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
25205 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25206 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
25207 }
25208 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
25209 return;
25210 }
25211 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
25212 VecRes = Builder.CreateIntCast(
25213 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
25214 VecResSignedness);
25215 if (ScalarTy != DestTy->getScalarType())
25216 Vec = Builder.CreateIntCast(
25217 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25218 IsSigned);
25219 unsigned VecResVF = getNumElements(VecRes->getType());
25220 unsigned VecVF = getNumElements(Vec->getType());
25221 // Ensure that VecRes is always at least as wide as Vec.
25222 if (VecResVF < VecVF) {
25223 std::swap(VecRes, Vec);
25224 std::swap(VecResVF, VecVF);
25225 }
25226 // extract + op + insert
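// When the two registers differ in width, only the low VecVF lanes of VecRes
// take part in the operation; the result is inserted back at index 0 so the
// remaining lanes of VecRes are preserved.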
25227 Value *Op = VecRes;
25228 if (VecResVF != VecVF)
25229 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25230 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25231 if (VecResVF != VecVF)
25232 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25233 VecRes = Op;
25234 }
25235 };
25236 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25237 CreateVecOp(Vec, Scale, IsSigned);
25238 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25239
25240 return ReducedSubTree;
25241 }
25242
25243 /// Emit a horizontal reduction of the vectorized value.
25244 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25245 const TargetTransformInfo *TTI, Type *DestTy) {
25246 assert(VectorizedValue && "Need to have a vectorized tree node");
25247 assert(RdxKind != RecurKind::FMulAdd &&
25248 "A call to the llvm.fmuladd intrinsic is not handled yet");
25249
25250 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25251 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25252 RdxKind == RecurKind::Add &&
25253 DestTy->getScalarType() != FTy->getScalarType()) {
25254 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25255 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
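// Illustrative IR shape (value names assumed for the example only):
//   %int = bitcast <8 x i1> %mask to i8
//   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
// followed by a zext/trunc of %cnt to the destination scalar type.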
25256 Value *V = Builder.CreateBitCast(
25257 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25258 ++NumVectorInstructions;
25259 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25260 }
25261 ++NumVectorInstructions;
25262 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25263 }
25264
25265 /// Emits optimized code for unique scalar value reused \p Cnt times.
25266 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25267 unsigned Cnt) {
25268 assert(IsSupportedHorRdxIdentityOp &&
25269 "The optimization of matched scalar identity horizontal reductions "
25270 "must be supported.");
25271 if (Cnt == 1)
25272 return VectorizedValue;
25273 switch (RdxKind) {
25274 case RecurKind::Add: {
25275 // res = mul vv, n
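// E.g. if every scalar in the bundle is reused Cnt=4 times, each lane
// contributes 4 * value, so the whole vector is multiplied by splat(4).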
25276 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25277 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25278 << VectorizedValue << ". (HorRdx)\n");
25279 return Builder.CreateMul(VectorizedValue, Scale);
25280 }
25281 case RecurKind::Xor: {
25282 // res = n % 2 ? 0 : vv
25283 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25284 << ". (HorRdx)\n");
25285 if (Cnt % 2 == 0)
25286 return Constant::getNullValue(VectorizedValue->getType());
25287 return VectorizedValue;
25288 }
25289 case RecurKind::FAdd: {
25290 // res = fmul v, n
25291 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25292 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25293 << VectorizedValue << ". (HorRdx)\n");
25294 return Builder.CreateFMul(VectorizedValue, Scale);
25295 }
25296 case RecurKind::And:
25297 case RecurKind::Or:
25298 case RecurKind::SMax:
25299 case RecurKind::SMin:
25300 case RecurKind::UMax:
25301 case RecurKind::UMin:
25302 case RecurKind::FMax:
25303 case RecurKind::FMin:
25304 case RecurKind::FMaximum:
25305 case RecurKind::FMinimum:
25306 // res = vv
25307 return VectorizedValue;
25308 case RecurKind::Sub:
25309 case RecurKind::AddChainWithSubs:
25310 case RecurKind::Mul:
25311 case RecurKind::FMul:
25312 case RecurKind::FMulAdd:
25313 case RecurKind::AnyOf:
25314 case RecurKind::FindFirstIVSMin:
25315 case RecurKind::FindFirstIVUMin:
25316 case RecurKind::FindLastIVSMax:
25317 case RecurKind::FindLastIVUMax:
25318 case RecurKind::FMaxNum:
25319 case RecurKind::FMinNum:
25320 case RecurKind::FMaximumNum:
25321 case RecurKind::FMinimumNum:
25322 case RecurKind::None:
25323 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25324 }
25325 return nullptr;
25326 }
25327
25328 /// Emits actual operation for the scalar identity values, found during
25329 /// horizontal reduction analysis.
25330 Value *
25331 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25332 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25333 const DenseMap<Value *, Value *> &TrackedToOrig) {
25334 assert(IsSupportedHorRdxIdentityOp &&
25335 "The optimization of matched scalar identity horizontal reductions "
25336 "must be supported.");
25337 ArrayRef<Value *> VL = R.getRootNodeScalars();
25338 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25339 if (VTy->getElementType() != VL.front()->getType()) {
25340 VectorizedValue = Builder.CreateIntCast(
25341 VectorizedValue,
25342 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25343 R.isSignedMinBitwidthRootNode());
25344 }
25345 switch (RdxKind) {
25346 case RecurKind::Add: {
25347 // root = mul prev_root, <1, 1, n, 1>
25348 SmallVector<Constant *> Vals;
25349 for (Value *V : VL) {
25350 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25351 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25352 }
25353 auto *Scale = ConstantVector::get(Vals);
25354 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25355 << VectorizedValue << ". (HorRdx)\n");
25356 return Builder.CreateMul(VectorizedValue, Scale);
25357 }
25358 case RecurKind::And:
25359 case RecurKind::Or:
25360 // No need for multiple or/and(s).
25361 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25362 << ". (HorRdx)\n");
25363 return VectorizedValue;
25364 case RecurKind::SMax:
25365 case RecurKind::SMin:
25366 case RecurKind::UMax:
25367 case RecurKind::UMin:
25368 case RecurKind::FMax:
25369 case RecurKind::FMin:
25370 case RecurKind::FMaximum:
25371 case RecurKind::FMinimum:
25372 // No need for multiple min/max(s) of the same value.
25373 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25374 << ". (HorRdx)\n");
25375 return VectorizedValue;
25376 case RecurKind::Xor: {
25377 // Replace values with even number of repeats with 0, since
25378 // x xor x = 0.
25379 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25380 // 7>, if the 4th and 6th elements have an even number of repeats.
25381 SmallVector<int> Mask(
25382 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25383 PoisonMaskElem);
25384 std::iota(Mask.begin(), Mask.end(), 0);
25385 bool NeedShuffle = false;
25386 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25387 Value *V = VL[I];
25388 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25389 if (Cnt % 2 == 0) {
25390 Mask[I] = VF;
25391 NeedShuffle = true;
25392 }
25393 }
25394 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25395 : Mask) dbgs()
25396 << I << " ";
25397 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25398 if (NeedShuffle)
25399 VectorizedValue = Builder.CreateShuffleVector(
25400 VectorizedValue,
25401 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25402 return VectorizedValue;
25403 }
25404 case RecurKind::FAdd: {
25405 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25406 SmallVector<Constant *> Vals;
25407 for (Value *V : VL) {
25408 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25409 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25410 }
25411 auto *Scale = ConstantVector::get(Vals);
25412 return Builder.CreateFMul(VectorizedValue, Scale);
25413 }
25414 case RecurKind::Sub:
25415 case RecurKind::AddChainWithSubs:
25416 case RecurKind::Mul:
25417 case RecurKind::FMul:
25418 case RecurKind::FMulAdd:
25419 case RecurKind::AnyOf:
25420 case RecurKind::FindFirstIVSMin:
25421 case RecurKind::FindFirstIVUMin:
25422 case RecurKind::FindLastIVSMax:
25423 case RecurKind::FindLastIVUMax:
25424 case RecurKind::FMaxNum:
25425 case RecurKind::FMinNum:
25426 case RecurKind::FMaximumNum:
25427 case RecurKind::FMinimumNum:
25428 case RecurKind::None:
25429 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25430 }
25431 return nullptr;
25432 }
25433};
25434} // end anonymous namespace
25435
25436/// Gets recurrence kind from the specified value.
25437 static RecurKind getRdxKind(Value *V) {
25438 return HorizontalReduction::getRdxKind(V);
25439}
25440static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25441 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25442 return cast<FixedVectorType>(IE->getType())->getNumElements();
25443
25444 unsigned AggregateSize = 1;
25445 auto *IV = cast<InsertValueInst>(InsertInst);
25446 Type *CurrentType = IV->getType();
25447 do {
25448 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25449 for (auto *Elt : ST->elements())
25450 if (Elt != ST->getElementType(0)) // check homogeneity
25451 return std::nullopt;
25452 AggregateSize *= ST->getNumElements();
25453 CurrentType = ST->getElementType(0);
25454 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25455 AggregateSize *= AT->getNumElements();
25456 CurrentType = AT->getElementType();
25457 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25458 AggregateSize *= VT->getNumElements();
25459 return AggregateSize;
25460 } else if (CurrentType->isSingleValueType()) {
25461 return AggregateSize;
25462 } else {
25463 return std::nullopt;
25464 }
25465 } while (true);
25466}
25467
25468static void findBuildAggregateRec(Instruction *LastInsertInst,
25469 TargetTransformInfo *TTI,
25470 SmallVectorImpl<Value *> &BuildVectorOpds,
25471 SmallVectorImpl<Value *> &InsertElts,
25472 unsigned OperandOffset, const BoUpSLP &R) {
25473 do {
25474 Value *InsertedOperand = LastInsertInst->getOperand(1);
25475 std::optional<unsigned> OperandIndex =
25476 getElementIndex(LastInsertInst, OperandOffset);
25477 if (!OperandIndex || R.isDeleted(LastInsertInst))
25478 return;
25479 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25480 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25481 BuildVectorOpds, InsertElts, *OperandIndex, R);
25482
25483 } else {
25484 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25485 InsertElts[*OperandIndex] = LastInsertInst;
25486 }
25487 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25488 } while (LastInsertInst != nullptr &&
25489 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25490 LastInsertInst->hasOneUse());
25491}
25492
25493/// Recognize construction of vectors like
25494/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25495/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25496/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25497/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25498/// starting from the last insertelement or insertvalue instruction.
25499///
25500/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25501/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25502/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25503///
25504/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25505///
25506/// \return true if it matches.
25507static bool findBuildAggregate(Instruction *LastInsertInst,
25508 TargetTransformInfo *TTI,
25509 SmallVectorImpl<Value *> &BuildVectorOpds,
25510 SmallVectorImpl<Value *> &InsertElts,
25511 const BoUpSLP &R) {
25512
25513 assert((isa<InsertElementInst>(LastInsertInst) ||
25514 isa<InsertValueInst>(LastInsertInst)) &&
25515 "Expected insertelement or insertvalue instruction!");
25516
25517 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25518 "Expected empty result vectors!");
25519
25520 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25521 if (!AggregateSize)
25522 return false;
25523 BuildVectorOpds.resize(*AggregateSize);
25524 InsertElts.resize(*AggregateSize);
25525
25526 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25527 llvm::erase(BuildVectorOpds, nullptr);
25528 llvm::erase(InsertElts, nullptr);
25529 if (BuildVectorOpds.size() >= 2)
25530 return true;
25531
25532 return false;
25533}
25534
25535/// Try and get a reduction instruction from a phi node.
25536///
25537/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25538/// if they come from either \p ParentBB or a containing loop latch.
25539///
25540/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25541/// if not possible.
25542 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25543 BasicBlock *ParentBB, LoopInfo *LI) {
25544 // There are situations where the reduction value is not dominated by the
25545 // reduction phi. Vectorizing such cases has been reported to cause
25546 // miscompiles. See PR25787.
25547 auto DominatedReduxValue = [&](Value *R) {
25548 return isa<Instruction>(R) &&
25549 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25550 };
25551
25552 Instruction *Rdx = nullptr;
25553
25554 // Return the incoming value if it comes from the same BB as the phi node.
25555 if (P->getIncomingBlock(0) == ParentBB) {
25556 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25557 } else if (P->getIncomingBlock(1) == ParentBB) {
25558 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25559 }
25560
25561 if (Rdx && DominatedReduxValue(Rdx))
25562 return Rdx;
25563
25564 // Otherwise, check whether we have a loop latch to look at.
25565 Loop *BBL = LI->getLoopFor(ParentBB);
25566 if (!BBL)
25567 return nullptr;
25568 BasicBlock *BBLatch = BBL->getLoopLatch();
25569 if (!BBLatch)
25570 return nullptr;
25571
25572 // There is a loop latch, return the incoming value if it comes from
25573 // that. This reduction pattern occasionally turns up.
25574 if (P->getIncomingBlock(0) == BBLatch) {
25575 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25576 } else if (P->getIncomingBlock(1) == BBLatch) {
25577 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25578 }
25579
25580 if (Rdx && DominatedReduxValue(Rdx))
25581 return Rdx;
25582
25583 return nullptr;
25584}
25585
25586static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25587 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25588 return true;
25589 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25590 return true;
25591 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25592 return true;
25593 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25594 return true;
25595 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25596 return true;
25597 if (match(I, m_SMax(m_Value(V0), m_Value(V1))))
25598 return true;
25599 if (match(I, m_SMin(m_Value(V0), m_Value(V1))))
25600 return true;
25601 if (match(I, m_UMax(m_Value(V0), m_Value(V1))))
25602 return true;
25603 if (match(I, m_UMin(m_Value(V0), m_Value(V1))))
25604 return true;
25605 return false;
25606}
25607
25608/// We could have an initial reduction that is not an add.
25609/// r *= v1 + v2 + v3 + v4
25610/// In such a case start looking for a tree rooted in the first '+'.
25611 /// \returns the new root if found, which may be nullptr if not an instruction.
25612 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25613 Instruction *Root) {
25614 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25615 isa<IntrinsicInst>(Root)) &&
25616 "Expected binop, select, or intrinsic for reduction matching");
25617 Value *LHS =
25618 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25619 Value *RHS =
25620 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25621 if (LHS == Phi)
25622 return dyn_cast<Instruction>(RHS);
25623 if (RHS == Phi)
25624 return dyn_cast<Instruction>(LHS);
25625 return nullptr;
25626}
25627
25628 /// \returns the first operand of \p I that does not match \p Phi. If the
25629/// operand is not an instruction it returns nullptr.
25630 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25631 Value *Op0 = nullptr;
25632 Value *Op1 = nullptr;
25633 if (!matchRdxBop(I, Op0, Op1))
25634 return nullptr;
25635 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25636}
25637
25638 /// \returns true if \p I is a candidate instruction for reduction vectorization.
25639 static bool isReductionCandidate(Instruction *I) {
25640 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25641 Value *B0 = nullptr, *B1 = nullptr;
25642 bool IsBinop = matchRdxBop(I, B0, B1);
25643 return IsBinop || IsSelect;
25644}
25645
25646bool SLPVectorizerPass::vectorizeHorReduction(
25647 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25648 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25649 if (!ShouldVectorizeHor)
25650 return false;
25651 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25652
25653 if (Root->getParent() != BB || isa<PHINode>(Root))
25654 return false;
25655
25656 // If we can find a secondary reduction root, use that instead.
25657 auto SelectRoot = [&]() {
25658 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25659 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25660 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25661 return NewRoot;
25662 return Root;
25663 };
25664
25665 // Start the analysis from the Root instruction. If a horizontal reduction is
25666 // found, try to vectorize it. If it is not a horizontal reduction, or
25667 // vectorization is not possible or not effective, and the currently analyzed
25668 // instruction is a binary operation, try to vectorize the operands using
25669 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25670 // the same procedure considering each operand as a possible root of the
25671 // horizontal reduction.
25672 // Interrupt the process if the Root instruction itself was vectorized or all
25673 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
25674 // If a horizontal reduction was not matched or vectorized, we collect
25675 // instructions for possible later vectorization attempts.
25676 std::queue<std::pair<Instruction *, unsigned>> Stack;
25677 Stack.emplace(SelectRoot(), 0);
25678 SmallPtrSet<Value *, 8> VisitedInstrs;
25679 bool Res = false;
25680 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25681 if (R.isAnalyzedReductionRoot(Inst))
25682 return nullptr;
25683 if (!isReductionCandidate(Inst))
25684 return nullptr;
25685 HorizontalReduction HorRdx;
25686 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25687 return nullptr;
25688 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25689 };
25690 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25691 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25692 FutureSeed = getNonPhiOperand(Root, P);
25693 if (!FutureSeed)
25694 return false;
25695 }
25696 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25697 // analysis is done separately.
25698 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25699 PostponedInsts.push_back(FutureSeed);
25700 return true;
25701 };
25702
25703 while (!Stack.empty()) {
25704 Instruction *Inst;
25705 unsigned Level;
25706 std::tie(Inst, Level) = Stack.front();
25707 Stack.pop();
25708 // Do not try to analyze an instruction that has already been vectorized.
25709 // This may happen when we vectorize instruction operands on a previous
25710 // iteration while the stack was populated before that happened.
25711 if (R.isDeleted(Inst))
25712 continue;
25713 if (Value *VectorizedV = TryToReduce(Inst)) {
25714 Res = true;
25715 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25716 // Try to find another reduction.
25717 Stack.emplace(I, Level);
25718 continue;
25719 }
25720 if (R.isDeleted(Inst))
25721 continue;
25722 } else {
25723 // We could not vectorize `Inst` so try to use it as a future seed.
25724 if (!TryAppendToPostponedInsts(Inst)) {
25725 assert(Stack.empty() && "Expected empty stack");
25726 break;
25727 }
25728 }
25729
25730 // Try to vectorize operands.
25731 // Continue analysis for the instruction from the same basic block only to
25732 // save compile time.
25733 if (++Level < RecursionMaxDepth)
25734 for (auto *Op : Inst->operand_values())
25735 if (VisitedInstrs.insert(Op).second)
25736 if (auto *I = dyn_cast<Instruction>(Op))
25737 // Do not try to vectorize CmpInst operands, this is done
25738 // separately.
25739 if (!isa<PHINode, CmpInst>(I) &&
25740 !R.isDeleted(I) && I->getParent() == BB)
25741 Stack.emplace(I, Level);
25742 }
25743 return Res;
25744}
25745
25746bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25747 if (!I)
25748 return false;
25749
25750 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25751 return false;
25752 // Skip potential FMA candidates.
25753 if ((I->getOpcode() == Instruction::FAdd ||
25754 I->getOpcode() == Instruction::FSub) &&
25755 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25756 .isValid())
25757 return false;
25758
25759 Value *P = I->getParent();
25760
25761 // Vectorize in current basic block only.
25762 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25763 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25764 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25765 R.isDeleted(Op0) || R.isDeleted(Op1))
25766 return false;
25767
25768 // First collect all possible candidates
25769 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25770 Candidates.emplace_back(Op0, Op1);
25771
25772 auto *A = dyn_cast<BinaryOperator>(Op0);
25773 auto *B = dyn_cast<BinaryOperator>(Op1);
25774 // Try to skip B.
25775 if (A && B && B->hasOneUse()) {
25776 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25777 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25778 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25779 Candidates.emplace_back(A, B0);
25780 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25781 Candidates.emplace_back(A, B1);
25782 }
25783 // Try to skip A.
25784 if (B && A && A->hasOneUse()) {
25785 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25786 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25787 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25788 Candidates.emplace_back(A0, B);
25789 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25790 Candidates.emplace_back(A1, B);
25791 }
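// E.g. for I = (a0 * a1) + (b0 * b1), besides the pair {a0*a1, b0*b1} we may
// also try pairs that skip one side, such as {a0*a1, b0}, provided b0 is
// itself a binary operator in the same block and b0*b1 has a single use.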
25792
25793 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25794 ArrayRef<Value *> Ops) {
25795 if (!isReductionCandidate(Inst))
25796 return false;
25797 Type *Ty = Inst->getType();
25798 if (!isValidElementType(Ty) || Ty->isPointerTy())
25799 return false;
25800 HorizontalReduction HorRdx(Inst, Ops);
25801 if (!HorRdx.matchReductionForOperands())
25802 return false;
25803 // Check the cost of operations.
25804 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25805 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25806 InstructionCost ScalarCost =
25807 TTI.getScalarizationOverhead(
25808 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25809 /*Extract=*/true, CostKind) +
25810 TTI.getInstructionCost(Inst, CostKind);
25811 InstructionCost RedCost;
25812 switch (::getRdxKind(Inst)) {
25813 case RecurKind::Add:
25814 case RecurKind::Mul:
25815 case RecurKind::Or:
25816 case RecurKind::And:
25817 case RecurKind::Xor:
25818 case RecurKind::FAdd:
25819 case RecurKind::FMul: {
25820 FastMathFlags FMF;
25821 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25822 FMF = FPCI->getFastMathFlags();
25823 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25824 CostKind);
25825 break;
25826 }
25827 default:
25828 return false;
25829 }
25830 if (RedCost >= ScalarCost)
25831 return false;
25832
25833 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25834 };
25835 if (Candidates.size() == 1)
25836 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25837
25838 // We have multiple options. Try to pick the single best.
25839 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25840 if (!BestCandidate)
25841 return false;
25842 return (*BestCandidate == 0 &&
25843 TryToReduce(I, {Candidates[*BestCandidate].first,
25844 Candidates[*BestCandidate].second})) ||
25845 tryToVectorizeList({Candidates[*BestCandidate].first,
25846 Candidates[*BestCandidate].second},
25847 R);
25848}
25849
25850bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25851 BasicBlock *BB, BoUpSLP &R) {
25852 SmallVector<WeakTrackingVH> PostponedInsts;
25853 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25854 Res |= tryToVectorize(PostponedInsts, R);
25855 return Res;
25856}
25857
25858bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25859 BoUpSLP &R) {
25860 bool Res = false;
25861 for (Value *V : Insts)
25862 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25863 Res |= tryToVectorize(Inst, R);
25864 return Res;
25865}
25866
25867bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25868 BasicBlock *BB, BoUpSLP &R,
25869 bool MaxVFOnly) {
25870 if (!R.canMapToVector(IVI->getType()))
25871 return false;
25872
25873 SmallVector<Value *, 16> BuildVectorOpds;
25874 SmallVector<Value *, 16> BuildVectorInsts;
25875 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25876 return false;
25877
25878 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25879 R.getORE()->emit([&]() {
25880 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25881 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25882 "trying reduction first.";
25883 });
25884 return false;
25885 }
25886 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25887 // Aggregate value is unlikely to be processed in a vector register.
25888 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25889}
25890
25891bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25892 BasicBlock *BB, BoUpSLP &R,
25893 bool MaxVFOnly) {
25894 SmallVector<Value *, 16> BuildVectorInsts;
25895 SmallVector<Value *, 16> BuildVectorOpds;
25896 SmallVector<int> Mask;
25897 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25898 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25899 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25900 return false;
25901
25902 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25903 R.getORE()->emit([&]() {
25904 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25905 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25906 "trying reduction first.";
25907 });
25908 return false;
25909 }
25910 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25911 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25912}
25913
25914template <typename T>
25915 static bool tryToVectorizeSequence(
25916 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25917 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25918 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25919 bool MaxVFOnly, BoUpSLP &R) {
25920 bool Changed = false;
25921 // Sort by type, parent, operands.
25922 stable_sort(Incoming, Comparator);
25923
25924 // Try to vectorize elements based on their type.
25925 SmallVector<T *> Candidates;
25926 SmallVector<T *> VL;
25927 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25928 VL.clear()) {
25929 // Look for the next elements with the same type, parent and operand
25930 // kinds.
25931 auto *I = dyn_cast<Instruction>(*IncIt);
25932 if (!I || R.isDeleted(I)) {
25933 ++IncIt;
25934 continue;
25935 }
25936 auto *SameTypeIt = IncIt;
25937 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25938 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25939 AreCompatible(VL, *SameTypeIt))) {
25940 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25941 ++SameTypeIt;
25942 if (I && !R.isDeleted(I))
25943 VL.push_back(cast<T>(I));
25944 }
25945
25946 // Try to vectorize them.
25947 unsigned NumElts = VL.size();
25948 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25949 << NumElts << ")\n");
25950 // The vectorization is a 3-state attempt:
25951 // 1. Try to vectorize instructions with the same/alternate opcodes, using the
25952 // maximal register size first.
25953 // 2. Try to vectorize the remaining instructions with the same type, if
25954 // possible. This may give better vectorization results than trying to
25955 // vectorize only instructions with the same/alternate opcodes.
25956 // 3. Make a final attempt to vectorize all instructions with the
25957 // same/alternate ops only; this may result in some extra final
25958 // vectorization.
25959 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25960 // Success; start over because instructions might have been changed.
25961 Changed = true;
25962 VL.swap(Candidates);
25963 Candidates.clear();
25964 for (T *V : VL) {
25965 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25966 Candidates.push_back(V);
25967 }
25968 } else {
25969 /// \Returns the minimum number of elements that we will attempt to
25970 /// vectorize.
25971 auto GetMinNumElements = [&R](Value *V) {
25972 unsigned EltSize = R.getVectorElementSize(V);
25973 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25974 };
25975 if (NumElts < GetMinNumElements(*IncIt) &&
25976 (Candidates.empty() ||
25977 Candidates.front()->getType() == (*IncIt)->getType())) {
25978 for (T *V : VL) {
25979 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25980 Candidates.push_back(V);
25981 }
25982 }
25983 }
25984 // Final attempt to vectorize instructions with the same types.
25985 if (Candidates.size() > 1 &&
25986 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25987 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25988 // Success; start over because instructions might have been changed.
25989 Changed = true;
25990 } else if (MaxVFOnly) {
25991 // Try to vectorize using small vectors.
25992 SmallVector<T *> VL;
25993 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25994 VL.clear()) {
25995 auto *I = dyn_cast<Instruction>(*It);
25996 if (!I || R.isDeleted(I)) {
25997 ++It;
25998 continue;
25999 }
26000 auto *SameTypeIt = It;
26001 while (SameTypeIt != End &&
26002 (!isa<Instruction>(*SameTypeIt) ||
26003 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26004 AreCompatible(*SameTypeIt, *It))) {
26005 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26006 ++SameTypeIt;
26007 if (I && !R.isDeleted(I))
26008 VL.push_back(cast<T>(I));
26009 }
26010 unsigned NumElts = VL.size();
26011 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26012 /*MaxVFOnly=*/false))
26013 Changed = true;
26014 It = SameTypeIt;
26015 }
26016 }
26017 Candidates.clear();
26018 }
26019
26020 // Start over at the next instruction of a different type (or the end).
26021 IncIt = SameTypeIt;
26022 }
26023 return Changed;
26024}
26025
26026 /// Compare two cmp instructions. If IsCompatibility is true, the function
26027 /// returns true if the 2 cmps have the same/swapped predicates and compatible
26028 /// corresponding operands. If IsCompatibility is false, the function implements
26029 /// a strict weak ordering relation between two cmp instructions, returning true
26030 /// if the first instruction is "less" than the second, i.e. its predicate is
26031 /// less than the predicate of the second, or the operand IDs are less than the
26032 /// operand IDs of the second cmp instruction.
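/// For instance, "icmp sgt %a, %b" and "icmp slt %b, %a" share the same base
/// predicate, since the swapped predicate of sgt is slt; their operands are
/// then compared in the corresponding (swapped) order.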
26033template <bool IsCompatibility>
26034static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
26035 const DominatorTree &DT) {
26036 assert(isValidElementType(V->getType()) &&
26037 isValidElementType(V2->getType()) &&
26038 "Expected valid element types only.");
26039 if (V == V2)
26040 return IsCompatibility;
26041 auto *CI1 = cast<CmpInst>(V);
26042 auto *CI2 = cast<CmpInst>(V2);
26043 if (CI1->getOperand(0)->getType()->getTypeID() <
26044 CI2->getOperand(0)->getType()->getTypeID())
26045 return !IsCompatibility;
26046 if (CI1->getOperand(0)->getType()->getTypeID() >
26047 CI2->getOperand(0)->getType()->getTypeID())
26048 return false;
26049 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26050 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26051 return !IsCompatibility;
26052 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26053 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26054 return false;
26055 CmpInst::Predicate Pred1 = CI1->getPredicate();
26056 CmpInst::Predicate Pred2 = CI2->getPredicate();
26057 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
26058 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
26059 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
26060 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
26061 if (BasePred1 < BasePred2)
26062 return !IsCompatibility;
26063 if (BasePred1 > BasePred2)
26064 return false;
26065 // Compare operands.
26066 bool CI1Preds = Pred1 == BasePred1;
26067 bool CI2Preds = Pred2 == BasePred1;
26068 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26069 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26070 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
26071 if (Op1 == Op2)
26072 continue;
26073 if (Op1->getValueID() < Op2->getValueID())
26074 return !IsCompatibility;
26075 if (Op1->getValueID() > Op2->getValueID())
26076 return false;
26077 if (auto *I1 = dyn_cast<Instruction>(Op1))
26078 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
26079 if (IsCompatibility) {
26080 if (I1->getParent() != I2->getParent())
26081 return false;
26082 } else {
26083 // Try to compare nodes with same parent.
26084 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
26085 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
26086 if (!NodeI1)
26087 return NodeI2 != nullptr;
26088 if (!NodeI2)
26089 return false;
26090 assert((NodeI1 == NodeI2) ==
26091 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26092 "Different nodes should have different DFS numbers");
26093 if (NodeI1 != NodeI2)
26094 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26095 }
26096 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26097 if (S && (IsCompatibility || !S.isAltShuffle()))
26098 continue;
26099 if (IsCompatibility)
26100 return false;
26101 if (I1->getOpcode() != I2->getOpcode())
26102 return I1->getOpcode() < I2->getOpcode();
26103 }
26104 }
26105 return IsCompatibility;
26106}
26107
26108template <typename ItT>
26109bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26110 BasicBlock *BB, BoUpSLP &R) {
26111 bool Changed = false;
26112 // Try to find reductions first.
26113 for (CmpInst *I : CmpInsts) {
26114 if (R.isDeleted(I))
26115 continue;
26116 for (Value *Op : I->operands())
26117 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26118 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26119 if (R.isDeleted(I))
26120 break;
26121 }
26122 }
26123 // Try to vectorize operands as vector bundles.
26124 for (CmpInst *I : CmpInsts) {
26125 if (R.isDeleted(I))
26126 continue;
26127 Changed |= tryToVectorize(I, R);
26128 }
26129 // Try to vectorize list of compares.
26130 // Sort by type, compare predicate, etc.
26131 auto CompareSorter = [&](Value *V, Value *V2) {
26132 if (V == V2)
26133 return false;
26134 return compareCmp<false>(V, V2, *TLI, *DT);
26135 };
26136
26137 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
26138 if (VL.empty() || VL.back() == V1)
26139 return true;
26140 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
26141 };
26142
26143 SmallVector<Value *> Vals;
26144 for (Instruction *V : CmpInsts)
26145 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
26146 Vals.push_back(V);
26147 if (Vals.size() <= 1)
26148 return Changed;
26149 Changed |= tryToVectorizeSequence<Value>(
26150 Vals, CompareSorter, AreCompatibleCompares,
26151 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26152 // Exclude possible reductions from other blocks.
26153 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26154 return any_of(V->users(), [V](User *U) {
26155 auto *Select = dyn_cast<SelectInst>(U);
26156 return Select &&
26157 Select->getParent() != cast<Instruction>(V)->getParent();
26158 });
26159 });
26160 if (ArePossiblyReducedInOtherBlock)
26161 return false;
26162 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26163 },
26164 /*MaxVFOnly=*/true, R);
26165 return Changed;
26166}
26167
26168bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
26169 BasicBlock *BB, BoUpSLP &R) {
26171 "This function only accepts Insert instructions");
26172 bool OpsChanged = false;
26173 SmallVector<WeakTrackingVH> PostponedInsts;
26174 for (auto *I : reverse(Instructions)) {
26175 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
26176 if (R.isDeleted(I) || isa<CmpInst>(I))
26177 continue;
26178 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26179 OpsChanged |=
26180 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
26181 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26182 OpsChanged |=
26183 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
26184 }
26185 // pass2 - try to vectorize reductions only
26186 if (R.isDeleted(I))
26187 continue;
26188 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26189 if (R.isDeleted(I) || isa<CmpInst>(I))
26190 continue;
26191 // pass3 - try to match and vectorize a buildvector sequence.
26192 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26193 OpsChanged |=
26194 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
26195 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26196 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26197 /*MaxVFOnly=*/false);
26198 }
26199 }
26200 // Now try to vectorize postponed instructions.
26201 OpsChanged |= tryToVectorize(PostponedInsts, R);
26202
26203 Instructions.clear();
26204 return OpsChanged;
26205}
26206
26207bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26208 bool Changed = false;
26209 SmallVector<Value *, 4> Incoming;
26210 SmallPtrSet<Value *, 16> VisitedInstrs;
26211 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
26212 // node. This helps to better identify the chains that can be vectorized
26213 // effectively.
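// For instance, two PHIs whose incoming chains bottom out in loads from the
// same basic block sort next to each other and are more likely to end up in a
// single vectorizable bundle.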
26214 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26215 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
26216 assert(isValidElementType(V1->getType()) &&
26217 isValidElementType(V2->getType()) &&
26218 "Expected vectorizable types only.");
26219 if (V1 == V2)
26220 return false;
26221 // It is fine to compare type IDs here, since we expect only vectorizable
26222 // types, like ints, floats and pointers; we don't care about other types.
26223 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
26224 return true;
26225 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
26226 return false;
26227 if (V1->getType()->getScalarSizeInBits() <
26228 V2->getType()->getScalarSizeInBits())
26229 return true;
26230 if (V1->getType()->getScalarSizeInBits() >
26231 V2->getType()->getScalarSizeInBits())
26232 return false;
26233 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26234 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26235 if (Opcodes1.size() < Opcodes2.size())
26236 return true;
26237 if (Opcodes1.size() > Opcodes2.size())
26238 return false;
26239 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26240 {
26241 // Instructions come first.
26242 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26243 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26244 if (I1 && I2) {
26245 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26246 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26247 if (!NodeI1)
26248 return NodeI2 != nullptr;
26249 if (!NodeI2)
26250 return false;
26251 assert((NodeI1 == NodeI2) ==
26252 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26253 "Different nodes should have different DFS numbers");
26254 if (NodeI1 != NodeI2)
26255 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26256 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26257 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26258 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26259 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26260 if (!E1 || !E2)
26261 continue;
26262
26263 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26264 // program order of the vector operands.
26265 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26266 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26267 if (V1 != V2) {
26268 if (V1 && !V2)
26269 return true;
26270 if (!V1 && V2)
26271 return false;
26272 DomTreeNodeBase<BasicBlock> *NodeI1 =
26273 DT->getNode(V1->getParent());
26274 DomTreeNodeBase<BasicBlock> *NodeI2 =
26275 DT->getNode(V2->getParent());
26276 if (!NodeI1)
26277 return NodeI2 != nullptr;
26278 if (!NodeI2)
26279 return false;
26280 assert((NodeI1 == NodeI2) ==
26281 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26282 "Different nodes should have different DFS numbers");
26283 if (NodeI1 != NodeI2)
26284 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26285 return V1->comesBefore(V2);
26286 }
26287 // If we have the same vector operand, try to sort by constant
26288 // index.
26289 std::optional<unsigned> Id1 = getExtractIndex(E1);
26290 std::optional<unsigned> Id2 = getExtractIndex(E2);
26291 // Bring constants to the top
26292 if (Id1 && !Id2)
26293 return true;
26294 if (!Id1 && Id2)
26295 return false;
26296 // First elements come first.
26297 if (Id1 && Id2)
26298 return *Id1 < *Id2;
26299
26300 continue;
26301 }
26302 if (I1->getOpcode() == I2->getOpcode())
26303 continue;
26304 return I1->getOpcode() < I2->getOpcode();
26305 }
26306 if (I1)
26307 return true;
26308 if (I2)
26309 return false;
26310 }
26311 {
26312 // Non-undef constants come next.
26313 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26314 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26315 if (C1 && C2)
26316 continue;
26317 if (C1)
26318 return true;
26319 if (C2)
26320 return false;
26321 }
26322 bool U1 = isa<UndefValue>(Opcodes1[I]);
26323 bool U2 = isa<UndefValue>(Opcodes2[I]);
26324 {
26325 // Non-constant non-instructions come next.
26326 if (!U1 && !U2) {
26327 auto ValID1 = Opcodes1[I]->getValueID();
26328 auto ValID2 = Opcodes2[I]->getValueID();
26329 if (ValID1 == ValID2)
26330 continue;
26331 if (ValID1 < ValID2)
26332 return true;
26333 if (ValID1 > ValID2)
26334 return false;
26335 }
26336 if (!U1)
26337 return true;
26338 if (!U2)
26339 return false;
26340 }
26341 // Undefs come last.
26342 assert(U1 && U2 && "The only thing left should be undef & undef.");
26343 }
26344 return false;
26345 };
26346 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26347 Value *V1) {
26348 if (VL.empty() || V1 == VL.back())
26349 return true;
26350 Value *V2 = VL.back();
26351 if (V1->getType() != V2->getType())
26352 return false;
26353 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26354 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26355 if (Opcodes1.size() != Opcodes2.size())
26356 return false;
26357 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26358 // Undefs are compatible with any other value.
26359 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26360 continue;
26361 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26362 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26363 if (R.isDeleted(I1) || R.isDeleted(I2))
26364 return false;
26365 if (I1->getParent() != I2->getParent())
26366 return false;
26367 if (getSameOpcode({I1, I2}, *TLI))
26368 continue;
26369 return false;
26370 }
26371 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26372 continue;
26373 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26374 return false;
26375 }
26376 return true;
26377 };
26378
26379 bool HaveVectorizedPhiNodes = false;
26380 do {
26381 // Collect the incoming values from the PHIs.
26382 Incoming.clear();
26383 for (Instruction &I : *BB) {
26384 auto *P = dyn_cast<PHINode>(&I);
26385 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26386 break;
26387
26388 // No need to analyze deleted, vectorized and non-vectorizable
26389 // instructions.
26390 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26391 isValidElementType(P->getType()))
26392 Incoming.push_back(P);
26393 }
26394
26395 if (Incoming.size() <= 1)
26396 break;
26397
26398 // Find the corresponding non-phi nodes for better matching when trying to
26399 // build the tree.
26400 for (Value *V : Incoming) {
26401 SmallVectorImpl<Value *> &Opcodes =
26402 PHIToOpcodes.try_emplace(V).first->getSecond();
26403 if (!Opcodes.empty())
26404 continue;
26405 SmallVector<Value *, 4> Nodes(1, V);
26406 SmallPtrSet<Value *, 4> Visited;
26407 while (!Nodes.empty()) {
26408 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26409 if (!Visited.insert(PHI).second)
26410 continue;
26411 for (Value *V : PHI->incoming_values()) {
26412 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26413 Nodes.push_back(PHI1);
26414 continue;
26415 }
26416 Opcodes.emplace_back(V);
26417 }
26418 }
26419 }
26420
26421 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26422 Incoming, PHICompare, AreCompatiblePHIs,
26423 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26424 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26425 },
26426 /*MaxVFOnly=*/true, R);
26427 Changed |= HaveVectorizedPhiNodes;
26428 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26429 auto *PHI = dyn_cast<PHINode>(P.first);
26430 return !PHI || R.isDeleted(PHI);
26431 }))
26432 PHIToOpcodes.clear();
26433 VisitedInstrs.insert_range(Incoming);
26434 } while (HaveVectorizedPhiNodes);
26435
26436 VisitedInstrs.clear();
26437
26438 InstSetVector PostProcessInserts;
26439 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26440 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26441 // also vectorizes `PostProcessCmps`.
26442 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26443 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26444 if (VectorizeCmps) {
26445 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26446 PostProcessCmps.clear();
26447 }
26448 PostProcessInserts.clear();
26449 return Changed;
26450 };
26451 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26452 auto IsInPostProcessInstrs = [&](Instruction *I) {
26453 if (auto *Cmp = dyn_cast<CmpInst>(I))
26454 return PostProcessCmps.contains(Cmp);
26455 return isa<InsertElementInst, InsertValueInst>(I) &&
26456 PostProcessInserts.contains(I);
26457 };
26458 // Returns true if `I` has no users and is either void-typed (e.g. a
26459 // terminator or a store) or a CallInst/InvokeInst whose return value is
26460 // ignored.
26461 auto HasNoUsers = [](Instruction *I) {
26462 return I->use_empty() &&
26463 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26464 };
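// A store, a terminator, or a call to a void function all satisfy HasNoUsers
// and may therefore seed the horizontal-reduction search below.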
26465 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26466 // Skip instructions with scalable types. The number of elements is unknown
26467 // at compile time for scalable types.
26468 if (isa<ScalableVectorType>(It->getType()))
26469 continue;
26470
26471 // Skip instructions marked for deletion.
26472 if (R.isDeleted(&*It))
26473 continue;
26474 // We may go through BB multiple times, so skip the ones we have already checked.
26475 if (!VisitedInstrs.insert(&*It).second) {
26476 if (HasNoUsers(&*It) &&
26477 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26478 // We would like to start over since some instructions are deleted
26479 // and the iterator may become invalid.
26480 Changed = true;
26481 It = BB->begin();
26482 E = BB->end();
26483 }
26484 continue;
26485 }
26486
26487 // Try to vectorize reductions that use PHINodes.
26488 if (PHINode *P = dyn_cast<PHINode>(It)) {
26489 // Check that the PHI is a reduction PHI.
26490 if (P->getNumIncomingValues() == 2) {
26491 // Try to match and vectorize a horizontal reduction.
26492 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26493 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26494 Changed = true;
26495 It = BB->begin();
26496 E = BB->end();
26497 continue;
26498 }
26499 }
26500 // Try to vectorize the incoming values of the PHI, to catch reductions
26501 // that feed into PHIs.
26502 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26503 // Skip if the incoming block is the current BB for now. Also, bypass
26504 // unreachable IR for efficiency and to avoid crashing.
26505 // TODO: Collect the skipped incoming values and try to vectorize them
26506 // after processing BB.
26507 if (BB == P->getIncomingBlock(I) ||
26508 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26509 continue;
26510
26511 // Postponed instructions should not be vectorized here, delay their
26512 // vectorization.
26513 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26514 PI && !IsInPostProcessInstrs(PI)) {
26515 bool Res =
26516 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26517 Changed |= Res;
26518 if (Res && R.isDeleted(P)) {
26519 It = BB->begin();
26520 E = BB->end();
26521 break;
26522 }
26523 }
26524 }
26525 continue;
26526 }
26527
26528 if (HasNoUsers(&*It)) {
26529 bool OpsChanged = false;
26530 auto *SI = dyn_cast<StoreInst>(It);
26531 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26532 if (SI) {
26533 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26534 // Try to vectorize chain in store, if this is the only store to the
26535 // address in the block.
26536 // TODO: This is just a temporary solution to save compile time. Need
26537 // to investigate if we can safely turn on slp-vectorize-hor-store
26538 // instead to allow lookup for reduction chains in all non-vectorized
26539 // stores (need to check side effects and compile time).
26540 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26541 SI->getValueOperand()->hasOneUse();
26542 }
26543 if (TryToVectorizeRoot) {
26544 for (auto *V : It->operand_values()) {
26545 // Postponed instructions should not be vectorized here, delay their
26546 // vectorization.
26547 if (auto *VI = dyn_cast<Instruction>(V);
26548 VI && !IsInPostProcessInstrs(VI))
26549 // Try to match and vectorize a horizontal reduction.
26550 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26551 }
26552 }
26553 // Start vectorization of post-process list of instructions from the
26554 // top-tree instructions to try to vectorize as many instructions as
26555 // possible.
26556 OpsChanged |=
26557 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26558 if (OpsChanged) {
26559 // We would like to start over since some instructions are deleted
26560 // and the iterator may become invalid.
26561 Changed = true;
26562 It = BB->begin();
26563 E = BB->end();
26564 continue;
26565 }
26566 }
26567
26568 if (isa<InsertElementInst, InsertValueInst>(It))
26569 PostProcessInserts.insert(&*It);
26570 else if (isa<CmpInst>(It))
26571 PostProcessCmps.insert(cast<CmpInst>(&*It));
26572 }
26573
26574 return Changed;
26575}
26576
26577bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26578 auto Changed = false;
26579 for (auto &Entry : GEPs) {
26580 // If the getelementptr list has fewer than two elements, there's nothing
26581 // to do.
26582 if (Entry.second.size() < 2)
26583 continue;
26584
26585 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26586 << Entry.second.size() << ".\n");
26587
26588 // Process the GEP list in chunks suitable for the target's supported
26589 // vector size. If a vector register can't hold 1 element, we are done. We
26590 // are trying to vectorize the index computations, so the maximum number of
26591 // elements is based on the size of the index expression, rather than the
26592 // size of the GEP itself (the target's pointer size).
26593 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26594 return !R.isDeleted(GEP);
26595 });
26596 if (It == Entry.second.end())
26597 continue;
26598 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26599 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26600 if (MaxVecRegSize < EltSize)
26601 continue;
26602
26603 unsigned MaxElts = MaxVecRegSize / EltSize;
26604 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26605 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26606 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26607
26608 // Initialize a set of candidate getelementptrs. Note that we use a
26609 // SetVector here to preserve program order. If the index computations
26610 // are vectorizable and begin with loads, we want to minimize the chance
26611 // of having to reorder them later.
26612 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26613
26614 // Some of the candidates may have already been vectorized after we
26615 // initially collected them, or their index was optimized to a constant value.
26616 // If so, they are marked as deleted, so remove them from the set of
26617 // candidates.
26618 Candidates.remove_if([&R](Value *I) {
26619 return R.isDeleted(cast<Instruction>(I)) ||
26620 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26621 });
26622
26623 // Remove from the set of candidates all pairs of getelementptrs with
26624 // constant differences. Such getelementptrs are likely not good
26625 // candidates for vectorization in a bottom-up phase since one can be
26626 // computed from the other. We also ensure all candidate getelementptr
26627 // indices are unique.
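// E.g. p[i] and p[i + 4] differ by a constant SCEV, so both are dropped from
// the candidate set: one address is trivially derivable from the other, and
// vectorizing their index computations together is unlikely to pay off.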
26628 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26629 auto *GEPI = GEPList[I];
26630 if (!Candidates.count(GEPI))
26631 continue;
26632 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26633 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26634 auto *GEPJ = GEPList[J];
26635 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26636 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26637 Candidates.remove(GEPI);
26638 Candidates.remove(GEPJ);
26639 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26640 Candidates.remove(GEPJ);
26641 }
26642 }
26643 }
26644
26645 // We break out of the above computation as soon as we know there are
26646 // fewer than two candidates remaining.
26647 if (Candidates.size() < 2)
26648 continue;
26649
26650 // Add the single, non-constant index of each candidate to the bundle. We
26651 // ensured the indices met these constraints when we originally collected
26652 // the getelementptrs.
26653 SmallVector<Value *, 16> Bundle(Candidates.size());
26654 auto BundleIndex = 0u;
26655 for (auto *V : Candidates) {
26656 auto *GEP = cast<GetElementPtrInst>(V);
26657 auto *GEPIdx = GEP->idx_begin()->get();
26658 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26659 Bundle[BundleIndex++] = GEPIdx;
26660 }
26661
26662 // Try and vectorize the indices. We are currently only interested in
26663 // gather-like cases of the form:
26664 //
26665 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26666 //
26667 // where the loads of "a", the loads of "b", and the subtractions can be
26668 // performed in parallel. It's likely that detecting this pattern in a
26669 // bottom-up phase will be simpler and less costly than building a
26670 // full-blown top-down phase beginning at the consecutive loads.
26671 Changed |= tryToVectorizeList(Bundle, R);
26672 }
26673 }
26674 return Changed;
26675}
26676
26677bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26678 bool Changed = false;
26679 // Sort by type, base pointers and value operands. Value operands must be
26680 // compatible (have the same opcode, same parent), otherwise it is
26681 // definitely not profitable to try to vectorize them.
26682 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26683 if (V->getValueOperand()->getType()->getTypeID() <
26684 V2->getValueOperand()->getType()->getTypeID())
26685 return true;
26686 if (V->getValueOperand()->getType()->getTypeID() >
26687 V2->getValueOperand()->getType()->getTypeID())
26688 return false;
26689 if (V->getPointerOperandType()->getTypeID() <
26690 V2->getPointerOperandType()->getTypeID())
26691 return true;
26692 if (V->getPointerOperandType()->getTypeID() >
26693 V2->getPointerOperandType()->getTypeID())
26694 return false;
26695 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26696 V2->getValueOperand()->getType()->getScalarSizeInBits())
26697 return true;
26698 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26699 V2->getValueOperand()->getType()->getScalarSizeInBits())
26700 return false;
26701 // UndefValues are compatible with all other values.
26702 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26703 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26704 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26705 DT->getNode(I1->getParent());
26706 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26707 DT->getNode(I2->getParent());
26708 assert(NodeI1 && "Should only process reachable instructions");
26709 assert(NodeI2 && "Should only process reachable instructions");
26710 assert((NodeI1 == NodeI2) ==
26711 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26712 "Different nodes should have different DFS numbers");
26713 if (NodeI1 != NodeI2)
26714 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26715 return I1->getOpcode() < I2->getOpcode();
26716 }
26717 return V->getValueOperand()->getValueID() <
26718 V2->getValueOperand()->getValueID();
26719 };
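// In effect, StoreSorter orders stores by value-operand type ID, pointer
// operand type ID, and scalar size; when both value operands are instructions
// it then compares the dominator-tree DFS-in numbers of their parent blocks
// and their opcodes, and otherwise it falls back to the value IDs. This makes
// likely-compatible stores adjacent before they are grouped below.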
26720
26721 bool SameParent = true;
26722 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26723 if (VL.empty()) {
26724 SameParent = true;
26725 return true;
26726 }
26727 StoreInst *V2 = VL.back();
26728 if (V1 == V2)
26729 return true;
26730 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26731 return false;
26732 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26733 return false;
26734 // Undefs are compatible with any other value.
26735 if (isa<UndefValue>(V1->getValueOperand()) ||
26736 isa<UndefValue>(V2->getValueOperand()))
26737 return true;
26738 if (isa<Constant>(V1->getValueOperand()) &&
26739 isa<Constant>(V2->getValueOperand()))
26740 return true;
26741 // Check if the operands of the stores can be vectorized. They can be
26742 // vectorized if they have compatible operands or operands that can be
26743 // vectorized as copyables.
26744 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26745 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26746 if (I1 || I2) {
26747 // For now, accept non-compatible values only at the tail of the list.
26748 // TODO: investigate whether it is possible to vectorize incompatible
26749 // values if the copyables come first in the list.
26750 if (I1 && !I2)
26751 return false;
26752 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26753 SmallVector<Value *> NewVL(VL.size() + 1);
26754 for (auto [SI, V] : zip(VL, NewVL))
26755 V = SI->getValueOperand();
26756 NewVL.back() = V1->getValueOperand();
26757 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26758 InstructionsState S = Analysis.buildInstructionsState(
26759 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26760 /*SkipSameCodeCheck=*/!SameParent);
26761 if (S)
26762 return true;
26763 if (!SameParent)
26764 return false;
26765 }
26766 return V1->getValueOperand()->getValueID() ==
26767 V2->getValueOperand()->getValueID();
26768 };
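// AreCompatibleStores decides whether a store may join the current group: the
// value and pointer operand types must match, undefs and constants are always
// accepted, instruction operands are accepted when buildInstructionsState can
// form a (possibly copyable) vectorizable state for the extended list, and
// anything else falls back to comparing value IDs.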
26769
26770 // Attempt to sort and vectorize each of the store-groups.
26771 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26772 for (auto &Pair : Stores) {
26773 if (Pair.second.size() < 2)
26774 continue;
26775
26776 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26777 << Pair.second.size() << ".\n");
26778
26779 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26780 continue;
26781
26782 // Reverse the stores to do bottom-to-top analysis. This is important if
26783 // there are several stores to the same address: in that case we need to
26784 // follow the store order (reversed, to honor the memory dependencies).
26785 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26786 Pair.second.rend());
26787 Changed |= tryToVectorizeSequence<StoreInst>(
26788 ReversedStores, StoreSorter, AreCompatibleStores,
26789 [&](ArrayRef<StoreInst *> Candidates, bool) {
26790 return vectorizeStores(Candidates, R, Attempted);
26791 },
26792 /*MaxVFOnly=*/false, R);
26793 }
26794 return Changed;
26795}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Early If Converter
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
Definition ExpandFp.cpp:993
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
static Type * getIndexType(Value *In)
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file provides utility analysis objects describing memory locations.
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1407
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1397
void negate()
Negate this APInt in place.
Definition APInt.h:1469
unsigned logBase2() const
Definition APInt.h:1762
void setAllBits()
Set every bit to 1.
Definition APInt.h:1320
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1368
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:287
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:179
const T & back() const
back - Get the last element.
Definition ArrayRef.h:152
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:220
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:196
const T & front() const
front - Get the first element.
Definition ArrayRef.h:146
iterator end() const
Definition ArrayRef.h:132
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
iterator begin() const
Definition ArrayRef.h:131
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:138
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:187
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition ArrayRef.h:158
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition InstrTypes.h:664
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool erase(const KeyT &Val)
Definition DenseMap.h:322
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
iterator end()
Definition DenseMap.h:81
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:233
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:857
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
iterator end()
Definition MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:48
iterator find(const KeyT &Key)
Definition MapVector.h:149
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:111
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:299
T & front() const
front - Get the first element.
Definition ArrayRef.h:350
iterator end() const
Definition ArrayRef.h:344
iterator begin() const
Definition ArrayRef.h:343
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
const value_type & front() const
Return the first element of the SetVector.
Definition SetVector.h:131
void insert_range(Range &&R)
Definition SetVector.h:175
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition SetVector.h:93
void clear()
Completely clear the SetVector.
Definition SetVector.h:266
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:251
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
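A hedged sketch of how the TargetTransformInfo cost hooks listed above are typically queried; the opcode and element type are illustrative choices, not taken from the pass:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

// Reciprocal-throughput cost of a <VF x i32> add on the current target.
static llvm::InstructionCost
vectorAddCost(const llvm::TargetTransformInfo &TTI, llvm::LLVMContext &Ctx,
              unsigned VF) {
  auto *VecTy = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), VF);
  return TTI.getArithmeticInstrCost(
      llvm::Instruction::Add, VecTy,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
}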
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:181
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:24
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
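A hedged sketch of the use-list API listed above: replace a value only when it has exactly one user (purely illustrative):
#include "llvm/IR/Value.h"

// hasOneUse() checks for exactly one use; replaceAllUsesWith() rewrites
// every use of Old to point at New. Types must match.
static bool replaceIfSingleUse(llvm::Value *Old, llvm::Value *New) {
  if (!Old->hasOneUse() || Old->getType() != New->getType())
    return false;
  Old->replaceAllUsesWith(New);
  return true;
}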
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alternate opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
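The BoUpSLP entry points above are driven in a fairly fixed order. A condensed, hedged sketch of that sequence as it might appear inside SLPVectorizer.cpp (where BoUpSLP is visible); the real driver also handles reordering policies, cost thresholds, and remarks that are omitted here:
// Simplified driver: build the tree, analyze it, and vectorize only when
// the modeled cost is negative (i.e. profitable). Ordering is approximate.
static bool tryVectorizeRoots(slpvectorizer::BoUpSLP &R,
                              llvm::ArrayRef<llvm::Value *> Roots,
                              const llvm::SmallDenseSet<llvm::Value *> &Ignore) {
  R.deleteTree();                    // start from a clean graph
  R.buildTree(Roots, Ignore);        // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();            // pick profitable lane orders
  R.reorderBottomToTop();
  R.transformNodes();                // target-specific node rewrites
  R.buildExternalUses();             // scalars still used outside the tree
  R.computeMinimumValueSizes();      // min-bitwidth analysis
  llvm::InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || Cost >= 0)  // simplified profitability check
    return false;
  R.vectorizeTree();
  return true;
}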
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
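A hedged sketch of the PatternMatch helpers listed above, recognizing a single-use shift-left by a constant (the helper is illustrative):
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Matches V == (X << C) where the shift has exactly one use; binds X and C.
static bool matchOneUseShlByConst(llvm::Value *V, llvm::Value *&X,
                                  const llvm::APInt *&C) {
  using namespace llvm::PatternMatch;
  return match(V, m_OneUse(m_Shl(m_Value(X), m_APInt(C))));
}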
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1718
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
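A hedged sketch of the range helpers listed above (all_of plus dyn_cast), applied to an illustrative same-opcode check over a bundle of values:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

// True if every value in VL is an instruction with the given opcode.
static bool allHaveOpcode(llvm::ArrayRef<llvm::Value *> VL, unsigned Opcode) {
  return llvm::all_of(VL, [Opcode](llvm::Value *V) {
    auto *I = llvm::dyn_cast<llvm::Instruction>(V);
    return I && I->getOpcode() == Opcode;
  });
}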
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1724
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1981
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2128
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1968
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
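createStrideMask, listed above, builds the shuffle mask that selects every Stride-th element starting at Start. A hedged sketch (the wrapper is illustrative):
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"

// For VF == 4 this yields the mask {0, 2, 4, 6}, i.e. the even lanes of a
// two-way interleaved pair of vectors.
static llvm::SmallVector<int, 16> evenLaneMask(unsigned VF) {
  return llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, VF);
}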
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1763
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:339
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1920
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
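A hedged sketch of getPointersDiff, listed above, used to check whether two loads access adjacent elements; with StrictCheck the distance comes back in element units. Variable names are illustrative:
#include <optional>
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

// True when B loads the element immediately after the one loaded by A.
static bool areConsecutiveLoads(llvm::LoadInst *A, llvm::LoadInst *B,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int64_t> Diff = llvm::getPointersDiff(
      A->getType(), A->getPointerOperand(), B->getType(),
      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}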
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2030
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1835
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:257
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1425
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1434
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const