#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
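// Statistics and command-line options for the SLP vectorizer: pass
// enablement, cost thresholds, horizontal-reduction vectorization, target
// register sizes, scheduling-region and recursion limits, look-ahead depths,
// strided-load heuristics, and debugging aids such as Graphviz dumps.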
STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");

        "Controls which SLP graphs should be vectorized.");
    cl::desc("Run the SLP vectorization passes"));
    cl::desc("Enable vectorization for wider vector utilization"));
    cl::desc("Only vectorize if you gain more than this "
    cl::desc(
        "When true, SLP vectorizer bypasses profitability checks based on "
        "heuristics and makes vectorization decision via cost modeling."));
    cl::desc("Attempt to vectorize horizontal reductions"));
        "Attempt to vectorize horizontal reductions feeding into a store"));
    cl::desc("Improve the code quality by splitting alternate instructions"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Limit the recursion depth when building a vectorizable tree"));
    cl::desc("Only vectorize small trees if they are fully vectorizable"));
    cl::desc("The maximum look-ahead depth for operand reordering scores"));
    cl::desc("The maximum look-ahead depth for searching best rooting option"));
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));
    cl::desc("The maximum stride, considered to be profitable."));
    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));
    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));
    cl::desc("Display the SLP trees with Graphviz"));
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();
    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();
         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();
                                        Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
                           I * VecTyNumElements, VecTyNumElements)))
              : Mask[I] * VecTyNumElements + J;
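// The next helper matches a sequence of shufflevectors that each extract a
// distinct contiguous subvector of one common source, GroupSize values at a
// time, collecting the expected subvector indices in a bitset.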
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");

  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                       : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
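// isCommutative: besides intrinsically commutative operations, a sub/fsub
// whose every user is sign-insensitive (an eq/ne compare against zero, or an
// llvm.abs/llvm.fabs call) is treated as commutative, since swapping its
// operands only flips the sign of the result.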
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();

  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
      if (MaskArg == UseMask::UndefsAsMask)
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
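// Checks whether a list of extractelements can be represented as one shuffle
// of at most two source vectors, classifying it as a select (two-source
// blend) or a general permute.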
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();

bool isValidForAlternation(unsigned Opcode) {
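// BinOpSameOpcodeHelper tracks, as a bitmask, the set of binary opcodes that
// every instruction seen so far could be rewritten to: e.g. "shl x, 0",
// "mul x, 1" and "add x, 0" are all interchangeable, so a mixed bundle may
// still share a single main (and optionally one alternate) opcode.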
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add,  Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or,  Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
    unsigned FromOpcode = I->getOpcode();
    if (FromOpcode == ToOpcode)
    auto [CI, Pos] = isBinOpWithConstantInt(I);
    const APInt &FromCIValue = CI->getValue();
    unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
    switch (FromOpcode) {
    case Instruction::Shl:
      if (ToOpcode == Instruction::Mul) {
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        ? APInt::getAllOnes(FromCIValueBitWidth)
                        : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Mul:
      if (ToOpcode == Instruction::Shl) {
        ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        assert(FromCIValue.isOne() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        ? APInt::getAllOnes(FromCIValueBitWidth)
                        : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Add:
    case Instruction::Sub:
      if (FromCIValue.isZero()) {
               "Cannot convert the instruction.");
        ToCIValue = FromCIValue;
    case Instruction::And:
      ToCIValue = ToOpcode == Instruction::Mul
                      ? APInt(FromCIValueBitWidth, 1)
                      : APInt::getZero(FromCIValueBitWidth);
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
    Value *LHS = I->getOperand(1 - Pos);
        ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
        ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
          FromOpcode == Instruction::Xor) &&
         ToOpcode == Instruction::Sub))
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
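// InstructionsState describes a bundle by its main and alternate opcode
// (equal for homogeneous bundles), and records whether some lanes are
// "copyable" elements that merely mimic the main opcode.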
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul,  Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                 !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
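// convertTo rewrites instruction I into the bundle's main or alternate
// opcode form, returning the selected instruction and, for binary operators,
// the operand list adjusted by BinOpSameOpcodeHelper.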
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));

  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)
      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 isValidForAlternation(InstOpcode) &&
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
      return InstructionsState::invalid();
    assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
    assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });

  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];

  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                            unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Scalars[Mask[I]] = Prev[I];
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
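  // The BoUpSLP class implements the bottom-up SLP vectorizer proper; its
  // scheduling entities, strided-pointer bookkeeping, and vectorizable-tree
  // data structures follow.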
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;
  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
    return MinBWs.at(VectorizableTree.front().get()).second;
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();

    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;
                     bool IgnoreReorder);
  std::optional<OrdersType>
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
        MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                           Align Alignment, const int64_t Diff,
                           const size_t Sz) const;
                      Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
                         Align CommonAlignment,
                         StridedPtrInfo &SPtrInfo) const;
                    StridedPtrInfo &SPtrInfo,
                    unsigned *BestVF = nullptr,
                    bool TryRecursiveCheck = true) const;
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.isVectorized(U);
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
      auto CheckSameEntryOrFail = [&]() {
            any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          return CheckSameEntryOrFail();
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
            return CheckSameEntryOrFail();
        if (std::abs(*Dist) > NumLanes / 2)
        Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
        return CheckSameEntryOrFail();
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();
      int ShallowScoreAtThisLevel =
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
                ? I2->getNumOperands()
                : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          if (Op2Used.count(OpIdx2))
              I1, I2, CurrLevel + 1, {});
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
      return ShallowScoreAtThisLevel;
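    // VLOperands models the operands of a bundle as a table indexed by
    // operand position and lane, and reorders each lane's operands to
    // maximize the look-ahead score so that isomorphic operand chains line
    // up across lanes.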
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      bool IsUsed = false;
    enum class ReorderingMode {
    unsigned ArgSize = 0;
    const Loop *L = nullptr;
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
          OpsVec[OpIdx][Lane].IsUsed = false;
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
      Value *IdxLaneV = getData(Idx, Lane).V;
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      return R.areAllUsersVectorized(IdxLaneI)
    static const int ScoreScaleFactor = 10;
                          int Lane, unsigned OpIdx, unsigned Idx,
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
      unsigned NumOperands = getNumOperands();
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
        std::optional<unsigned> Idx;
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        OperandData &OpData = getData(Idx, Lane);
        bool OpAPO = OpData.APO;
        if (OpAPO != OpIdxAPO)
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        case ReorderingMode::Constant:
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Splat:
            IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Failed:
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return std::nullopt;
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
    struct OperandsOrderData {
      unsigned NumOfAPOs = UINT_MAX;
      unsigned NumOpsWithSameOpcodeParent = 0;
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
        const OperandData &OpData = getData(OpIdx, Lane);
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
          ++NumOpsWithSameOpcodeParent;
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                        const InstructionsState &S) {
        return VL.size() == getNumLanes();
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      OpsVec.resize(ArgSize);
      unsigned NumLanes = VL.size();
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
        bool IsInverseOperation = false;
        if (S.isCopyableElement(VL[Lane])) {
          assert(I && "Expected instruction");
          auto [SelectedOp, Ops] = convertTo(I, S);
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
    unsigned getNumOperands() const { return ArgSize; }
    unsigned getNumLanes() const { return OpsVec[0].size(); }
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    bool empty() const { return OpsVec.empty(); }
    void clear() { OpsVec.clear(); }
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
          Value *OpILane = getValue(OpI, Lane);
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
        if (!FoundCandidate)
      return getNumLanes() == 2 || Cnt > 1;
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
          const OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
          Value *OpILn = getValue(OpI, Ln);
          return (L && L->isLoopInvariant(OpILn)) ||
               const InstructionsState &S, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      appendOperands(RootVL, Operands, S);
3297 "Expected same num of lanes across all operands");
3298 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3299 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3307 unsigned NumOperands = getNumOperands();
3308 unsigned NumLanes = getNumLanes();
3328 unsigned FirstLane = getBestLaneToStartReordering();
3337 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3338 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3339 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3341 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3343 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3345 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3348 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3358 auto &&SkipReordering = [
this]() {
3361 for (
const OperandData &
Data : Op0)
3364 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3365 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3372 return UniqueValues.
size() != 2 &&
3374 UniqueValues.
size());
3386 if (SkipReordering())
3389 bool StrategyFailed =
false;
3397 for (
unsigned I = 0;
I < NumOperands; ++
I)
3398 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3401 UsedLanes.
set(FirstLane);
3402 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3404 for (
int Direction : {+1, -1}) {
3405 int Lane = FirstLane + Direction * Distance;
3406 if (Lane < 0 || Lane >= (
int)NumLanes)
3408 UsedLanes.
set(Lane);
3409 int LastLane = Lane - Direction;
3410 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3415 std::optional<unsigned> BestIdx =
3416 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3417 MainAltOps[
OpIdx], UsedLanes);
3424 swap(
OpIdx, *BestIdx, Lane);
3427 StrategyFailed =
true;
3431 OperandData &AltOp = getData(
OpIdx, Lane);
3432 InstructionsState OpS =
3434 if (OpS && OpS.isAltShuffle())
3441 if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode:
      case ReorderingMode::Constant:
      case ReorderingMode::Splat:
      case ReorderingMode::Failed:
      const unsigned Indent = 2;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
          OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
                 cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
          if (!DeletedInstructions.contains(OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(VectorValuesAndScales,
                       [&](const std::tuple<Value *, unsigned, bool> &V) {
                         return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
                     const InstructionsState &LocalState,
                     unsigned InterleaveFactor = 0);
                 bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                            unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
                      Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
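  // TreeEntry is a single node of the vectorizable tree: the bundle of
  // scalars, its state (Vectorize, NeedToGather, Strided/Compress/Combined/
  // Split variants), reuse and reorder masks, operands, and scheduling info.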
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == PoisonMaskElem) ||
                                 (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;
    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;

  TreeEntry::VecTreeTy VectorizableTree;
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();
                          bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,
  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  ScalarsVectorizationLegality
                               bool TryCopyableElementsVectorization) const;
  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
  SmallDenseMap<Value *, unsigned> InstrElementSize;
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
  SetVector<const TreeEntry *> PostponedGathers;
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  SetVector<unsigned> LoadEntriesToVectorize;
  bool IsGraphTransformMode = false;
  std::optional<unsigned> GatheredLoadsEntriesFirst;
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}
    Value *Scalar = nullptr;
    llvm::User *User = nullptr;
  using UserList = SmallVector<ExternalUser, 16>;
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    Res.first->getSecond() = Aliased;
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  SmallDenseMap<AliasCacheKey, bool> AliasCache;
  BatchAAResults BatchAA;
  DenseSet<Instruction *> DeletedInstructions;
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
  DenseSet<size_t> AnalyzedReductionVals;
  DenseSet<Value *> AnalyzedMinBWVals;
  UserList ExternalUses;
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
  SmallPtrSet<const Value *, 32> EphValues;
  SetVector<Instruction *> GatherShuffleExtractSeq;
  DenseSet<BasicBlock *> CSEBlocks;
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4721 class ScheduleEntity {
4722 friend class ScheduleBundle;
4723 friend class ScheduleData;
4724 friend class ScheduleCopyableData;
4727 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4728 Kind getKind()
const {
return K; }
4729 ScheduleEntity(Kind K) : K(K) {}
4733 int SchedulingPriority = 0;
4736 bool IsScheduled =
false;
4738 const Kind K = Kind::ScheduleData;
4741 ScheduleEntity() =
delete;
4743 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4744 int getSchedulingPriority()
const {
return SchedulingPriority; }
4745 bool isReady()
const {
4747 return SD->isReady();
4749 return CD->isReady();
4755 bool hasValidDependencies()
const {
4757 return SD->hasValidDependencies();
4759 return CD->hasValidDependencies();
4763 int getUnscheduledDeps()
const {
4765 return SD->getUnscheduledDeps();
4767 return CD->getUnscheduledDeps();
4771 int incrementUnscheduledDeps(
int Incr) {
4773 return SD->incrementUnscheduledDeps(Incr);
4777 int getDependencies()
const {
4779 return SD->getDependencies();
4785 return SD->getInst();
4790 bool isScheduled()
const {
return IsScheduled; }
4791 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4793 static bool classof(
const ScheduleEntity *) {
return true; }
4795#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4796 void dump(raw_ostream &OS)
const {
4798 return SD->dump(OS);
4800 return CD->dump(OS);
4811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4813 const BoUpSLP::ScheduleEntity &SE) {
4823 class ScheduleData final :
public ScheduleEntity {
4827 enum { InvalidDeps = -1 };
4829 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4830 static bool classof(
const ScheduleEntity *Entity) {
4831 return Entity->getKind() == Kind::ScheduleData;
4834 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4835 NextLoadStore =
nullptr;
4836 IsScheduled =
false;
4837 SchedulingRegionID = BlockSchedulingRegionID;
4838 clearDependencies();
4844 if (hasValidDependencies()) {
4845 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4847 assert(UnscheduledDeps == Dependencies &&
"invariant");
4851 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4852 "unexpected scheduled state");
4859 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4863 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4868 int incrementUnscheduledDeps(
int Incr) {
4869 assert(hasValidDependencies() &&
4870 "increment of unscheduled deps would be meaningless");
4871 UnscheduledDeps += Incr;
4872 return UnscheduledDeps;
4877 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4880 void clearDependencies() {
4881 clearDirectDependencies();
4882 MemoryDependencies.clear();
4883 ControlDependencies.clear();
4890 void clearDirectDependencies() {
4891 Dependencies = InvalidDeps;
4892 resetUnscheduledDeps();
4893 IsScheduled =
false;
4897 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4899 int getDependencies()
const {
return Dependencies; }
4901 void initDependencies() { Dependencies = 0; }
4903 void incDependencies() { Dependencies++; }
4906 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4913 return MemoryDependencies;
4916 void addMemoryDependency(ScheduleData *Dep) {
4917 MemoryDependencies.push_back(Dep);
4921 return ControlDependencies;
4924 void addControlDependency(ScheduleData *Dep) {
4925 ControlDependencies.push_back(Dep);
4928 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4929 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4931 void dump(raw_ostream &OS)
const { OS << *Inst; }
4943 ScheduleData *NextLoadStore =
nullptr;
4947 SmallVector<ScheduleData *> MemoryDependencies;
4953 SmallVector<ScheduleData *> ControlDependencies;
4957 int SchedulingRegionID = 0;
4963 int Dependencies = InvalidDeps;
4969 int UnscheduledDeps = InvalidDeps;
4974 const BoUpSLP::ScheduleData &SD) {
4980 class ScheduleBundle final :
public ScheduleEntity {
4984 bool IsValid =
true;
4986 TreeEntry *TE =
nullptr;
4987 ScheduleBundle(
bool IsValid)
4988 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4991 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4992 static bool classof(
const ScheduleEntity *Entity) {
4993 return Entity->getKind() == Kind::ScheduleBundle;
4998 for (
const ScheduleEntity *SD : Bundle) {
4999 if (SD->hasValidDependencies()) {
5000 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5003 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5007 if (isScheduled()) {
5008 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5009 "unexpected scheduled state");
5015 int unscheduledDepsInBundle()
const {
5016 assert(*
this &&
"bundle must not be empty");
5018 for (
const ScheduleEntity *BundleMember : Bundle) {
5019 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5020 return ScheduleData::InvalidDeps;
5021 Sum += BundleMember->getUnscheduledDeps();
5029 bool hasValidDependencies()
const {
5030 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5031 return SD->hasValidDependencies();
5037 bool isReady()
const {
5038 assert(*
this &&
"bundle must not be empty");
5039 return unscheduledDepsInBundle() == 0 && !isScheduled();
5047 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5050 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5051 TreeEntry *getTreeEntry()
const {
return TE; }
5053 static ScheduleBundle invalid() {
return {
false}; }
5055 operator bool()
const {
return IsValid; }
5058 void dump(raw_ostream &OS)
const {
5067 OS << *SD->getInst();
5081 const BoUpSLP::ScheduleBundle &Bundle) {
5092 class ScheduleCopyableData final :
public ScheduleEntity {
5099 int SchedulingRegionID = 0;
5101 ScheduleBundle &Bundle;
5104 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5105 const EdgeInfo &EI, ScheduleBundle &Bundle)
5106 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5107 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5108 static bool classof(
const ScheduleEntity *Entity) {
5109 return Entity->getKind() == Kind::ScheduleCopyableData;
5114 if (hasValidDependencies()) {
5115 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5117 assert(UnscheduledDeps == Dependencies &&
"invariant");
5121 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5122 "unexpected scheduled state");
5129 bool hasValidDependencies()
const {
5130 return Dependencies != ScheduleData::InvalidDeps;
5135 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5140 int incrementUnscheduledDeps(
int Incr) {
5141 assert(hasValidDependencies() &&
5142 "increment of unscheduled deps would be meaningless");
5143 UnscheduledDeps += Incr;
5144 assert(UnscheduledDeps >= 0 &&
"invariant");
5145 return UnscheduledDeps;
5150 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5153 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5155 int getDependencies()
const {
return Dependencies; }
5157 void initDependencies() { Dependencies = 0; }
5159 void incDependencies() { Dependencies++; }
5162 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5168 void clearDependencies() {
5169 Dependencies = ScheduleData::InvalidDeps;
5170 UnscheduledDeps = ScheduleData::InvalidDeps;
5171 IsScheduled =
false;
5175 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5178 ScheduleBundle &getBundle() {
return Bundle; }
5179 const ScheduleBundle &getBundle()
const {
return Bundle; }
5181#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5182 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5193 int Dependencies = ScheduleData::InvalidDeps;
5199 int UnscheduledDeps = ScheduleData::InvalidDeps;
5229 struct BlockScheduling {
5231 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5234 ScheduledBundles.clear();
5235 ScheduledBundlesList.
clear();
5236 ScheduleCopyableDataMap.clear();
5237 ScheduleCopyableDataMapByInst.clear();
5238 ScheduleCopyableDataMapByInstUser.clear();
5239 ScheduleCopyableDataMapByUsers.clear();
5241 ScheduleStart =
nullptr;
5242 ScheduleEnd =
nullptr;
5243 FirstLoadStoreInRegion =
nullptr;
5244 LastLoadStoreInRegion =
nullptr;
5245 RegionHasStackSave =
false;
5249 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5252 ScheduleRegionSize = 0;
5256 ++SchedulingRegionID;
5262 if (BB !=
I->getParent())
5265 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5266 if (SD && isInSchedulingRegion(*SD))
5271 ScheduleData *getScheduleData(
Value *V) {
5277 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5278 const Value *V)
const {
5279 if (ScheduleCopyableDataMap.empty())
5281 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5282 if (It == ScheduleCopyableDataMap.end())
5284 ScheduleCopyableData *SD = It->getSecond().get();
5285 if (!isInSchedulingRegion(*SD))
5293 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5295 if (ScheduleCopyableDataMapByInstUser.empty())
5297 const auto It = ScheduleCopyableDataMapByInstUser.find(
5298 std::make_pair(std::make_pair(User, OperandIdx), V));
5299 if (It == ScheduleCopyableDataMapByInstUser.end())
5302 for (ScheduleCopyableData *SD : It->getSecond()) {
5303 if (isInSchedulingRegion(*SD))
5317 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5321 if (ScheduleCopyableDataMap.empty())
5323 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5324 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5325 for (
const Use &U :
User->operands()) {
5329 if (Entries.
empty())
5333 for (TreeEntry *TE : Entries) {
5339 bool IsCommutativeUser =
5344 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5345 EdgeInfo EI(TE,
U.getOperandNo());
5346 if (!getScheduleCopyableData(EI,
Op))
5352 ++PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5353 .first->getSecond();
5356 if (PotentiallyReorderedEntriesCount.
empty())
5357 return all_of(OrderedEntriesCount,
5358 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5362 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5363 auto *It =
find(
P.first->Scalars, User);
5364 assert(It !=
P.first->Scalars.end() &&
"User is not in the tree entry");
5365 int Lane = std::distance(
P.first->Scalars.begin(), It);
5366 assert(Lane >= 0 &&
"Lane is not found");
5368 Lane =
P.first->ReorderIndices[Lane];
5369 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5370 "Couldn't find extract lane");
5371 SmallVector<unsigned> OpIndices;
5372 for (
unsigned OpIdx :
5374 P.first->getMainOp()))) {
5375 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5376 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5380 return all_of(PotentiallyReorderedEntriesCount,
5381 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5382 return P.second ==
NumOps - 1;
5384 all_of(OrderedEntriesCount,
5385 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5391 getScheduleCopyableData(
const Instruction *
I)
const {
5392 if (ScheduleCopyableDataMapByInst.empty())
5394 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5395 if (It == ScheduleCopyableDataMapByInst.end())
5398 for (ScheduleCopyableData *SD : It->getSecond()) {
5399 if (isInSchedulingRegion(*SD))
5406 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5407 if (ScheduleCopyableDataMapByUsers.empty())
5409 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5410 if (It == ScheduleCopyableDataMapByUsers.end())
5413 for (ScheduleCopyableData *SD : It->getSecond()) {
5414 if (isInSchedulingRegion(*SD))
5420 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5422 int SchedulingRegionID,
5423 ScheduleBundle &Bundle) {
5424 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5425 ScheduleCopyableData *CD =
5426 ScheduleCopyableDataMap
5427 .try_emplace(std::make_pair(EI,
I),
5428 std::make_unique<ScheduleCopyableData>(
5429 SchedulingRegionID,
I, EI, Bundle))
5432 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5436 assert(It !=
Op.end() &&
"Lane not set");
5437 SmallPtrSet<Instruction *, 4> Visited;
5439 int Lane = std::distance(
Op.begin(), It);
5440 assert(Lane >= 0 &&
"Lane not set");
5442 !EI.UserTE->ReorderIndices.empty())
5443 Lane = EI.UserTE->ReorderIndices[Lane];
5444 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5445 "Couldn't find extract lane");
5447 if (!Visited.
insert(In).second) {
5451 ScheduleCopyableDataMapByInstUser
5452 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5455 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5462 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5463 if (ScheduleCopyableData *UserCD =
5464 getScheduleCopyableData(UserEI, In))
5465 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5468 }
while (It !=
Op.end());
5470 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5480 auto It = ScheduledBundles.find(
I);
5481 if (It == ScheduledBundles.end())
5483 return It->getSecond();
5487 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5489 return Data->getSchedulingRegionID() == SchedulingRegionID;
5491 return CD->getSchedulingRegionID() == SchedulingRegionID;
5493 [&](
const ScheduleEntity *BundleMember) {
5494 return isInSchedulingRegion(*BundleMember);
5500 template <
typename ReadyListType>
5501 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5502 const EdgeInfo &EI, ScheduleEntity *
Data,
5503 ReadyListType &ReadyList) {
5504 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5509 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5510 if ((IsControl ||
Data->hasValidDependencies()) &&
5511 Data->incrementUnscheduledDeps(-1) == 0) {
5518 CopyableBundle.
push_back(&CD->getBundle());
5519 Bundles = CopyableBundle;
5521 Bundles = getScheduleBundles(
Data->getInst());
5523 if (!Bundles.
empty()) {
5524 for (ScheduleBundle *Bundle : Bundles) {
5525 if (Bundle->unscheduledDepsInBundle() == 0) {
5526 assert(!Bundle->isScheduled() &&
5527 "already scheduled bundle gets ready");
5528 ReadyList.insert(Bundle);
5530 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5536 "already scheduled bundle gets ready");
5538 "Expected non-copyable data");
5539 ReadyList.insert(
Data);
5546 if (!ScheduleCopyableDataMap.empty()) {
5548 getScheduleCopyableData(User,
OpIdx,
I);
5549 for (ScheduleCopyableData *CD : CopyableData)
5550 DecrUnsched(CD,
false);
5551 if (!CopyableData.empty())
5554 if (ScheduleData *OpSD = getScheduleData(
I))
5555 DecrUnsched(OpSD,
false);
5561 if (!Bundles.empty()) {
5562 auto *
In = BundleMember->getInst();
5564 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5565 unsigned TotalOpCount = 0;
5568 TotalOpCount = OperandsUses[
In] = 1;
5570 for (
const Use &U :
In->operands()) {
5573 ++Res.first->getSecond();
5580 auto DecrUnschedForInst =
5582 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5584 if (!ScheduleCopyableDataMap.empty()) {
5585 const EdgeInfo EI = {UserTE,
OpIdx};
5586 if (ScheduleCopyableData *CD =
5587 getScheduleCopyableData(EI,
I)) {
5588 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5590 DecrUnsched(CD,
false);
5594 auto It = OperandsUses.
find(
I);
5595 assert(It != OperandsUses.
end() &&
"Operand not found");
5596 if (It->second > 0) {
5598 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5600 if (ScheduleData *OpSD = getScheduleData(
I)) {
5601 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5603 DecrUnsched(OpSD,
false);
5608 for (ScheduleBundle *Bundle : Bundles) {
5609 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5611 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5614 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5615 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5618 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5619 assert(Lane >= 0 &&
"Lane not set");
5621 !Bundle->getTreeEntry()->ReorderIndices.empty())
5622 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5623 assert(Lane <
static_cast<int>(
5624 Bundle->getTreeEntry()->Scalars.size()) &&
5625 "Couldn't find extract lane");
5635 In->getNumOperands() ==
5636 Bundle->getTreeEntry()->getNumOperands() ||
5637 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5638 "Missed TreeEntry operands?");
5640 bool IsNonSchedulableWithParentPhiNode =
5641 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5642 Bundle->getTreeEntry()->UserTreeIndex &&
5643 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5644 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5648 if (IsNonSchedulableWithParentPhiNode) {
5649 const TreeEntry *ParentTE =
5650 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5652 if (!ParentsUniqueUsers.
insert(User).second)
5656 for (
unsigned OpIdx :
5659 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5662 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5665 if (!IsNonSchedulableWithParentPhiNode)
5667 It = std::find(std::next(It),
5668 Bundle->getTreeEntry()->Scalars.end(), In);
5669 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5674 for (Use &U : BundleMember->getInst()->operands()) {
5677 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5678 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5686 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5687 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5688 if (!VisitedMemory.
insert(MemoryDep).second)
5693 << *MemoryDep <<
"\n");
5694 DecrUnsched(MemoryDep);
5697 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5698 for (ScheduleData *Dep : SD->getControlDependencies()) {
5699 if (!VisitedControl.
insert(Dep).second)
5704 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5705 DecrUnsched(Dep,
true);
5709 SD->setScheduled(
true);
5714 if (
R.isVectorized(In)) {
5716 for (TreeEntry *TE : Entries) {
5718 In->getNumOperands() !=
TE->getNumOperands())
5721 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5722 BundlePtr->setTreeEntry(TE);
5727 ProcessBundleMember(SD, Bundles);
5730 Bundle.setScheduled(
true);
5732 auto AreAllBundlesScheduled =
5733 [&](
const ScheduleEntity *SD,
5737 return !SDBundles.empty() &&
5738 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5739 return SDBundle->isScheduled();
5742 for (ScheduleEntity *SD : Bundle.getBundle()) {
5745 SDBundles = getScheduleBundles(SD->getInst());
5746 if (AreAllBundlesScheduled(SD, SDBundles)) {
5747 SD->setScheduled(
true);
5760 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5761 ScheduleStart->comesBefore(ScheduleEnd) &&
5762 "Not a valid scheduling region?");
5764 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5766 if (!Bundles.
empty()) {
5767 for (ScheduleBundle *Bundle : Bundles) {
5768 assert(isInSchedulingRegion(*Bundle) &&
5769 "primary schedule data not in window?");
5774 auto *SD = getScheduleData(
I);
5777 assert(isInSchedulingRegion(*SD) &&
5778 "primary schedule data not in window?");
5783 [](
const ScheduleEntity *Bundle) {
5784 return Bundle->isReady();
5786 "item in ready list not ready?");
5790 template <
typename ReadyListType>
5791 void initialFillReadyList(ReadyListType &ReadyList) {
5792 SmallPtrSet<ScheduleBundle *, 16> Visited;
5793 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5794 ScheduleData *SD = getScheduleData(
I);
5795 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5798 for (ScheduleBundle *Bundle : Bundles) {
5799 if (!Visited.
insert(Bundle).second)
5801 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5802 ReadyList.insert(Bundle);
5804 << *Bundle <<
"\n");
5809 ReadyList.insert(SD);
5811 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5822 const InstructionsState &S,
const EdgeInfo &EI);
5829 std::optional<ScheduleBundle *>
5831 const InstructionsState &S,
const EdgeInfo &EI);
5834 ScheduleData *allocateScheduleDataChunks();
5838 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5842 void initScheduleData(Instruction *FromI, Instruction *ToI,
5843 ScheduleData *PrevLoadStore,
5844 ScheduleData *NextLoadStore);
5848 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5853 void resetSchedule();
5870 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5874 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5875 std::unique_ptr<ScheduleCopyableData>>
5876 ScheduleCopyableDataMap;
5882 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5883 ScheduleCopyableDataMapByInst;
5889 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5891 ScheduleCopyableDataMapByInstUser;
5911 SmallSetVector<ScheduleCopyableData *, 4>>
5912 ScheduleCopyableDataMapByUsers;
5915 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5921 SetVector<ScheduleEntity *> ReadyInsts;
5931 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5935 ScheduleData *LastLoadStoreInRegion =
nullptr;
5940 bool RegionHasStackSave =
false;
5943 int ScheduleRegionSize = 0;
5952 int SchedulingRegionID = 1;
5956 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5960 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
5963 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
5967 struct OrdersTypeDenseMapInfo {
5980 static unsigned getHashValue(
const OrdersType &V) {
5991 ScalarEvolution *SE;
5992 TargetTransformInfo *TTI;
5993 TargetLibraryInfo *TLI;
5996 AssumptionCache *AC;
5998 const DataLayout *DL;
5999 OptimizationRemarkEmitter *ORE;
6001 unsigned MaxVecRegSize;
6002 unsigned MinVecRegSize;
6005 IRBuilder<TargetFolder> Builder;
6012 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6017 unsigned ReductionBitWidth = 0;
6020 unsigned BaseGraphSize = 1;
6024 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6028 DenseSet<unsigned> ExtraBitWidthNodes;
6038 SecondInfo::getEmptyKey());
6043 SecondInfo::getTombstoneKey());
6048 SecondInfo::getHashValue(Val.
EdgeIdx));
6069 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6080 return R.VectorizableTree[0].get();
6084 return {&
N->UserTreeIndex,
N->Container};
6088 return {&
N->UserTreeIndex + 1,
N->Container};
6115 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6126 OS << Entry->Idx <<
".\n";
6129 for (
auto *V : Entry->Scalars) {
6131 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6132 return EU.Scalar == V;
6142 if (Entry->isGather())
6144 if (Entry->State == TreeEntry::ScatterVectorize ||
6145 Entry->State == TreeEntry::StridedVectorize ||
6146 Entry->State == TreeEntry::CompressVectorize)
6147 return "color=blue";
6156 for (
auto *
I : DeletedInstructions) {
6157 if (!
I->getParent()) {
6162 I->insertBefore(F->getEntryBlock(),
6163 F->getEntryBlock().getFirstNonPHIIt());
6165 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6168 for (
Use &U :
I->operands()) {
6170 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6174 I->dropAllReferences();
6176 for (
auto *
I : DeletedInstructions) {
6178 "trying to erase instruction with users.");
6179 I->eraseFromParent();
6185#ifdef EXPENSIVE_CHECKS
6196 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6197 "Expected non-empty mask.");
6200 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6202 Reuses[Mask[
I]] = Prev[
I];
6210 bool BottomOrder =
false) {
6211 assert(!Mask.empty() &&
"Expected non-empty mask.");
6212 unsigned Sz = Mask.size();
6215 if (Order.
empty()) {
6217 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6219 PrevOrder.
swap(Order);
6222 for (
unsigned I = 0;
I < Sz; ++
I)
6224 Order[
I] = PrevOrder[Mask[
I]];
6226 return Data.value() == Sz ||
Data.index() ==
Data.value();
6235 if (Order.
empty()) {
6237 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6247 for (
unsigned I = 0;
I < Sz; ++
I)
6249 Order[MaskOrder[
I]] =
I;
6253std::optional<BoUpSLP::OrdersType>
6255 bool TopToBottom,
bool IgnoreReorder) {
6256 assert(TE.isGather() &&
"Expected gather node only.");
6260 Type *ScalarTy = GatheredScalars.
front()->getType();
6261 size_t NumScalars = GatheredScalars.
size();
6263 return std::nullopt;
6270 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6272 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6275 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6276 return std::nullopt;
6277 OrdersType CurrentOrder(NumScalars, NumScalars);
6278 if (GatherShuffles.
size() == 1 &&
6280 Entries.
front().front()->isSame(TE.Scalars)) {
6284 return std::nullopt;
6286 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6287 TE.UserTreeIndex.UserTE)
6288 return std::nullopt;
6291 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6292 return std::nullopt;
6295 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6296 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6299 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6301 return std::nullopt;
6305 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6306 return CurrentOrder;
6310 return all_of(Mask, [&](
int I) {
6317 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6318 (Entries.
size() != 1 ||
6319 Entries.
front().front()->ReorderIndices.empty())) ||
6320 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6321 return std::nullopt;
6327 if (ShuffledSubMasks.
test(
I))
6329 const int VF = GetVF(
I);
6335 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6337 ShuffledSubMasks.
set(
I);
6341 int FirstMin = INT_MAX;
6342 int SecondVecFound =
false;
6344 int Idx = Mask[
I * PartSz + K];
6346 Value *V = GatheredScalars[
I * PartSz + K];
6348 SecondVecFound =
true;
6357 SecondVecFound =
true;
6361 FirstMin = (FirstMin / PartSz) * PartSz;
6363 if (SecondVecFound) {
6365 ShuffledSubMasks.
set(
I);
6369 int Idx = Mask[
I * PartSz + K];
6373 if (Idx >= PartSz) {
6374 SecondVecFound =
true;
6377 if (CurrentOrder[
I * PartSz + Idx] >
6378 static_cast<unsigned>(
I * PartSz + K) &&
6379 CurrentOrder[
I * PartSz + Idx] !=
6380 static_cast<unsigned>(
I * PartSz + Idx))
6381 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6384 if (SecondVecFound) {
6386 ShuffledSubMasks.
set(
I);
6392 if (!ExtractShuffles.
empty())
6393 TransformMaskToOrder(
6394 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6395 if (!ExtractShuffles[
I])
6398 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6400 int K =
I * PartSz + Idx;
6403 if (!TE.ReuseShuffleIndices.empty())
6404 K = TE.ReuseShuffleIndices[K];
6407 if (!TE.ReorderIndices.empty())
6408 K = std::distance(TE.ReorderIndices.begin(),
6409 find(TE.ReorderIndices, K));
6415 .getKnownMinValue());
6420 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6421 if (ShuffledSubMasks.
any())
6422 return std::nullopt;
6423 PartSz = NumScalars;
6426 if (!Entries.
empty())
6427 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6428 if (!GatherShuffles[
I])
6430 return std::max(Entries[
I].front()->getVectorFactor(),
6431 Entries[
I].back()->getVectorFactor());
6433 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6434 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6435 return std::nullopt;
6436 return std::move(CurrentOrder);
6441 bool CompareOpcodes =
true) {
6447 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6448 (!GEP2 || GEP2->getNumOperands() == 2) &&
6449 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6450 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6453 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6457template <
typename T>
6462 return CommonAlignment;
6468 "Order is empty. Please check it before using isReverseOrder.");
6469 unsigned Sz = Order.
size();
6471 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6482 const SCEV *PtrSCEVLowest =
nullptr;
6483 const SCEV *PtrSCEVHighest =
nullptr;
6491 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6492 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6499 PtrSCEVLowest = PtrSCEV;
6506 PtrSCEVHighest = PtrSCEV;
6514 int Size =
DL.getTypeStoreSize(ElemTy);
6515 auto TryGetStride = [&](
const SCEV *Dist,
6516 const SCEV *Multiplier) ->
const SCEV * {
6518 if (M->getOperand(0) == Multiplier)
6519 return M->getOperand(1);
6520 if (M->getOperand(1) == Multiplier)
6521 return M->getOperand(0);
6524 if (Multiplier == Dist)
6529 const SCEV *Stride =
nullptr;
6530 if (
Size != 1 || SCEVs.
size() > 2) {
6532 Stride = TryGetStride(Dist, Sz);
6540 using DistOrdPair = std::pair<int64_t, int>;
6542 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6544 bool IsConsecutive =
true;
6545 for (
const SCEV *PtrSCEV : SCEVs) {
6547 if (PtrSCEV != PtrSCEVLowest) {
6549 const SCEV *Coeff = TryGetStride(Diff, Stride);
6559 Dist = SC->getAPInt().getZExtValue();
6564 auto Res = Offsets.emplace(Dist, Cnt);
6568 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6571 if (Offsets.size() != SCEVs.
size())
6573 SortedIndices.
clear();
6574 if (!IsConsecutive) {
6578 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6579 SortedIndices[Cnt] = Pair.second;
6586static std::pair<InstructionCost, InstructionCost>
6589 Type *ScalarTy, VectorType *VecTy);
6607 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6610 Mask, NumSrcElts, NumSubElts, Index)) {
6611 if (Index + NumSubElts > NumSrcElts &&
6612 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6629 "ScalableVectorType is not supported.");
6632 "Incorrect usage.");
6637 unsigned ScalarTyNumElements = VecTy->getNumElements();
6640 if (!DemandedElts[
I])
6644 I * ScalarTyNumElements, VecTy);
6647 I * ScalarTyNumElements, VecTy);
6660 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6661 if (Opcode == Instruction::ExtractElement) {
6667 Index * VecTy->getNumElements(), VecTy);
6670 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6683 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6685 Index * ScalarTy->getNumElements(), SubTp) +
6689 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6705 auto *Begin = std::next(
Mask.begin(), Index);
6706 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6710 std::iota(
Mask.begin(),
Mask.end(), 0);
6711 std::iota(std::next(
Mask.begin(), Index),
6712 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6714 return Generator(Vec, V, Mask);
6717 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6725 unsigned SubVecVF,
unsigned Index) {
6727 std::iota(Mask.begin(), Mask.end(), Index);
6728 return Builder.CreateShuffleVector(Vec, Mask);
6738 const unsigned Sz = PointerOps.
size();
6741 CompressMask[0] = 0;
6743 std::optional<unsigned> Stride = 0;
6747 std::optional<int64_t> OptPos =
6749 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6751 unsigned Pos =
static_cast<unsigned>(*OptPos);
6752 CompressMask[
I] = Pos;
6759 if (Pos != *Stride *
I)
6762 return Stride.has_value();
6775 InterleaveFactor = 0;
6777 const size_t Sz = VL.
size();
6785 if (AreAllUsersVectorized(V))
6788 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6789 Mask.empty() ?
I : Mask[
I]);
6792 if (ExtractCost <= ScalarCost)
6797 if (Order.
empty()) {
6798 Ptr0 = PointerOps.
front();
6799 PtrN = PointerOps.
back();
6801 Ptr0 = PointerOps[Order.
front()];
6802 PtrN = PointerOps[Order.
back()];
6804 std::optional<int64_t> Diff =
6808 const size_t MaxRegSize =
6812 if (*Diff / Sz >= MaxRegSize / 8)
6816 Align CommonAlignment = LI->getAlign();
6818 Ptr0, LoadVecTy, CommonAlignment,
DL,
6821 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6822 LI->getPointerAddressSpace()))
6828 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6832 auto [ScalarGEPCost, VectorGEPCost] =
6834 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6852 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6853 LI->getPointerAddressSpace(),
CostKind);
6856 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6857 LI->getPointerAddressSpace(),
CostKind);
6859 if (IsStrided && !IsMasked && Order.
empty()) {
6866 AlignedLoadVecTy = LoadVecTy;
6867 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6869 LI->getPointerAddressSpace())) {
6871 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6872 Instruction::Load, AlignedLoadVecTy,
6873 CompressMask[1], {}, CommonAlignment,
6874 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6875 if (InterleavedCost < GatherCost) {
6876 InterleaveFactor = CompressMask[1];
6877 LoadVecTy = AlignedLoadVecTy;
6884 if (!Order.
empty()) {
6887 NewMask[
I] = CompressMask[Mask[
I]];
6889 CompressMask.
swap(NewMask);
6891 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6892 return TotalVecCost < GatherCost;
6905 unsigned InterleaveFactor;
6909 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6910 CompressMask, LoadVecTy);
6927 Align Alignment,
const int64_t Diff,
6928 const size_t Sz)
const {
6929 if (Diff % (Sz - 1) != 0)
6933 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
6935 return !isVectorized(U) && !MustGather.contains(U);
6939 const uint64_t AbsoluteDiff = std::abs(Diff);
6941 if (IsAnyPointerUsedOutGraph ||
6942 (AbsoluteDiff > Sz &&
6945 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6946 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
6947 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6948 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
6950 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6960 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
6961 const size_t Sz = PointerOps.
size();
6962 if (!
isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
6965 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6974 else if (
Ptr != Ptr0)
6978 if (((Dist / Stride) * Stride) != Dist || !Dists.
insert(Dist).second)
6981 if (Dists.
size() == Sz) {
6982 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
6983 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6993 StridedPtrInfo &SPtrInfo)
const {
6994 const unsigned Sz = PointerOps.
size();
6996 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6997 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6999 if (
const SCEV *Stride =
7002 SPtrInfo.StrideSCEV = Stride;
7011 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7024 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7030 const size_t Sz = VL.
size();
7032 auto *POIter = PointerOps.
begin();
7033 for (
Value *V : VL) {
7035 if (!L || !L->isSimple())
7037 *POIter = L->getPointerOperand();
7043 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7052 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7053 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7064 if (Order.
empty()) {
7065 Ptr0 = PointerOps.
front();
7066 PtrN = PointerOps.
back();
7068 Ptr0 = PointerOps[Order.
front()];
7069 PtrN = PointerOps[Order.
back()];
7071 std::optional<int64_t> Diff =
7074 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
7077 *TLI, [&](
Value *V) {
7078 return areAllUsersVectorized(
7086 *Diff, Ptr0, PtrN, SPtrInfo))
7089 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7090 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7095 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7097 bool ProfitableGatherPointers) {
7102 auto [ScalarGEPCost, VectorGEPCost] =
7104 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7108 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7110 if (
static_cast<unsigned>(
count_if(
7129 return C + TTI.getInstructionCost(
7135 TTI.getGatherScatterOpCost(
7137 false, CommonAlignment,
CostKind) +
7138 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7146 constexpr unsigned ListLimit = 4;
7147 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7156 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7166 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7171 PointerOps, SPtrInfo, BestVF,
7179 DemandedElts.
setBits(Cnt, Cnt + VF);
7195 if (!DemandedElts.
isZero()) {
7201 if (DemandedElts[Idx])
7212 LI0->getPointerOperand(),
7213 Instruction::GetElementPtr,
CostKind, ScalarTy,
7217 if (
static_cast<unsigned>(
7219 PointerOps.
size() - 1 ||
7238 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7239 LI0->getPointerAddressSpace(),
CostKind,
7244 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7245 LI0->getPointerOperand(),
7251 VecLdCost += TTI.getMaskedMemoryOpCost(
7252 Instruction::Load, SubVecTy, CommonAlignment,
7253 LI0->getPointerAddressSpace(),
CostKind) +
7259 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7260 LI0->getPointerOperand(),
7271 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7280 if (MaskedGatherCost >= VecLdCost &&
7293 bool ProfitableGatherPointers =
7294 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7295 return L->isLoopInvariant(V);
7297 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7300 (
GEP &&
GEP->getNumOperands() == 2 &&
7308 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7309 ProfitableGatherPointers))
7321 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7322 "Expected list of pointer operands.");
7327 std::pair<BasicBlock *, Value *>,
7331 .try_emplace(std::make_pair(
7335 SortedIndices.
clear();
7337 auto Key = std::make_pair(BBs[Cnt + 1],
7339 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7341 std::optional<int64_t> Diff =
7342 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7343 ElemTy, Ptr, DL, SE,
7348 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7354 if (Bases.size() > VL.
size() / 2 - 1)
7358 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7362 if (Bases.size() == VL.
size())
7365 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7366 Bases.front().second.size() == VL.
size()))
7371 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7380 FirstPointers.
insert(P1);
7381 SecondPointers.
insert(P2);
7387 "Unable to find matching root.");
7390 for (
auto &
Base : Bases) {
7391 for (
auto &Vec :
Base.second) {
7392 if (Vec.size() > 1) {
7394 int64_t InitialOffset = std::get<1>(Vec[0]);
7395 bool AnyConsecutive =
7397 return std::get<1>(
P.value()) ==
7398 int64_t(
P.index()) + InitialOffset;
7402 if (!AnyConsecutive)
7407 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7411 for (
auto &
T : Bases)
7412 for (
const auto &Vec :
T.second)
7413 for (
const auto &
P : Vec)
7417 "Expected SortedIndices to be the size of VL");
7421std::optional<BoUpSLP::OrdersType>
7423 assert(TE.isGather() &&
"Expected gather node only.");
7424 Type *ScalarTy = TE.Scalars[0]->getType();
7427 Ptrs.
reserve(TE.Scalars.size());
7429 BBs.
reserve(TE.Scalars.size());
7430 for (
Value *V : TE.Scalars) {
7432 if (!L || !L->isSimple())
7433 return std::nullopt;
7439 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7441 return std::move(Order);
7442 return std::nullopt;
7453 if (VU->
getType() != V->getType())
7456 if (!VU->
hasOneUse() && !V->hasOneUse())
7462 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7469 bool IsReusedIdx =
false;
7471 if (IE2 == VU && !IE1)
7473 if (IE1 == V && !IE2)
7474 return V->hasOneUse();
7475 if (IE1 && IE1 != V) {
7477 IsReusedIdx |= ReusedIdx.
test(Idx1);
7478 ReusedIdx.
set(Idx1);
7479 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7484 if (IE2 && IE2 != VU) {
7486 IsReusedIdx |= ReusedIdx.
test(Idx2);
7487 ReusedIdx.
set(Idx2);
7488 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7493 }
while (!IsReusedIdx && (IE1 || IE2));
7501 const TargetLibraryInfo &TLI);
7503std::optional<BoUpSLP::OrdersType>
7505 bool IgnoreReorder) {
7508 if (!TE.ReuseShuffleIndices.empty()) {
7510 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7511 "Reshuffling scalars not yet supported for nodes with padding");
7514 return std::nullopt;
7522 unsigned Sz = TE.Scalars.size();
7523 if (TE.isGather()) {
7524 if (std::optional<OrdersType> CurrentOrder =
7529 ::addMask(Mask, TE.ReuseShuffleIndices);
7530 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7531 unsigned Sz = TE.Scalars.size();
7532 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7535 Res[Idx + K * Sz] =
I + K * Sz;
7537 return std::move(Res);
7540 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7542 2 * TE.getVectorFactor())) == 1)
7543 return std::nullopt;
7544 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7545 return std::nullopt;
7549 if (TE.ReorderIndices.empty())
7550 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7553 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7554 unsigned VF = ReorderMask.
size();
7558 for (
unsigned I = 0;
I < VF;
I += Sz) {
7560 unsigned UndefCnt = 0;
7561 unsigned Limit = std::min(Sz, VF -
I);
7570 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7572 return std::nullopt;
7574 for (
unsigned K = 0; K < NumParts; ++K) {
7575 unsigned Idx = Val + Sz * K;
7576 if (Idx < VF &&
I + K < VF)
7577 ResOrder[Idx] =
I + K;
7580 return std::move(ResOrder);
7582 unsigned VF = TE.getVectorFactor();
7585 TE.ReuseShuffleIndices.end());
7586 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7588 if (isa<PoisonValue>(V))
7590 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7591 return Idx && *Idx < Sz;
7593 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7594 "by BinaryOperator and CastInst.");
7596 if (TE.ReorderIndices.empty())
7597 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7600 for (
unsigned I = 0;
I < VF; ++
I) {
7601 int &Idx = ReusedMask[
I];
7604 Value *V = TE.Scalars[ReorderMask[Idx]];
7606 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7612 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7613 auto *It = ResOrder.
begin();
7614 for (
unsigned K = 0; K < VF; K += Sz) {
7618 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7620 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7621 std::advance(It, Sz);
7624 return Data.index() ==
Data.value();
7626 return std::nullopt;
7627 return std::move(ResOrder);
7629 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7630 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7632 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7633 return std::nullopt;
7634 if (TE.State == TreeEntry::SplitVectorize ||
7635 ((TE.State == TreeEntry::Vectorize ||
7636 TE.State == TreeEntry::StridedVectorize ||
7637 TE.State == TreeEntry::CompressVectorize) &&
7640 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7641 "Alternate instructions are only supported by "
7642 "BinaryOperator and CastInst.");
7643 return TE.ReorderIndices;
7645 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7646 TE.isAltShuffle()) {
7647 assert(TE.ReuseShuffleIndices.empty() &&
7648 "ReuseShuffleIndices should be "
7649 "empty for alternate instructions.");
7651 TE.buildAltOpShuffleMask(
7653 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7654 "Unexpected main/alternate opcode");
7658 const int VF = TE.getVectorFactor();
7663 ResOrder[Mask[
I] % VF] =
I;
7665 return std::move(ResOrder);
7667 if (!TE.ReorderIndices.empty())
7668 return TE.ReorderIndices;
7669 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7670 if (!TE.ReorderIndices.empty())
7671 return TE.ReorderIndices;
7674 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7682 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7690 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7691 if (!DT->isReachableFromEntry(BB1))
7693 if (!DT->isReachableFromEntry(BB2))
7695 auto *NodeA = DT->getNode(BB1);
7696 auto *NodeB = DT->getNode(BB2);
7697 assert(NodeA &&
"Should only process reachable instructions");
7698 assert(NodeB &&
"Should only process reachable instructions");
7699 assert((NodeA == NodeB) ==
7700 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7701 "Different nodes should have different DFS numbers");
7702 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7704 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7705 Value *V1 = TE.Scalars[I1];
7706 Value *V2 = TE.Scalars[I2];
7719 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7720 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7721 FirstUserOfPhi2->getParent());
7731 if (UserBVHead[I1] && !UserBVHead[I2])
7733 if (!UserBVHead[I1])
7735 if (UserBVHead[I1] == UserBVHead[I2])
7738 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7740 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7753 if (EE1->getOperand(0) == EE2->getOperand(0))
7755 if (!Inst1 && Inst2)
7757 if (Inst1 && Inst2) {
7765 "Expected either instructions or arguments vector operands.");
7766 return P1->getArgNo() < P2->getArgNo();
7771 std::iota(Phis.
begin(), Phis.
end(), 0);
7774 return std::nullopt;
7775 return std::move(Phis);
7777 if (TE.isGather() &&
7778 (!TE.hasState() || !TE.isAltShuffle() ||
7779 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7783 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7787 auto *EE = dyn_cast<ExtractElementInst>(V);
7788 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7794 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7795 if (Reuse || !CurrentOrder.
empty())
7796 return std::move(CurrentOrder);
7804 int Sz = TE.Scalars.size();
7808 if (It == TE.Scalars.begin())
7811 if (It != TE.Scalars.end()) {
7813 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7828 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7831 return std::move(Order);
7836 return std::nullopt;
7837 if (TE.Scalars.size() >= 3)
7842 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7844 StridedPtrInfo SPtrInfo;
7847 CurrentOrder, PointerOps, SPtrInfo);
7850 return std::move(CurrentOrder);
7855 if (std::optional<OrdersType> CurrentOrder =
7857 return CurrentOrder;
7859 return std::nullopt;
7869 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7871 if (Cluster != FirstCluster)
7877void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask)
const {
7880 const unsigned Sz =
TE.Scalars.size();
7882 if (!
TE.isGather() ||
7887 SmallVector<int> NewMask;
7889 addMask(NewMask,
TE.ReuseShuffleIndices);
7891 TE.ReorderIndices.clear();
7893 ArrayRef<int> Slice =
ArrayRef(NewMask).slice(0, Sz);
7894 SmallVector<unsigned> NewOrder(Slice);
7898 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
7899 *End =
TE.ReuseShuffleIndices.end();
7900 It != End; std::advance(It, Sz))
7901 std::iota(It, std::next(It, Sz), 0);
7907 "Expected same size of orders");
7908 size_t Sz = Order.
size();
7911 if (Order[Idx] != Sz)
7912 UsedIndices.
set(Order[Idx]);
7914 if (SecondaryOrder.
empty()) {
7916 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7920 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7921 !UsedIndices.
test(SecondaryOrder[Idx]))
7922 Order[Idx] = SecondaryOrder[Idx];
7930 constexpr unsigned TinyVF = 2;
7931 constexpr unsigned TinyTree = 10;
7932 constexpr unsigned PhiOpsLimit = 12;
7933 constexpr unsigned GatherLoadsLimit = 2;
7934 if (VectorizableTree.size() <= TinyTree)
7936 if (VectorizableTree.front()->hasState() &&
7937 !VectorizableTree.front()->isGather() &&
7938 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7939 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7940 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7941 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7942 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7943 VectorizableTree.front()->ReorderIndices.empty()) {
7947 if (VectorizableTree.front()->hasState() &&
7948 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7949 VectorizableTree.front()->Scalars.size() == TinyVF &&
7950 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7953 if (VectorizableTree.front()->hasState() &&
7954 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7955 VectorizableTree.front()->ReorderIndices.empty()) {
7956 const unsigned ReorderedSplitsCnt =
7957 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7958 return TE->State == TreeEntry::SplitVectorize &&
7959 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7960 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7963 if (ReorderedSplitsCnt <= 1 &&
7965 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7966 return ((!TE->isGather() &&
7967 (TE->ReorderIndices.empty() ||
7968 (TE->UserTreeIndex.UserTE &&
7969 TE->UserTreeIndex.UserTE->State ==
7970 TreeEntry::Vectorize &&
7971 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7973 (TE->isGather() && TE->ReorderIndices.empty() &&
7974 (!TE->hasState() || TE->isAltShuffle() ||
7975 TE->getOpcode() == Instruction::Load ||
7976 TE->getOpcode() == Instruction::ZExt ||
7977 TE->getOpcode() == Instruction::SExt))) &&
7978 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7979 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
7980 return !isConstant(V) && isVectorized(V);
7982 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7985 bool HasPhis =
false;
7986 bool HasLoad =
true;
7987 unsigned GatherLoads = 0;
7988 for (
const std::unique_ptr<TreeEntry> &TE :
7989 ArrayRef(VectorizableTree).drop_front()) {
7990 if (TE->State == TreeEntry::SplitVectorize)
7992 if (!TE->hasState()) {
7996 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8001 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8002 if (!TE->isGather()) {
8009 if (GatherLoads >= GatherLoadsLimit)
8012 if (TE->getOpcode() == Instruction::GetElementPtr ||
8015 if (TE->getOpcode() != Instruction::PHI &&
8016 (!TE->hasCopyableElements() ||
8018 TE->Scalars.size() / 2))
8020 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8021 TE->getNumOperands() > PhiOpsLimit)
8030void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
8032 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
8035 std::iota(NewMask.
begin(), NewMask.
end(), 0);
8036 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8039 copy(MaskOrder, NewMaskOrder.begin());
8041 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
8042 unsigned Offset = CombinedEntriesWithIndices.
back().second;
8051 ReorderIndices.clear();
8070 ExternalUserReorderMap;
8074 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8075 const std::unique_ptr<TreeEntry> &TE) {
8078 findExternalStoreUsersReorderIndices(TE.get());
8079 if (!ExternalUserReorderIndices.
empty()) {
8080 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8082 std::move(ExternalUserReorderIndices));
8088 if (TE->hasState() && TE->isAltShuffle() &&
8089 TE->State != TreeEntry::SplitVectorize) {
8090 Type *ScalarTy = TE->Scalars[0]->getType();
8092 unsigned Opcode0 = TE->getOpcode();
8093 unsigned Opcode1 = TE->getAltOpcode();
8097 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8098 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8104 bool IgnoreReorder =
8105 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8106 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
8107 VectorizableTree.front()->getOpcode() == Instruction::Store);
8108 if (std::optional<OrdersType> CurrentOrder =
8118 const TreeEntry *UserTE = TE.get();
8120 if (!UserTE->UserTreeIndex)
8122 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8123 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8124 UserTE->UserTreeIndex.UserTE->Idx != 0)
8126 UserTE = UserTE->UserTreeIndex.UserTE;
8129 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8130 if (!(TE->State == TreeEntry::Vectorize ||
8131 TE->State == TreeEntry::StridedVectorize ||
8132 TE->State == TreeEntry::SplitVectorize ||
8133 TE->State == TreeEntry::CompressVectorize) ||
8134 !TE->ReuseShuffleIndices.empty())
8135 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8136 if (TE->State == TreeEntry::Vectorize &&
8137 TE->getOpcode() == Instruction::PHI)
8138 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8143 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8144 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8145 auto It = VFToOrderedEntries.
find(VF);
8146 if (It == VFToOrderedEntries.
end())
8160 for (
const TreeEntry *OpTE : OrderedEntries) {
8163 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8164 OpTE->State != TreeEntry::SplitVectorize)
8167 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8169 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8170 auto It = GathersToOrders.find(OpTE);
8171 if (It != GathersToOrders.end())
8174 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8175 auto It = AltShufflesToOrders.find(OpTE);
8176 if (It != AltShufflesToOrders.end())
8179 if (OpTE->State == TreeEntry::Vectorize &&
8180 OpTE->getOpcode() == Instruction::PHI) {
8181 auto It = PhisToOrders.
find(OpTE);
8182 if (It != PhisToOrders.
end())
8185 return OpTE->ReorderIndices;
8188 auto It = ExternalUserReorderMap.
find(OpTE);
8189 if (It != ExternalUserReorderMap.
end()) {
8190 const auto &ExternalUserReorderIndices = It->second;
8194 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8195 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8196 ExternalUserReorderIndices.size();
8198 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8199 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8206 if (OpTE->State == TreeEntry::Vectorize &&
8207 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8208 assert(!OpTE->isAltShuffle() &&
8209 "Alternate instructions are only supported by BinaryOperator "
8213 unsigned E = Order.
size();
8216 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8219 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8221 ++OrdersUses.try_emplace(Order, 0).first->second;
8224 if (OrdersUses.empty())
8227 unsigned IdentityCnt = 0;
8228 unsigned FilledIdentityCnt = 0;
8230 for (
auto &Pair : OrdersUses) {
8232 if (!Pair.first.empty())
8233 FilledIdentityCnt += Pair.second;
8234 IdentityCnt += Pair.second;
8239 unsigned Cnt = IdentityCnt;
8240 for (
auto &Pair : OrdersUses) {
8244 if (Cnt < Pair.second ||
8245 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8246 Cnt == Pair.second && !BestOrder.
empty() &&
8249 BestOrder = Pair.first;
8262 unsigned E = BestOrder.
size();
8264 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8267 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8269 if (TE->Scalars.size() != VF) {
8270 if (TE->ReuseShuffleIndices.size() == VF) {
8271 assert(TE->State != TreeEntry::SplitVectorize &&
8272 "Split vectorized not expected.");
8277 (!TE->UserTreeIndex ||
8278 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8279 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8280 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8281 "All users must be of VF size.");
8288 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8294 reorderNodeWithReuses(*TE, Mask);
8296 if (TE->UserTreeIndex &&
8297 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8298 TE->UserTreeIndex.UserTE->reorderSplitNode(
8299 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8303 if ((TE->State == TreeEntry::SplitVectorize &&
8304 TE->ReuseShuffleIndices.empty()) ||
8305 ((TE->State == TreeEntry::Vectorize ||
8306 TE->State == TreeEntry::StridedVectorize ||
8307 TE->State == TreeEntry::CompressVectorize) &&
8312 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8313 TE->ReuseShuffleIndices.empty())) &&
8314 "Alternate instructions are only supported by BinaryOperator "
8320 TE->reorderOperands(Mask);
8323 TE->reorderOperands(Mask);
8324 assert(TE->ReorderIndices.empty() &&
8325 "Expected empty reorder sequence.");
8328 if (!TE->ReuseShuffleIndices.empty()) {
8335 addMask(NewReuses, TE->ReuseShuffleIndices);
8336 TE->ReuseShuffleIndices.swap(NewReuses);
8337      } else if (TE->UserTreeIndex &&
8338 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8340 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8346void BoUpSLP::buildReorderableOperands(
8347    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8351    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8352          return OpData.first == I &&
8353                 (OpData.second->State == TreeEntry::Vectorize ||
8354                  OpData.second->State == TreeEntry::StridedVectorize ||
8355                  OpData.second->State == TreeEntry::CompressVectorize ||
8356                  OpData.second->State == TreeEntry::SplitVectorize);
8360 if (UserTE->hasState()) {
8361 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8362 UserTE->getOpcode() == Instruction::ExtractValue)
8364    if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8366    if (UserTE->getOpcode() == Instruction::Store &&
8367        UserTE->State == TreeEntry::Vectorize && I == 1)
8369 if (UserTE->getOpcode() == Instruction::Load &&
8370 (UserTE->State == TreeEntry::Vectorize ||
8371 UserTE->State == TreeEntry::StridedVectorize ||
8372 UserTE->State == TreeEntry::CompressVectorize))
8375    TreeEntry *TE = getOperandEntry(UserTE, I);
8376    assert(TE && "Expected operand entry.");
8377    if (!TE->isGather()) {
8380      Edges.emplace_back(I, TE);
8386    if (TE->State == TreeEntry::ScatterVectorize &&
8387        TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8391    if (ReorderableGathers.contains(TE))
8397  struct TreeEntryCompare {
8398    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8399      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8400        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8401      return LHS->Idx < RHS->Idx;
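// Reader's note (an interpretation of the comparator above, whose surrounding
// code is partially elided): entries compare by the index of their user entry
// when both have one, otherwise by their own index, so the priority queue
// groups operands of the same user together and tends to visit deeper nodes
// before their parents during bottom-to-top reordering.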
8410  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8411    if (TE->State != TreeEntry::Vectorize &&
8412        TE->State != TreeEntry::StridedVectorize &&
8413        TE->State != TreeEntry::CompressVectorize &&
8414        TE->State != TreeEntry::SplitVectorize)
8415      NonVectorized.insert(TE.get());
8416    if (std::optional<OrdersType> CurrentOrder =
8418      Queue.push(TE.get());
8419      if (!(TE->State == TreeEntry::Vectorize ||
8420            TE->State == TreeEntry::StridedVectorize ||
8421            TE->State == TreeEntry::CompressVectorize ||
8422            TE->State == TreeEntry::SplitVectorize) ||
8423          !TE->ReuseShuffleIndices.empty())
8424        GathersToOrders.insert(TE.get());
8433 while (!Queue.empty()) {
8435    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8436    TreeEntry *TE = Queue.top();
8437    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8440    while (!Queue.empty()) {
8442      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8447    for (TreeEntry *TE : OrderedOps) {
8448      if (!(TE->State == TreeEntry::Vectorize ||
8449            TE->State == TreeEntry::StridedVectorize ||
8450            TE->State == TreeEntry::CompressVectorize ||
8451            TE->State == TreeEntry::SplitVectorize ||
8452            (TE->isGather() && GathersToOrders.contains(TE))) ||
8453          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8454          !Visited.insert(TE).second)
8458      Users.first = TE->UserTreeIndex.UserTE;
8459      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8463    if (Data.first->State == TreeEntry::SplitVectorize) {
8465          Data.second.size() <= 2 &&
8466          "Expected not greater than 2 operands for split vectorize node.");
8468          [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8471      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8472             "Expected exactly 2 entries.");
8473      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8474        TreeEntry &OpTE = *VectorizableTree[P.first];
8476        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8477          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8479          const auto BestOrder =
8488        const unsigned E = Order.size();
8491          return I < E ? static_cast<int>(I) : PoisonMaskElem;
8493        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8495        if (!OpTE.ReorderIndices.empty()) {
8496          OpTE.ReorderIndices.clear();
8497        } else if (!OpTE.ReuseShuffleIndices.empty()) {
8500          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8504      if (Data.first->ReuseShuffleIndices.empty() &&
8505          !Data.first->ReorderIndices.empty()) {
8508        Queue.push(Data.first);
8514    buildReorderableOperands(Data.first, Data.second, NonVectorized,
8526    for (const auto &Op : Data.second) {
8527      TreeEntry *OpTE = Op.second;
8528      if (!VisitedOps.insert(OpTE).second)
8530      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8532      const auto Order = [&]() -> const OrdersType {
8533        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8537        return OpTE->ReorderIndices;
8541      if (Order.size() == 1)
8547      Value *Root = OpTE->hasState()
8550      auto GetSameNodesUsers = [&](Value *Root) {
8552        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8553          if (TE != OpTE && TE->UserTreeIndex &&
8554              TE->getVectorFactor() == OpTE->getVectorFactor() &&
8555              TE->Scalars.size() == OpTE->Scalars.size() &&
8556              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8557               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8558            Res.insert(TE->UserTreeIndex.UserTE);
8560        for (const TreeEntry *TE : getTreeEntries(Root)) {
8561          if (TE != OpTE && TE->UserTreeIndex &&
8562              TE->getVectorFactor() == OpTE->getVectorFactor() &&
8563              TE->Scalars.size() == OpTE->Scalars.size() &&
8564              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8565               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8566            Res.insert(TE->UserTreeIndex.UserTE);
8570      auto GetNumOperands = [](const TreeEntry *TE) {
8571 if (TE->State == TreeEntry::SplitVectorize)
8572 return TE->getNumOperands();
8574 return CI->arg_size();
8575 return TE->getNumOperands();
8577 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8578 const TreeEntry *TE) {
8586        const TreeEntry *Op = getOperandEntry(TE, Idx);
8587        if (Op->isGather() && Op->hasState()) {
8588          const TreeEntry *VecOp =
8589              getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8593        if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8600        if (!RevisitedOps.insert(UTE).second)
8602        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8603               !UTE->ReuseShuffleIndices.empty() ||
8604               (UTE->UserTreeIndex &&
8605                UTE->UserTreeIndex.UserTE == Data.first) ||
8606               (Data.first->UserTreeIndex &&
8607                Data.first->UserTreeIndex.UserTE == UTE) ||
8608 (IgnoreReorder && UTE->UserTreeIndex &&
8609 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8610 NodeShouldBeReorderedWithOperands(UTE);
8613      for (TreeEntry *UTE : Users) {
8621        const TreeEntry *Op = getOperandEntry(UTE, Idx);
8623        Queue.push(const_cast<TreeEntry *>(Op));
8628          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8629 return P.second == OpTE;
8632 if (OpTE->State == TreeEntry::Vectorize &&
8633          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8634        assert(!OpTE->isAltShuffle() &&
8635               "Alternate instructions are only supported by BinaryOperator "
8639        unsigned E = Order.size();
8642          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8645        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8647        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8649      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8650      const auto AllowsReordering = [&](const TreeEntry *TE) {
8651        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8652            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8653            (IgnoreReorder && TE->Idx == 0))
8655        if (TE->isGather()) {
8665      if (OpTE->UserTreeIndex) {
8666        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8667        if (!VisitedUsers.insert(UserTE).second)
8672        if (AllowsReordering(UserTE))
8680        if (static_cast<unsigned>(count_if(
8681                Ops, [UserTE, &AllowsReordering](
8682                         const std::pair<unsigned, TreeEntry *> &Op) {
8683                  return AllowsReordering(Op.second) &&
8684                         Op.second->UserTreeIndex.UserTE == UserTE;
8685                })) <= Ops.size() / 2)
8686 ++Res.first->second;
8689 if (OrdersUses.empty()) {
8694 unsigned IdentityCnt = 0;
8695      unsigned VF = Data.second.front().second->getVectorFactor();
8697      for (auto &Pair : OrdersUses) {
8699        IdentityCnt += Pair.second;
8704      unsigned Cnt = IdentityCnt;
8705      for (auto &Pair : OrdersUses) {
8709        if (Cnt < Pair.second) {
8711          BestOrder = Pair.first;
8728      unsigned E = BestOrder.size();
8730 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8732      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8733        TreeEntry *TE = Op.second;
8734        if (!VisitedOps.insert(TE).second)
8736        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8737          reorderNodeWithReuses(*TE, Mask);
8741        if (TE->State != TreeEntry::Vectorize &&
8742            TE->State != TreeEntry::StridedVectorize &&
8743            TE->State != TreeEntry::CompressVectorize &&
8744            TE->State != TreeEntry::SplitVectorize &&
8745            (TE->State != TreeEntry::ScatterVectorize ||
8746             TE->ReorderIndices.empty()))
8748        assert((BestOrder.size() == TE->ReorderIndices.size() ||
8749                TE->ReorderIndices.empty()) &&
8750               "Non-matching sizes of user/operand entries.");
8752        if (IgnoreReorder && TE == VectorizableTree.front().get())
8753          IgnoreReorder = false;
8756      for (TreeEntry *Gather : GatherOps) {
8758               "Unexpected reordering of gathers.");
8759        if (!Gather->ReuseShuffleIndices.empty()) {
8769      auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8770        return TE.isAltShuffle() &&
8771               (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8772                TE.ReorderIndices.empty());
8774      if (Data.first->State != TreeEntry::Vectorize ||
8776              Data.first->getMainOp()) ||
8777          IsNotProfitableAltCodeNode(*Data.first))
8778        Data.first->reorderOperands(Mask);
8780          IsNotProfitableAltCodeNode(*Data.first) ||
8781          Data.first->State == TreeEntry::StridedVectorize ||
8782          Data.first->State == TreeEntry::CompressVectorize) {
8786        if (Data.first->ReuseShuffleIndices.empty() &&
8787            !Data.first->ReorderIndices.empty() &&
8788            !IsNotProfitableAltCodeNode(*Data.first)) {
8791          Queue.push(Data.first);
8799  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8800      VectorizableTree.front()->ReuseShuffleIndices.empty())
8801    VectorizableTree.front()->ReorderIndices.clear();
8804Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8805  if (Entry.hasState() &&
8806      (Entry.getOpcode() == Instruction::Store ||
8807       Entry.getOpcode() == Instruction::Load) &&
8808      Entry.State == TreeEntry::StridedVectorize &&
8809      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
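// Hedged illustration of the reverse-order case above: for a strided entry
// whose ReorderIndices reverse the lanes, e.g. loads of a[3], a[2], a[1],
// a[0], the natural anchor for the strided access is the pointer of the last
// scalar rather than the first, which is presumably why this helper exists
// separately from TreeEntry::getMainOp().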
8816 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8820  for (auto &TEPtr : VectorizableTree) {
8821    TreeEntry *Entry = TEPtr.get();
8824    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8828    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8829      Value *Scalar = Entry->Scalars[Lane];
8834      auto It = ScalarToExtUses.find(Scalar);
8835      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8838      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8839        unsigned FoundLane = Entry->findLaneForValue(Scalar);
8840        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8841                          << " from " << *Scalar << "for many users.\n");
8842        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8843        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8844        ExternalUsesWithNonUsers.insert(Scalar);
8849      const auto ExtI = ExternallyUsedValues.find(Scalar);
8850      if (ExtI != ExternallyUsedValues.end()) {
8851        unsigned FoundLane = Entry->findLaneForValue(Scalar);
8852        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8853                          << FoundLane << " from " << *Scalar << ".\n");
8854        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8855        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8866        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8871            !UseEntries.empty()) {
8875          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8878                all_of(UseEntries, [&](TreeEntry *UseEntry) {
8879                  return UseEntry->State == TreeEntry::ScatterVectorize ||
8881                         Scalar, getRootEntryInstruction(*UseEntry), TLI,
8884            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8887                [](TreeEntry *UseEntry) {
8888                  return UseEntry->isGather();
8894        if (It != ScalarToExtUses.end()) {
8895          ExternalUses[It->second].User = nullptr;
8900        if (U && Scalar->hasNUsesOrMore(UsesLimit))
8902        unsigned FoundLane = Entry->findLaneForValue(Scalar);
8904                          << " from lane " << FoundLane << " from " << *Scalar
8906        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8907        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8908        ExternalUsesWithNonUsers.insert(Scalar);
8917BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8921  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8922    Value *V = TE->Scalars[Lane];
8935    if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8944    auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8945                                      SI->getValueOperand()->getType(), Ptr}];
8948    if (StoresVec.size() > Lane)
8950    if (!StoresVec.empty()) {
8952          SI->getValueOperand()->getType(), SI->getPointerOperand(),
8953          SI->getValueOperand()->getType(),
8954          StoresVec.front()->getPointerOperand(), *DL, *SE,
8960    StoresVec.push_back(SI);
8965  for (auto &P : PtrToStoresMap) {
8980  StoreInst *S0 = StoresVec[0];
8985    StoreInst *SI = StoresVec[Idx];
8986    std::optional<int64_t> Diff =
8988        SI->getPointerOperand(), *DL, *SE,
8994  if (StoreOffsetVec.size() != StoresVec.size())
8996  sort(StoreOffsetVec, llvm::less_first());
8998  int64_t PrevDist = 0;
8999  for (const auto &P : StoreOffsetVec) {
9000    if (Idx > 0 && P.first != PrevDist + 1)
9008  ReorderIndices.assign(StoresVec.size(), 0);
9009  bool IsIdentity = true;
9011    ReorderIndices[P.second] = I;
9012    IsIdentity &= P.second == I;
9018    ReorderIndices.clear();
9025    for (unsigned Idx : Order)
9026      dbgs() << Idx << ", ";
9032BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9033  unsigned NumLanes = TE->Scalars.size();
9046    if (StoresVec.size() != NumLanes)
9051    if (!canFormVector(StoresVec, ReorderIndices))
9056    ExternalReorderIndices.push_back(ReorderIndices);
9058  return ExternalReorderIndices;
9064  assert(TreeEntryToStridedPtrInfoMap.empty() &&
9065         "TreeEntryToStridedPtrInfoMap is not cleared");
9066  UserIgnoreList = &UserIgnoreLst;
9069  buildTreeRec(Roots, 0, EdgeInfo());
9074  assert(TreeEntryToStridedPtrInfoMap.empty() &&
9075         "TreeEntryToStridedPtrInfoMap is not cleared");
9078  buildTreeRec(Roots, 0, EdgeInfo());
9087                               bool AddNew = true) {
9095  for (Value *V : VL) {
9099    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9101    bool IsFound = false;
9102    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9103      assert(LI->getParent() == Data.front().first->getParent() &&
9104             LI->getType() == Data.front().first->getType() &&
9108             "Expected loads with the same type, same parent and same "
9109             "underlying pointer.");
9111          LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9112          Data.front().first->getPointerOperand(), DL, SE,
9116      auto It = Map.find(*Dist);
9117      if (It != Map.end() && It->second != LI)
9119      if (It == Map.end()) {
9120        Data.emplace_back(LI, *Dist);
9121        Map.try_emplace(*Dist, LI);
9131  auto FindMatchingLoads =
9136          int64_t &Offset, unsigned &Start) {
9138          return GatheredLoads.end();
9147        std::optional<int64_t> Dist =
9149            Data.front().first->getType(),
9150            Data.front().first->getPointerOperand(), DL, SE,
9156        for (std::pair<LoadInst *, int64_t> P : Data) {
9162        unsigned NumUniques = 0;
9163        for (auto [Cnt, Pair] : enumerate(Loads)) {
9164          bool Used = DataLoads.contains(Pair.first);
9165          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9169            Repeated.insert(Cnt);
9172        if (NumUniques > 0 &&
9173            (Loads.size() == NumUniques ||
9174             (Loads.size() - NumUniques >= 2 &&
9175              Loads.size() - NumUniques >= Loads.size() / 2 &&
9181          return std::next(GatheredLoads.begin(), Idx);
9185        return GatheredLoads.end();
9187  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9191    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9193    while (It != GatheredLoads.end()) {
9194      assert(!LocalToAdd.empty() && "Expected some elements to add.");
9195      for (unsigned Idx : LocalToAdd)
9198      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9202        return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9209        Loads.push_back(Data[Idx]);
9215        GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9216          return PD.front().first->getParent() == LI->getParent() &&
9217                 PD.front().first->getType() == LI->getType();
9219      while (It != GatheredLoads.end()) {
9222          std::next(It), GatheredLoads.end(),
9223          [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9224            return PD.front().first->getParent() == LI->getParent() &&
9225                   PD.front().first->getType() == LI->getType();
9229      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9230 AddNewLoads(GatheredLoads.emplace_back());
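// Sketch of the overall flow (an interpretation, since parts are elided):
// loads are first clustered by basic block, type and underlying pointer
// together with their constant distances; FindMatchingLoads then merges a
// cluster into an existing group when at least half of its elements are new
// to that group, so GatheredLoads accumulates maximal groups for
// tryToVectorizeGatheredLoads() to slice into candidate vector factors.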
9235void BoUpSLP::tryToVectorizeGatheredLoads(
9236    const SmallMapVector<
9237        std::tuple<BasicBlock *, Value *, Type *>,
9240  GatheredLoadsEntriesFirst = VectorizableTree.size();
9243      LoadEntriesToVectorize.size());
9244  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9245    Set.insert_range(VectorizableTree[Idx]->Scalars);
9248  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9249                       const std::pair<LoadInst *, int64_t> &L2) {
9250    return L1.second > L2.second;
9257    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9258    return TTI->isLegalMaskedGather(Ty, Alignment) &&
9259           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9264      SmallVectorImpl<LoadInst *> &NonVectorized,
9265      bool Final, unsigned MaxVF) {
9267    unsigned StartIdx = 0;
9268    SmallVector<int> CandidateVFs;
9272        *TTI, Loads.front()->getType(), MaxVF);
9274            *TTI, Loads.front()->getType(), NumElts - 1)) {
9280    if (Final && CandidateVFs.empty())
9283    unsigned BestVF = Final ? CandidateVFs.back() : 0;
9284    for (unsigned NumElts : CandidateVFs) {
9285      if (Final && NumElts > BestVF)
9287      SmallVector<unsigned> MaskedGatherVectorized;
9288      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9292        if (VectorizedLoads.count(Slice.front()) ||
9293            VectorizedLoads.count(Slice.back()) ||
9299        bool AllowToVectorize = false;
9302        bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9305        for (LoadInst *LI : Slice) {
9307          if (LI->hasOneUse())
9313          if (static_cast<unsigned int>(std::distance(
9314                  LI->user_begin(), LI->user_end())) != LI->getNumUses())
9316          if (!IsLegalBroadcastLoad)
9320          for (User *U : LI->users()) {
9323            for (const TreeEntry *UTE : getTreeEntries(U)) {
9324              for (int I : seq<int>(UTE->getNumOperands())) {
9326                return V == LI || isa<PoisonValue>(V);
9336        AllowToVectorize = CheckIfAllowed(Slice);
9340            any_of(ValueToGatherNodes.at(Slice.front()),
9341                   [=](const TreeEntry *TE) {
9342                     return TE->Scalars.size() == 2 &&
9343                            ((TE->Scalars.front() == Slice.front() &&
9344                              TE->Scalars.back() == Slice.back()) ||
9345                             (TE->Scalars.front() == Slice.back() &&
9346                              TE->Scalars.back() == Slice.front()));
9351        if (AllowToVectorize) {
9356              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9357          StridedPtrInfo SPtrInfo;
9359              PointerOps, SPtrInfo, &BestVF);
9361              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9363            if (MaskedGatherVectorized.empty() ||
9364                Cnt >= MaskedGatherVectorized.back() + NumElts)
9369            Results.emplace_back(Values, LS);
9370            VectorizedLoads.insert_range(Slice);
9373            if (Cnt == StartIdx)
9374              StartIdx += NumElts;
9377          if (StartIdx >= Loads.size())
9381          if (!MaskedGatherVectorized.empty() &&
9382              Cnt < MaskedGatherVectorized.back() + NumElts)
9388        if (!AllowToVectorize || BestVF == 0)
9392      for (unsigned Cnt : MaskedGatherVectorized) {
9394            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9398        VectorizedLoads.insert_range(Slice);
9400        if (Cnt == StartIdx)
9401          StartIdx += NumElts;
9404 for (LoadInst *LI : Loads) {
9405 if (!VectorizedLoads.contains(LI))
9406 NonVectorized.push_back(LI);
9410 auto ProcessGatheredLoads =
9413          bool Final = false) {
9415    for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9417      if (LoadsDists.size() <= 1) {
9418        NonVectorized.push_back(LoadsDists.back().first);
9426      unsigned MaxConsecutiveDistance = 0;
9427      unsigned CurrentConsecutiveDist = 1;
9428      int64_t LastDist = LocalLoadsDists.front().second;
9429      bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9430      for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9433        assert(LastDist >= L.second &&
9434               "Expected first distance always not less than second");
9435        if (static_cast<uint64_t>(LastDist - L.second) ==
9436            CurrentConsecutiveDist) {
9437          ++CurrentConsecutiveDist;
9438          MaxConsecutiveDistance =
9439              std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9443        if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9446        CurrentConsecutiveDist = 1;
9447        LastDist = L.second;
9450      if (Loads.size() <= 1)
9452      if (AllowMaskedGather)
9453        MaxConsecutiveDistance = Loads.size();
9454      else if (MaxConsecutiveDistance < 2)
9459          GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9460                              Final, MaxConsecutiveDistance);
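// Illustrative note (hedged): LocalLoadsDists arrives sorted by decreasing
// distance, so a block of consecutive memory shows up as distances stepping
// down by exactly one; MaxConsecutiveDistance records the longest such run
// and, unless masked gathers are legal, bounds the vector factors that
// GetVectorizedRanges() is allowed to try.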
9462          OriginalLoads.size() == Loads.size() &&
9463          MaxConsecutiveDistance == Loads.size() &&
9468        VectorizedLoads.clear();
9472            GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9473                                UnsortedNonVectorized, Final,
9474                                OriginalLoads.size());
9475        if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9476          SortedNonVectorized.swap(UnsortedNonVectorized);
9477          Results.swap(UnsortedResults);
9482                          << Slice.size() << ")\n");
9484        for (Value *L : Slice)
9492 unsigned MaxVF = Slice.size();
9493 unsigned UserMaxVF = 0;
9494 unsigned InterleaveFactor = 0;
9499 std::optional<unsigned> InterleavedLoadsDistance = 0;
9501 std::optional<unsigned> CommonVF = 0;
9502 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9503 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9504        for (auto [Idx, V] : enumerate(Slice)) {
9505          for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9506            UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9509            UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9511            if (*CommonVF == 0) {
9512              CommonVF = E->Scalars.size();
9515            if (*CommonVF != E->Scalars.size())
9519            if (Pos != Idx && InterleavedLoadsDistance) {
9522                  if (isa<Constant>(V))
9524                  if (isVectorized(V))
9526                  const auto &Nodes = ValueToGatherNodes.at(V);
9527                  return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9528                         !is_contained(Slice, V);
9530                InterleavedLoadsDistance.reset();
9534              if (*InterleavedLoadsDistance == 0) {
9535                InterleavedLoadsDistance = Idx - Pos;
9538              if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9539                  (Idx - Pos) / *InterleavedLoadsDistance < Order)
9540                InterleavedLoadsDistance.reset();
9541              Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9545        DeinterleavedNodes.clear();
9547        if (InterleavedLoadsDistance.value_or(0) > 1 &&
9548            CommonVF.value_or(0) != 0) {
9549          InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9550          unsigned VF = *CommonVF;
9553          StridedPtrInfo SPtrInfo;
9555          if (InterleaveFactor <= Slice.size() &&
9556              TTI.isLegalInterleavedAccessType(
9564            UserMaxVF = InterleaveFactor * VF;
9566            InterleaveFactor = 0;
9571        unsigned ConsecutiveNodesSize = 0;
9572        if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9573            any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9574                   [&, Slice = Slice](const auto &P) {
9576                       return std::get<1>(P).contains(V);
9578                     if (It == Slice.end())
9580                     const TreeEntry &TE =
9581                         *VectorizableTree[std::get<0>(P)];
9585                     StridedPtrInfo SPtrInfo;
9587                         VL, VL.front(), Order, PointerOps, SPtrInfo);
9591                     ConsecutiveNodesSize += VL.size();
9592                     size_t Start = std::distance(Slice.begin(), It);
9593                     size_t Sz = Slice.size() - Start;
9594                     return Sz < VL.size() ||
9595                            Slice.slice(Start, VL.size()) != VL;
9600        if (InterleaveFactor == 0 &&
9602            [&, Slice = Slice](unsigned Idx) {
9604 SmallVector<Value *> PointerOps;
9605 StridedPtrInfo SPtrInfo;
9606 return canVectorizeLoads(
9607 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9608 Slice[Idx * UserMaxVF], Order, PointerOps,
9609 SPtrInfo) == LoadsState::ScatterVectorize;
9612 if (Slice.size() != ConsecutiveNodesSize)
9613 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9615        for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9616          bool IsVectorized = true;
9617          for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9619                Slice.slice(I, std::min(VF, E - I));
9624            if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9625                       [&](const auto &P) {
9627                         VectorizableTree[std::get<0>(P)]
9632            unsigned Sz = VectorizableTree.size();
9633            buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9634            if (Sz == VectorizableTree.size()) {
9635              IsVectorized = false;
9638            if (InterleaveFactor > 0) {
9639              VF = 2 * (MaxVF / InterleaveFactor);
9640              InterleaveFactor = 0;
9649      NonVectorized.append(SortedNonVectorized);
9651 return NonVectorized;
9653  for (const auto &GLs : GatheredLoads) {
9654    const auto &Ref = GLs.second;
9656    if (!Ref.empty() && !NonVectorized.empty() &&
9658            Ref.begin(), Ref.end(), 0u,
9659            [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9660                -> unsigned { return S + LoadsDists.size(); }) !=
9661            NonVectorized.size() &&
9662        IsMaskedGatherSupported(NonVectorized)) {
9665      for (LoadInst *LI : NonVectorized) {
9673      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
9677  for (unsigned Idx : LoadEntriesToVectorize) {
9678    const TreeEntry &E = *VectorizableTree[Idx];
9681    if (!E.ReorderIndices.empty()) {
9684      SmallVector<int> ReorderMask;
9688    buildTreeRec(GatheredScalars, 0, EdgeInfo());
9692  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9693 VectorizableTree.size())
9694 GatheredLoadsEntriesFirst.reset();
9704 bool AllowAlternate) {
9727      isValidForAlternation(I->getOpcode())) {
9739    std::pair<size_t, size_t> OpVals =
9747      if (CI->isCommutative())
9769      SubKey = hash_value(Gep->getPointerOperand());
9781  return std::make_pair(Key, SubKey);
9787                            Instruction *AltOp, const TargetLibraryInfo &TLI);
9789bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9791  Type *ScalarTy = S.getMainOp()->getType();
9792  unsigned Opcode0 = S.getOpcode();
9793  unsigned Opcode1 = S.getAltOpcode();
9794  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9797                                     Opcode1, OpcodeMask))
9800  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9803    for (Value *V : VL) {
9805      Operands.back().push_back(
9812  if (Operands.size() == 2) {
9816      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9817      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9818      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9820      switch (Res.value_or(0)) {
9824        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9834 DenseSet<unsigned> UniqueOpcodes;
9835 constexpr unsigned NumAltInsts = 3;
9836 unsigned NonInstCnt = 0;
9839 unsigned UndefCnt = 0;
9841 unsigned ExtraShuffleInsts = 0;
9844  if (Operands.size() == 2) {
9846    if (Operands.front() == Operands.back()) {
9850          return is_contained(Operands.back(), V);
9853      ++ExtraShuffleInsts;
9856  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9868  DenseMap<Value *, unsigned> Uniques;
9878    if (!Res.second && Res.first->second == 1)
9879      ++ExtraShuffleInsts;
9880    ++Res.first->getSecond();
9882      UniqueOpcodes.insert(I->getOpcode());
9883    else if (Res.second)
9886  return none_of(Uniques, [&](const auto &P) {
9887    return P.first->hasNUsesOrMore(P.second + 1) &&
9888           none_of(P.first->users(), [&](User *U) {
9889             return isVectorized(U) || Uniques.contains(U);
9898         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9899          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9900 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
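// In outline (interpretation): this profitability check rejects an
// alternate-opcode node whose operands would mostly be gathered - it tallies
// unique operand opcodes, non-instruction operands, undefs and the extra
// shuffles implied by repeated values, and only accepts the node when that
// total stays below the number of scalar operands being replaced.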
9907                                  const unsigned VF, unsigned MinBW,
9930static std::pair<InstructionCost, InstructionCost>
9950 FMF = FPCI->getFastMathFlags();
9953 LibCost.isValid() ? LibCost : ScalarLimit);
9963BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9965    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9966    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9968         "Expected instructions with same/alternate opcodes only.");
9970  unsigned ShuffleOrOp =
9971      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9973 switch (ShuffleOrOp) {
9974 case Instruction::PHI: {
9977 return TreeEntry::NeedToGather;
9979    for (Value *V : VL) {
9983      for (Value *Incoming : PHI->incoming_values()) {
9985        if (Term && Term->isTerminator()) {
9987                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
9988 return TreeEntry::NeedToGather;
9993 return TreeEntry::Vectorize;
9995 case Instruction::ExtractElement:
10002 return TreeEntry::NeedToGather;
10004 case Instruction::ExtractValue: {
10005 bool Reuse = canReuseExtract(VL, CurrentOrder);
10009 return TreeEntry::NeedToGather;
10010 if (Reuse || !CurrentOrder.empty())
10011 return TreeEntry::Vectorize;
10013 return TreeEntry::NeedToGather;
10015 case Instruction::InsertElement: {
10019    for (Value *V : VL) {
10021        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10022        return TreeEntry::NeedToGather;
10026           "Non-constant or undef index?");
10030      return !SourceVectors.contains(V);
10033      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10034                           "different source vectors.\n");
10035      return TreeEntry::NeedToGather;
10040      return SourceVectors.contains(V) && !V->hasOneUse();
10043      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10044                           "multiple uses.\n");
10045      return TreeEntry::NeedToGather;
10048 return TreeEntry::Vectorize;
10050 case Instruction::Load: {
10057 auto IsGatheredNode = [&]() {
10058 if (!GatheredLoadsEntriesFirst)
10063      return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10064        return TE->Idx >= *GatheredLoadsEntriesFirst;
10070      return TreeEntry::Vectorize;
10072      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10074        LoadEntriesToVectorize.insert(VectorizableTree.size());
10075        return TreeEntry::NeedToGather;
10077      return IsGatheredNode() ? TreeEntry::NeedToGather
10078                              : TreeEntry::CompressVectorize;
10080      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10082        LoadEntriesToVectorize.insert(VectorizableTree.size());
10083        return TreeEntry::NeedToGather;
10085      return IsGatheredNode() ? TreeEntry::NeedToGather
10086                              : TreeEntry::ScatterVectorize;
10088      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10090        LoadEntriesToVectorize.insert(VectorizableTree.size());
10091        return TreeEntry::NeedToGather;
10093      return IsGatheredNode() ? TreeEntry::NeedToGather
10094                              : TreeEntry::StridedVectorize;
10098    if (DL->getTypeSizeInBits(ScalarTy) !=
10099        DL->getTypeAllocSizeInBits(ScalarTy))
10100      LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10103      return !LI || !LI->isSimple();
10107    LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10110 return TreeEntry::NeedToGather;
10114 case Instruction::ZExt:
10115 case Instruction::SExt:
10116 case Instruction::FPToUI:
10117 case Instruction::FPToSI:
10118 case Instruction::FPExt:
10119 case Instruction::PtrToInt:
10120 case Instruction::IntToPtr:
10121 case Instruction::SIToFP:
10122 case Instruction::UIToFP:
10123 case Instruction::Trunc:
10124 case Instruction::FPTrunc:
10125 case Instruction::BitCast: {
10127    for (Value *V : VL) {
10133          dbgs() << "SLP: Gathering casts with different src types.\n");
10134 return TreeEntry::NeedToGather;
10137 return TreeEntry::Vectorize;
10139 case Instruction::ICmp:
10140 case Instruction::FCmp: {
10145    for (Value *V : VL) {
10149      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10150          Cmp->getOperand(0)->getType() != ComparedTy) {
10151        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10152 return TreeEntry::NeedToGather;
10155 return TreeEntry::Vectorize;
10157 case Instruction::Select:
10158 case Instruction::FNeg:
10159 case Instruction::Add:
10160 case Instruction::FAdd:
10161 case Instruction::Sub:
10162 case Instruction::FSub:
10163 case Instruction::Mul:
10164 case Instruction::FMul:
10165 case Instruction::UDiv:
10166 case Instruction::SDiv:
10167 case Instruction::FDiv:
10168 case Instruction::URem:
10169 case Instruction::SRem:
10170 case Instruction::FRem:
10171 case Instruction::Shl:
10172 case Instruction::LShr:
10173 case Instruction::AShr:
10174 case Instruction::And:
10175 case Instruction::Or:
10176 case Instruction::Xor:
10177 case Instruction::Freeze:
10178 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10179        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10181          return I && I->isBinaryOp() && !I->isFast();
10183 return TreeEntry::NeedToGather;
10184 return TreeEntry::Vectorize;
10185 case Instruction::GetElementPtr: {
10187    for (Value *V : VL) {
10191      if (I->getNumOperands() != 2) {
10192        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10193        return TreeEntry::NeedToGather;
10200    for (Value *V : VL) {
10204      Type *CurTy = GEP->getSourceElementType();
10205      if (Ty0 != CurTy) {
10206        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10207        return TreeEntry::NeedToGather;
10213    for (Value *V : VL) {
10217      auto *Op = I->getOperand(1);
10219          (Op->getType() != Ty1 &&
10221           Op->getType()->getScalarSizeInBits() >
10222               DL->getIndexSizeInBits(
10223                   V->getType()->getPointerAddressSpace())))) {
10225            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10226        return TreeEntry::NeedToGather;
10230 return TreeEntry::Vectorize;
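// Hedged summary of the GEP checks above: every GEP in the bundle must have a
// single index, the same source element type as the first one, and an index
// that is either constant or no wider than the target's pointer index width;
// any violation routes the bundle to gather.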
10232 case Instruction::Store: {
10234    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10237    if (DL->getTypeSizeInBits(ScalarTy) !=
10238        DL->getTypeAllocSizeInBits(ScalarTy)) {
10239      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10240      return TreeEntry::NeedToGather;
10244    for (Value *V : VL) {
10246      if (!SI->isSimple()) {
10248        return TreeEntry::NeedToGather;
10257      if (CurrentOrder.empty()) {
10258        Ptr0 = PointerOps.front();
10259        PtrN = PointerOps.back();
10261        Ptr0 = PointerOps[CurrentOrder.front()];
10262        PtrN = PointerOps[CurrentOrder.back()];
10264      std::optional<int64_t> Dist =
10267      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10268 return TreeEntry::Vectorize;
10272 return TreeEntry::NeedToGather;
10274 case Instruction::Call: {
10275 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10276        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10278          return I && !I->isFast();
10280      return TreeEntry::NeedToGather;
10290    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10294      return TreeEntry::NeedToGather;
10297    unsigned NumArgs = CI->arg_size();
10299    for (unsigned J = 0; J != NumArgs; ++J)
10302    for (Value *V : VL) {
10307          VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10309        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10311        return TreeEntry::NeedToGather;
10315      for (unsigned J = 0; J != NumArgs; ++J) {
10318        if (ScalarArgs[J] != A1J) {
10320                     << "SLP: mismatched arguments in call:" << *CI
10321                     << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10322          return TreeEntry::NeedToGather;
10331        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10332                          << "!=" << *V << '\n');
10333        return TreeEntry::NeedToGather;
10338    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10340    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10341      return TreeEntry::NeedToGather;
10343 return TreeEntry::Vectorize;
10345 case Instruction::ShuffleVector: {
10346 if (!S.isAltShuffle()) {
10349 return TreeEntry::Vectorize;
10352      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10353      return TreeEntry::NeedToGather;
10358          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10359             "the whole alt sequence is not profitable.\n");
10360 return TreeEntry::NeedToGather;
10363 return TreeEntry::Vectorize;
10367 return TreeEntry::NeedToGather;
10376  PHINode *Main = nullptr;
10381  PHIHandler() = delete;
10383      : DT(DT), Main(Main), Phis(Phis),
10384        Operands(Main->getNumIncomingValues(),
10386  void buildOperands() {
10387    constexpr unsigned FastLimit = 4;
10396      for (auto [Idx, V] : enumerate(Phis)) {
10400               "Expected isa instruction or poison value.");
10401          Operands[I][Idx] = V;
10404        if (P->getIncomingBlock(I) == InBB)
10405          Operands[I][Idx] = P->getIncomingValue(I);
10407          Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10412    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10422    for (auto [Idx, V] : enumerate(Phis)) {
10425        Operands[I][Idx] = V;
10434        Operands[I][Idx] = P->getIncomingValue(I);
10437        auto *It = Blocks.find(InBB);
10438        if (It == Blocks.end())
10440        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10443    for (const auto &P : Blocks) {
10444      ArrayRef<unsigned> IncomingValues = P.second;
10445      if (IncomingValues.size() <= 1)
10448      for (unsigned I : IncomingValues) {
10450                     [&](const auto &Data) {
10451                       return !Data.value() ||
10452                              Data.value() == Operands[BasicI][Data.index()];
10454               "Expected empty operands list.");
10455        Operands[I] = Operands[BasicI];
10468static std::pair<Instruction *, Instruction *>
10472  for (Value *V : VL) {
10482    if (MainOp->getOpcode() == I->getOpcode()) {
10501 "Expected different main and alt instructions.");
10502 return std::make_pair(MainOp, AltOp);
10515                              const InstructionsState &S,
10517                              bool TryPad = false) {
10521  for (Value *V : VL) {
10537  size_t NumUniqueScalarValues = UniqueValues.size();
10540  if (NumUniqueScalarValues == VL.size() &&
10542    ReuseShuffleIndices.clear();
10547    if ((UserTreeIdx.UserTE &&
10548         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10550      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10551                           "for nodes with padding.\n");
10552      ReuseShuffleIndices.clear();
10557    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10561    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10562        S.getMainOp()->isSafeToRemove() &&
10563        (S.areInstructionsWithCopyableElements() ||
10567          TTI, UniqueValues.front()->getType(), UniqueValues.size());
10568      PWSz = std::min<unsigned>(PWSz, VL.size());
10569      if (PWSz == VL.size()) {
10573        ReuseShuffleIndices.clear();
10577                                               UniqueValues.end());
10578        PaddedUniqueValues.append(
10579            PWSz - UniqueValues.size(),
10583        if ((!S.areInstructionsWithCopyableElements() &&
10585            (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10586             (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10589          ReuseShuffleIndices.clear();
10592        VL = std::move(PaddedUniqueValues);
10597  ReuseShuffleIndices.clear();
10600  VL = std::move(UniqueValues);
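// Worked example (illustrative only): for VL = {a, b, a, b} the unique set is
// {a, b} with ReuseShuffleIndices = {0, 1, 0, 1}; with TryPad the node may
// instead be widened with padding up to a full register, trading the reshuffle
// for a simpler, wider node, subject to the (partially elided) conditions
// above.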
10605 const InstructionsState &LocalState,
10606 SmallVectorImpl<Value *> &Op1,
10607 SmallVectorImpl<Value *> &Op2,
10609 constexpr unsigned SmallNodeSize = 4;
10610  if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10615  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10617  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10618    if (E->isSame(VL)) {
10620                 << *LocalState.getMainOp() << ".\n");
10632  ReorderIndices.assign(VL.size(), VL.size());
10633  SmallBitVector Op1Indices(VL.size());
10638      Op1Indices.set(Idx);
10641    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10644        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10646                                 LocalState.getAltOp(), *TLI))) {
10648      Op1Indices.set(Idx);
10655  unsigned Opcode0 = LocalState.getOpcode();
10656  unsigned Opcode1 = LocalState.getAltOpcode();
10657  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10662  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10663      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10668  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10670    if (Op1Indices.test(Idx)) {
10671      ReorderIndices[Op1Cnt] = Idx;
10674      ReorderIndices[Op2Cnt] = Idx;
10679    ReorderIndices.clear();
10680  SmallVector<int> Mask;
10681  if (!ReorderIndices.empty())
10683  unsigned NumParts = TTI->getNumberOfParts(VecTy);
10688  if (NumParts >= VL.size())
10693  FixedVectorType *SubVecTy =
10697  if (!LocalState.isCmpOp() && NumParts <= 1 &&
10698      (Mask.empty() || InsertCost >= NewShuffleCost))
10700  if ((LocalState.getMainOp()->isBinaryOp() &&
10701       LocalState.getAltOp()->isBinaryOp() &&
10702       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10703        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10704      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10705      (LocalState.getMainOp()->isUnaryOp() &&
10706       LocalState.getAltOp()->isUnaryOp())) {
10708        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10709        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10714      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10718 VecTy, OriginalMask, Kind);
10720 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10721 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10723 NewVecOpsCost + InsertCost +
10724 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10725 VectorizableTree.front()->getOpcode() == Instruction::Store
10729 if (NewCost >= OriginalCost)
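// Cost-model sketch (interpretation): OriginalCost models the unsplit
// alternate node as cost(Opcode0, VecTy) + cost(Opcode1, VecTy) plus the
// blending shuffle described by OriginalMask, while NewCost models the two
// narrow ops on Op1VecTy/Op2VecTy plus the subvector inserts; the split form
// is kept only when NewCost beats OriginalCost.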
10739class InstructionsCompatibilityAnalysis {
10741  const DataLayout &DL;
10742  const TargetTransformInfo &TTI;
10743  const TargetLibraryInfo &TLI;
10744  unsigned MainOpcode = 0;
10749  static bool isSupportedOpcode(const unsigned Opcode) {
10750 return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10751 Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10752 Opcode == Instruction::UDiv || Opcode == Instruction::And ||
10753 Opcode == Instruction::Or || Opcode == Instruction::Xor;
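// Hedged example of a "copyable" element: in a bundle such as
//   %a = add i32 %x, %y
//   %b = shl i32 %z, 1   ; the odd one out
// %b can be modeled as "add %b, 0", i.e. copied through an idempotent add, so
// the whole bundle vectorizes under the single opcode Add. The list above
// appears to be exactly the opcodes for which such an identity operand exists
// (x+0, x>>0, x<<0, x/1, x&-1, x|0, x^0).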
10763  auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10764    if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10766    return I && isSupportedOpcode(I->getOpcode()) &&
10771  SmallDenseSet<Value *, 8> Operands;
10772  SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10773  bool AnyUndef = false;
10774  for (Value *V : VL) {
10782    if (Candidates.empty()) {
10783      Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10785      Operands.insert(I->op_begin(), I->op_end());
10788    if (Parent == I->getParent()) {
10789      Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10790      Operands.insert(I->op_begin(), I->op_end());
10793    auto *NodeA = DT.getNode(Parent);
10794    auto *NodeB = DT.getNode(I->getParent());
10795    assert(NodeA && "Should only process reachable instructions");
10796    assert(NodeB && "Should only process reachable instructions");
10797    assert((NodeA == NodeB) ==
10798               (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10799           "Different nodes should have different DFS numbers");
10800    if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10801      Candidates.clear();
10802      Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10805      Operands.insert(I->op_begin(), I->op_end());
10808  unsigned BestOpcodeNum = 0;
10810  for (const auto &P : Candidates) {
10811    if (P.second.size() < BestOpcodeNum)
10813    for (Instruction *I : P.second) {
10814      if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
10816        BestOpcodeNum = P.second.size();
10826    return I && I->getParent() == MainOp->getParent() &&
10839  Value *selectBestIdempotentValue() const {
10840    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10851    if (!S.isCopyableElement(V))
10853    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10854    return {V, selectBestIdempotentValue()};
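// Presumably selectBestIdempotentValue() yields the identity constant for
// MainOpcode (0 for add/or/xor/shifts, 1 for div, all-ones for and), so the
// returned pair {V, identity} rewrites a copyable scalar V as
// "V <MainOpcode> identity", which still computes V.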
10860                    SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10862    unsigned ShuffleOrOp =
10863        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10866    switch (ShuffleOrOp) {
10867    case Instruction::PHI: {
10871      PHIHandler Handler(DT, PH, VL);
10872      Handler.buildOperands();
10873      Operands.assign(PH->getNumOperands(), {});
10875        Operands[I].assign(Handler.getOperands(I).begin(),
10876                           Handler.getOperands(I).end());
10879    case Instruction::ExtractValue:
10880    case Instruction::ExtractElement:
10885    case Instruction::InsertElement:
10893    case Instruction::Load:
10897      for (auto [V, Op] : zip(VL, Operands.back())) {
10901 Op = LI->getPointerOperand();
10904 case Instruction::ZExt:
10905 case Instruction::SExt:
10906 case Instruction::FPToUI:
10907 case Instruction::FPToSI:
10908 case Instruction::FPExt:
10909 case Instruction::PtrToInt:
10910 case Instruction::IntToPtr:
10911 case Instruction::SIToFP:
10912 case Instruction::UIToFP:
10913 case Instruction::Trunc:
10914 case Instruction::FPTrunc:
10915 case Instruction::BitCast:
10916 case Instruction::ICmp:
10917 case Instruction::FCmp:
10918 case Instruction::Select:
10919 case Instruction::FNeg:
10920 case Instruction::Add:
10921 case Instruction::FAdd:
10922 case Instruction::Sub:
10923 case Instruction::FSub:
10924 case Instruction::Mul:
10925 case Instruction::FMul:
10926 case Instruction::UDiv:
10927 case Instruction::SDiv:
10928 case Instruction::FDiv:
10929 case Instruction::URem:
10930 case Instruction::SRem:
10931 case Instruction::FRem:
10932 case Instruction::Shl:
10933 case Instruction::LShr:
10934 case Instruction::AShr:
10935 case Instruction::And:
10936 case Instruction::Or:
10937 case Instruction::Xor:
10938 case Instruction::Freeze:
10939 case Instruction::Store:
10940 case Instruction::ShuffleVector:
10949      auto [Op, ConvertedOps] = convertTo(I, S);
10954 case Instruction::GetElementPtr: {
10961 const unsigned IndexIdx = 1;
10967        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10971                   ->getPointerOperandType()
10972                   ->getScalarType());
10976          Operands[0][Idx] = V;
10977          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10980        Operands[0][Idx] = GEP->getPointerOperand();
10981        auto *Op = GEP->getOperand(IndexIdx);
10984                             CI, Ty, CI->getValue().isSignBitSet(), DL)
10989    case Instruction::Call: {
10996      for (Value *V : VL) {
10998        Ops.push_back(I ? I->getOperand(Idx)
11011  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11012                                    const TargetTransformInfo &TTI,
11013                                    const TargetLibraryInfo &TLI)
11018                                          bool TryCopyableElementsVectorization,
11019                                          bool WithProfitabilityCheck = false,
11020                                          bool SkipSameCodeCheck = false) {
11021    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11022                              ? InstructionsState::invalid()
11028    findAndSetMainInstruction(VL, R);
11030      return InstructionsState::invalid();
11031    S = InstructionsState(MainOp, MainOp, true);
11032 if (!WithProfitabilityCheck)
11036    auto BuildCandidates =
11037        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11043          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11044              I1->getParent() != I2->getParent())
11048    if (VL.size() == 2) {
11051      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11052      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11053      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11054                 R.findBestRootPair(Candidates1) &&
11055                 R.findBestRootPair(Candidates2);
11057        Candidates1.clear();
11058        Candidates2.clear();
11059        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11060        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11061        Res = !Candidates1.empty() && !Candidates2.empty() &&
11062              R.findBestRootPair(Candidates1) &&
11063              R.findBestRootPair(Candidates2);
11066 return InstructionsState::invalid();
11070 FixedVectorType *VecTy =
11072 switch (MainOpcode) {
11073 case Instruction::Add:
11074 case Instruction::LShr:
11075 case Instruction::Shl:
11076 case Instruction::SDiv:
11077 case Instruction::UDiv:
11078 case Instruction::And:
11079 case Instruction::Or:
11080 case Instruction::Xor:
11086 if (VectorCost > ScalarCost)
11087 return InstructionsState::invalid();
11090    assert(Operands.size() == 2 && "Unexpected number of operands!");
11091    unsigned CopyableNum =
11092        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11093    if (CopyableNum < VL.size() / 2)
11096    const unsigned Limit = VL.size() / 24;
11097    if ((CopyableNum >= VL.size() - Limit ||
11098         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11103      return InstructionsState::invalid();
11107    for (auto &Ops : Operands) {
11122 return InstructionsState::invalid();
11128 constexpr unsigned Limit = 4;
11129 if (Operands.front().size() >= Limit) {
11130      SmallDenseMap<const Value *, unsigned> Counters;
11138        return C.second == 1;
11144      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11145      InstructionsState OpS = Analysis.buildInstructionsState(
11147      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11149      unsigned CopyableNum =
11151      return CopyableNum <= VL.size() / 2;
11153    if (!CheckOperand(Operands.front()))
11154      return InstructionsState::invalid();
11161    assert(S && "Invalid state!");
11163    if (S.areInstructionsWithCopyableElements()) {
11164      MainOp = S.getMainOp();
11165      MainOpcode = S.getOpcode();
11170      for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11171 Operands[OperandIdx][Idx] = Operand;
11174 buildOriginalOperands(S, VL, Operands);
11181BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11183    bool TryCopyableElementsVectorization) const {
11186  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11187  InstructionsState S = Analysis.buildInstructionsState(
11188      VL, *this, TryCopyableElementsVectorization,
11189      true, TryCopyableElementsVectorization);
11197    return ScalarsVectorizationLegality(S, false,
11203  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11204  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11205    if (E->isSame(VL)) {
11206      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11208      return ScalarsVectorizationLegality(S, false);
11213      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11214       LI->getLoopFor(S.getMainOp()->getParent()) &&
11218    return ScalarsVectorizationLegality(S, false);
11227      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11234    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11235    return ScalarsVectorizationLegality(S, false);
11239  if (S && S.getOpcode() == Instruction::ExtractElement &&
11242    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11243    return ScalarsVectorizationLegality(S, false);
11250    return ScalarsVectorizationLegality(S, false,
11260    if (!S || !S.isAltShuffle() || VL.size() > 2)
11268    SmallVector<unsigned, 8> InstsCount;
11269    for (Value *V : VL) {
11272      return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11275    bool IsCommutative =
11277    if ((IsCommutative &&
11278         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11280        all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11282    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11286    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11288                             I2->getOperand(Op));
11289    if (static_cast<unsigned>(count_if(
11290            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11292            })) >= S.getMainOp()->getNumOperands() / 2)
11294    if (S.getMainOp()->getNumOperands() > 2)
11296    if (IsCommutative) {
11298      Candidates.clear();
11299      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11301                               I2->getOperand((Op + 1) % E));
11303          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11310 SmallVector<unsigned> SortedIndices;
11312 bool IsScatterVectorizeUserTE =
11313 UserTreeIdx.UserTE &&
11314 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11315 bool AreAllSameBlock = S.valid();
11316 bool AreScatterAllGEPSameBlock =
11329 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11331 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11337 NotProfitableForVectorization(VL)) {
11339    LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11340                         "C,S,B,O, small shuffle. \n";
11344    return ScalarsVectorizationLegality(S, false,
11348    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11352    return ScalarsVectorizationLegality(S, false);
11356 if (S && !EphValues.empty()) {
11357    for (Value *V : VL) {
11358      if (EphValues.count(V)) {
11360                   << ") is ephemeral.\n");
11362        return ScalarsVectorizationLegality(S, false,
11374  if (S && S.isAltShuffle()) {
11375    auto GetNumVectorizedExtracted = [&]() {
11381          all_of(I->operands(), [&](const Use &U) {
11382            return isa<ExtractElementInst>(U.get());
11387      else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11390      return std::make_pair(Vectorized, Extracted);
11392    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11394    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11395    if (!Vectorized.isAllOnes() && !PreferScalarize) {
11398      Type *ScalarTy = VL.front()->getType();
11403          false, true, Kind);
11405          *TTI, ScalarTy, VecTy, Vectorized,
11406          true, false, Kind, false);
11407      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11409    if (PreferScalarize) {
11410      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11411                           "node is not profitable.\n");
11412      return ScalarsVectorizationLegality(S, false);
11417 if (UserIgnoreList && !UserIgnoreList->empty()) {
11418    for (Value *V : VL) {
11419      if (UserIgnoreList->contains(V)) {
11420        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11421        return ScalarsVectorizationLegality(S, false);
11428  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11429    assert(VL.front()->getType()->isPointerTy() &&
11431           "Expected pointers only.");
11434    assert(It != VL.end() && "Expected at least one GEP.");
11445        !DT->isReachableFromEntry(BB))) {
11451    return ScalarsVectorizationLegality(S, false);
11453  return ScalarsVectorizationLegality(S, true);
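// Summary (interpretation): legality here is a cheap pre-filter run before
// the heavier getScalarsVectorizationState() - perfect diamond merges,
// ephemeral values, excessive recursion depth, scalable-vector extracts,
// unprofitable alternate nodes and user-ignored scalars all route the bundle
// to gather early.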
11458 unsigned InterleaveFactor) {
11461 SmallVector<int> ReuseShuffleIndices;
11465  auto TrySplitNode = [&](const InstructionsState &LocalState) {
11468    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11471    auto Invalid = ScheduleBundle::invalid();
11472    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11473                            UserTreeIdx, {}, ReorderIndices);
11478            getSameValuesTreeEntry(S.getMainOp(), Op, true))) {
11480        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11481                                                    Idx == 0 ? 0 : Op1.size());
11482        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11484      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11485                                                  Idx == 0 ? 0 : Op1.size());
11495  bool AreConsts = false;
11496  for (Value *V : VL) {
11508  if (AreOnlyConstsWithPHIs(VL)) {
11509    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11510    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11514  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11515      VL, Depth, UserTreeIdx, false);
11516  InstructionsState S = Legality.getInstructionsState();
11517  if (!Legality.isLegal()) {
11518    if (Legality.trySplitVectorize()) {
11521      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11525    Legality = getScalarsVectorizationLegality(
11526        VL, Depth, UserTreeIdx, true);
11527 if (!Legality.isLegal()) {
11528 if (Legality.tryToFindDuplicates())
11532 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11535 S = Legality.getInstructionsState();
11539 if (S.isAltShuffle() && TrySplitNode(S))
11545 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
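// Past this point the bundle passed the generic legality checks. The
// remaining analysis is opcode-specific: compute the concrete entry state
// (Vectorize/Strided/Scatter/Compress/...), then try to schedule the bundle
// within its block.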
  // Perform the opcode-specific legality analysis to pick the entry state.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }
  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance: try to vectorize as an alternate (split) node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      /* ... */;
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone building operand nodes of PHIs: they may refer back to
    // entries created later in this loop.
    SmallVector<unsigned> PHIOps;
    // ...
    if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
      /* ... build the node immediately ... */;
    // ...
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
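// The switch below creates the TreeEntry for each supported opcode and
// recurses into the operands via buildTreeRec/CreateOperandNodes.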
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n");
    // ...
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 /*ReuseShuffleIndices=*/{}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n");
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
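// Loads are classified into four vectorization states. For example
// (illustrative IR, not taken from this file):
//   %a0 = load float, ptr %p
//   %a1 = load float, ptr %p1   ; %p1 = getelementptr float, ptr %p, i64 1
// Consecutive pointers yield TreeEntry::Vectorize (one wide load);
// consecutive-but-permuted pointers keep a CurrentOrder mask (the "jumbled"
// case); constant-strided pointers yield StridedVectorize; and arbitrary
// pointers yield ScatterVectorize (a masked gather).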
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n");
      else
        LLVM_DEBUG(dbgs()
                   << "SLP: added a new TreeEntry (jumbled LoadInst).\n");
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with a masked load + compress.
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n");
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n");
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with a masked gather.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n");
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ... (reorder the pointer operand per CurrentOrder)
    }
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
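// For casts, the pair CastMaxMinBWSizes tracks the widest and narrowest
// integer widths seen across ZExt/SExt/Trunc nodes; together with
// ExtraBitWidthNodes it seeds the minimum-bitwidth analysis that later
// narrows vector element types.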
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n");
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits = /* ... sign bits of the source operand ... */;
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    assert(/* ... */ && "Commutative Predicate mismatch");
    // ...
    Operands.back() = Ops.getVL(1);
    // For compares with a swapped predicate, swap the operands back so both
    // sides line up.
    for (Value *V : VL) {
      auto *Cmp = cast<CmpInst>(V);
      if (Cmp->getPredicate() != P0)
        /* ... swap this compare's operand pair ... */;
    }
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 = /* ... */;
      if (NumSignBits0 * 2 >= /* ... bitwidth of operand 0 ... */)
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 = /* ... */;
      if (NumSignBits1 * 2 >= /* ... bitwidth of operand 1 ... */)
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
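// Selects, unary and binary operators are handled uniformly below: create
// the entry, optionally reorder commutative operands for better matching,
// and recurse into each operand.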
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(SelectInst/UnaryOperator/BinaryOperator/"
                         "FreezeInst).\n");
    if (/* ... commutative operation ... */) {
      // Reorder operands for better matching between the lanes.
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n");
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Operands.size()))
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n");
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n");
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n");
    if (/* ... commutative intrinsic ... */) {
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (/* ... each vectorizable call argument ... */)
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n");
    } else {
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n");
    }
    // Alternate compares: swap operands of the compares with mismatching
    // predicates.
    if (/* ... alternate CmpInst node ... */) {
      assert(/* ... */ && "Expected different main/alternate predicates.");
      // ...
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
      return;
    }
    if (/* ... commutative operation ... */) {
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  }
}
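// canMapToVector (below) flattens a homogeneous aggregate to a scalar
// element count, e.g. [4 x <2 x float>] maps to N = 8 elements of float;
// a heterogeneous struct returns 0.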
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
  if (!isValidElementType(EltTy))
    return 0;
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign E initially so we can detect an extract index that was used twice;
  // a used slot always holds a value < E.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
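// buildAltOpShuffleMask (below) builds the blend mask for an
// alternate-opcode node: lane I gets Idx for a main-opcode scalar and
// Sz + Idx for an alternate-opcode scalar. For a 4-wide add/sub node the
// mask is {0, 5, 2, 7}: even lanes come from the vector add, odd lanes from
// the vector sub.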
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
  // ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    // For compares, classify by (possibly swapped) predicate.
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP =
        cast<CmpInst>(AltOp)->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    CmpInst::Predicate P = cast<CmpInst>(I)->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swapped forms.");
    return MainP != P && MainP != SwappedP;
  }
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}

TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  const auto *Op0 = Ops.front();
  const bool IsConstant = /* ... all operands constant ... */;
  const bool IsUniform = /* ... all operands equal to Op0 ... */;
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    return CI->getValue().isPowerOf2();
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    return CI->getValue().isNegatedPowerOf2();
  });
  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
  // ...
}
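// BaseShuffleAnalysis is shared by the cost estimator and the code emitter:
// it canonicalizes shuffle masks, looks through chains of shufflevector
// instructions, and combines consecutive masks into one.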
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. Returns the vectorization factor
  /// in units of ScalarTy (which may itself be a vector under REVEC).
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    // All VF-sized submasks are identity (e.g.
    // <poison,poison,0,1,2,poison> for VF 4).
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                 ShuffleVectorInst::isIdentityMask(Slice, VF);
        }))
      return true;
    return false;
  }

  /// Combines two shuffle masks: ExtMask is applied to the result of Mask, so
  /// the composition is NewMask[I] = Mask[ExtMask[I]].
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
  /// Looks through shuffles, trying to reduce the final number of shuffles in
  /// the code. Updates V (the peeked-through operand) and Mask in place.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Remember the best identity candidate seen so far.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             /* ... */ IdentityMask.size())) {
          IdentityOp = SV;
          IdentityMask.assign(Mask);
        }
      }
      // Remember a broadcast candidate.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        /* ... */) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (/* ... identity ... */ ||
                (Shuffle &&
                 Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
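// createShuffle below performs the smart shuffle emission: both inputs are
// first peeked through their own shuffle chains, the two per-operand masks
// are combined via combineMasks (the composition
// NewMask[I] = Mask[ExtMask[I]]), and the result degrades to a free
// identity shuffle when possible.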
  /// Smart shuffle instruction emission: walks through the chains of shuffles
  /// feeding both inputs to reduce the final number of shuffle instructions.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ... (REVEC: widen mask indices per scalar vector element)
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 /* ... and V2 is actually used by the mask ... */) {
      Value *Op1 = V1, *Op2 = V2;
      // Split the combined mask into the two per-operand masks.
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1, *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through their
        // operands again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                /* ... second operands unused ... */) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (/* ... the combined mask is an identity ... */)
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(SmallVectorImpl<int> &CommonMask,
                                        ArrayRef<int> Mask) {
    // ...
  }
};
/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // The pointers feed a wide unit-stride load/store: in vector code only
    // the base pointer (and any GEPs with outside uses) survive.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code, there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // The pointers feed a masked gather; all scalar GEPs are replaced by one
    // vector GEP.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
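// reorderGatherNode (below) reorders the scalars of a gather node so that
// values from the same "cluster" (e.g. loads from the same base pointer)
// become adjacent, which can expose extra vectorizable subvectors; the
// reordering is kept only if its shuffle cost beats a plain build vector.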
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if the node is small (just 2 elements), all scalars
  // already have the same opcode, or if a same node already exists.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      /* ... */)
    return;
  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                            LI->getType(), LI->getPointerOperand(), *DL, *SE,
                            /*StrictCheck=*/true))
          return hash_value(RLI->getPointerOperand());
      }
      for (LoadInst *RLI : LIt->second) {
        if (arePointersCompatible(RLI->getPointerOperand(),
                                  LI->getPointerOperand(), *TLI)) {
          // ...
        }
      }
      if (LIt->second.size() > 2) {
        hash_code SubKey =
            hash_value(LIt->second.back()->getPointerOperand());
        // ...
      }
    }
    // ...
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra
  // vectorized nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    // ... (compute the (Key, Idx) cluster for instruction V)
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        /* ... */
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (/* ... the cluster is an instruction run ... */) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          // ...
        }
      }
    }
  }
  // If the node has reused scalars or no reordering was built - skip it.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Compare the cost of the reordered gather against a plain build vector.
  auto *ScalarTy = TE.Scalars.front()->getType();
  InstructionCost Cost = 0;
  for (auto [Idx, Sz] : SubVectors) {
    // ... (cost of inserting each clustered subvector)
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz))
    if (isa<PoisonValue>(TE.Scalars[I]))
      ReorderMask[I] = I + TE.ReorderIndices.size();
  Cost += ::getShuffleCost(
      *TTI,
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
          ? TTI::SK_PermuteTwoSrc
          : TTI::SK_PermuteSingleSrc,
      VecTy, ReorderMask);
  for (unsigned I : seq<unsigned>(Sz)) {
    if (/* ... element comes from a subvector ... */)
      DemandedElts.clearBit(I);
    if (/* ... element keeps its position ... */)
      ReorderMask[I] = I;
    else
      ReorderMask[I] = I + Sz;
  }
  InstructionCost BVCost = /* ... build-vector cost ... */;
  if (!DemandedElts.isAllOnes())
    BVCost += /* ... extra permute cost ... */;
  if (Cost >= BVCost) {
    // Reordering is not profitable - undo it.
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
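// The helper below checks whether a bundle of fadd/fsub with fmul operands
// can be contracted into FMA: all involved FP operations must carry the
// contract fast-math flag, and the separate fmul + fadd cost is compared
// against the fused cost.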
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       /* ... analysis context ... */) {
  assert(all_of(VL, [](Value *V) {
           return V->getType()->getScalarType()->isFloatingPointTy();
         }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // The fmul operands also must be contractable.
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the cost of separate fmul + fadd vs. fused fma.
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      if (auto *OpI = dyn_cast<Instruction>(Op))
        FMACost += TTI.getInstructionCost(OpI, CostKind);
      continue;
    }
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  // ...
}
void BoUpSLP::transformNodes() {
  // Cache the tree size before any new gather nodes are appended below.
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, while the tree is modified.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);

  // Operands are profitable if they are constants, splats, or yield a good
  // look-ahead score.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          // ... (constant/splat check)
                        }) ||
                 /* ... look-ahead score check ... */;
        });
  };
  // Reorder gather nodes to improve potential vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // If there are many small gathered loads, prefer gathering.
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) > /* ... */;

  // Checks whether the scalars of a node are (re)used in other nodes.
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              auto CheckContainer) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             // ...
           });
  };
  // Checks if the tree already contains (split) vector nodes with the same
  // scalars.
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      // ...
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() &&
                       any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };
  // Try to partially vectorize wide gather nodes by carving out vectorizable
  // slices.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            /* ... */))
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of another vector node.
      if (CheckForSameVectorNodes(E))
        continue;
      // Scan VL in slices of decreasing full-register VF.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          if (/* ... */ !getSameValuesTreeEntry(Slice.front(), Slice,
                                                /*SameVF=*/true))
            continue;
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              /* ... */) {
            // Cheap scalars and extractelements are not worth a slice.
            if ((S.getOpcode() == Instruction::Load &&
                 /* ... */) ||
                (S.getOpcode() != Instruction::Load &&
                 /* ... */))
              continue;
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
                    /* ... */)
              continue;
            if (S.getOpcode() == Instruction::Load) {
              StridedPtrInfo SPtrInfo;
              /* ... */ = canVectorizeLoads(Slice, Slice.front(), /* ... */,
                                            PointerOps, SPtrInfo);
              // ...
              if (UserIgnoreList && E.Idx == 0)
                /* ... */;
            } else if (S.getOpcode() == Instruction::ExtractElement ||
                       (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                            /* ... */ &&
                        !CheckOperandsProfitability(
                            /* ... */))) {
              continue;
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        // All-strided loads with small VF are handled elsewhere.
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          // If the slice is already vectorized, reuse that entry.
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            // Only a gather node was produced - drop it again.
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              /* ... */;
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore the original order, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
    }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorizations yet.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (/* ... reversed order ... */ &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            /* ... reverse shuffle cost ... */;
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost) {
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (/* ... reversed order ... */ &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            /* ... reverse shuffle cost ... */;
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1))
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ... (check if the selects form a min/max pattern)
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if possible to convert (a*b)+c to fma.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The FMul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        /* ... */
        count_if(/* ... */,
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          /* ... */;
                 }) == 1)
      return;
  }

  // Collect gathered loads, grouped by (block, underlying object, type); we
  // can try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        /* ... */) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        // ...
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                /* ... underlying pointer object ... */,
                LI->getType())]);
      }
    }
  }
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
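// ShuffleCostEstimator mirrors the shuffle emission logic of the codegen
// path but accumulates TTI costs instead of emitting instructions, keeping
// the cost model and the emitted code in sync.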
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, the vectors in InVectors are estimated as a single node and
  /// the common mask is still being built.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found a broadcast of a single scalar: cost it as a broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          /* ... */;
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy))
          return TTI.getShuffleCost(/* ... SK_InsertSubvector ... */);
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      /* ... */);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                 0, /* ... */);
      return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
                                           VecTy, ShuffleMask, CostKind,
                                           /* ... */);
    }
    return GatherCost +
           (/* ... all undef ... */
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }
  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // Checks whether a submask can be modeled as a shuffle of at most two
    // physical registers; returns the shuffle kind or nullopt.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices,
                                        SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallSet<int, 4> RegIndices;
      for (int &I : Mask) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;
    // Process extracts in blocks of EltsPerVector to check if the source
    // vector register can be reused directly.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        if (/* ... */ !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {},
                                 CostKind, Idx,
                                 getWidenedType(ScalarTy, SubVecSize));
      }
      // Second check: a single permute may be cheaper than the subvector
      // extracts.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Estimates the cost of a "diamond" match or permutation of 2 nodes.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // If we already requested the cost of reshuffling E1 and E2, merge this
      // sub-mask into CommonMask instead of double-counting.
      if ((InVectors.size() == 2 &&
           /* ... InVectors == {&E1, E2} ... */) ||
          (!E2 && /* ... InVectors == {&E1} ... */)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(/* ... */ "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - estimate the cost for the matched nodes
      // and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (const auto *E = P.dyn_cast<const TreeEntry *>())
        VF = std::max(VF, E->getVectorFactor());
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             /* ... extract-subvector at index 0 ... */;
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return /* ... two-source permute cost ... */;
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return /* ... single-source permute cost ... */;
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = /* ... minimized integer type ... */;
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = /* ... */;
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // Translate both masks into the common VF domain.
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
            Idx = EMask[Idx];
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                  E->Scalars.size();
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 /* ... deinterleave mask of Factor ... */) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      // Not an identity/broadcast? Check if the original vector order is
      // cheaper.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    }
    else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (/* ... mismatched types ... */)
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    }
    if (!V2 && /* ... two sources required ... */)
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    InVectors.front() =
        /* ... placeholder vector of CommonMask.size() elements ... */;
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}

  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<int> ResizeMask;
    if (!E->ReorderIndices.empty()) {
      // ... (apply the reorder to Mask)
    }
    // Check if the extracts can be considered reused: the same
    // extractelements were vectorized in an earlier node.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
            /* ... */
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for extracts that become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          // Use getExtractWithExtendCost() for the extractelement/ext pair,
          // then add back the s|zext cost (it is subtracted separately).
          Cost -= TTI.getExtractWithExtendCost(
              /* ... */ Ext->getOpcode(), Ext->getType(),
              EE->getVectorOperandType(), Idx);
          Cost += TTI.getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), EE->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          continue;
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase, /* ... zero-initialized ... */)
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(/* ... */ DemandedElts,
                                           /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check if the gather of extracts is just a shuffle of one/two source
    // vectors.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    return VecBase;
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds extractelement-based vectors; the vectors themselves were already
  /// costed in adjustExtracts.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(InVectors.size() == 1 &&
           /* ... each mask lane extracts from V1 or V2:
              EI->getVectorOperand() == V1 ||
              EI->getVectorOperand() == V2 ... */
           "Expected extractelement vectors.");
    // ...
  }
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            cast<Value *>(InVectors.front())->getType())
                            ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // Build a synthetic placeholder vector: the estimator only needs types.
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      // ...
      /* ... */ getAllOnesValue(*R.DL, ScalarTy->getScalarType());
      // ...
    }
    // ...
  }
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        Cost += /* ... two-source permute cost for SVMask ... */;
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::None, CostKind);
        }
        Cost += /* ... SK_InsertSubvector cost at offset Idx ... */;
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // ...
  }
  return TTI::CastContextHint::None;
}
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;
  Type *ScalarTy = /* ... */;
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    // Use the minimized bit width for the whole entry.
    // ...
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  if (E->isGather()) {
    // ...
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, /* ... */, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF = std::max(
          VectorizableTree[E->CombinedEntriesWithIndices.front().first]
              ->getVectorFactor(),
          VectorizableTree[E->CombinedEntriesWithIndices.back().first]
              ->getVectorFactor());
      // ...
    }
    LLVM_DEBUG(
        dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       /* ... not a reversed order ... */)) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                        : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // Cost of one scalar times the number of remaining scalars; no need
          // to cost each instruction individually.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }
        InstructionCost VecCost = GetVectorCost(CommonCost);
        // Check if the current node must be resized when the parent node is
        // not resized (minimum-bitwidth mismatch).
        if (It != MinBWs.end() &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs; a negative
  // value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    auto [ScalarCost, VecCost] = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = /* ... same-width integer type ... */;
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, the compares become
    // dead; remove their cost.
    if (VI && SelectOnly) {
      assert(!Ty->isVectorTy() && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *VI) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost +=
              TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        // Use the extract-with-extend cost for the extract/ext pair, then
        // subtract the s|zext cost (it is counted separately).
        Cost -= TTI->getCastInstrCost(
            Ext->getOpcode(), Ext->getType(), I->getType(),
            TTI::getCastContextHint(Ext), CostKind, Ext);
        // ...
      }
      // ...
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    // Find the first and the last insert index touched by this bundle.
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(ArrayRef(VL).drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    // ... (NumOfParts = number of registers covering the vector)
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // If a subvector cannot be inserted safely, generate a whole-sized
    // vector and shuffle the source vector with the new subvector instead.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ... (align OffsetBeg down to generate a correct mask)
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    // ... (scalarization overhead for DemandedElts, plus a single-source
    // permute of InsertVecTy by Mask when it is not the identity)
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // If the source vector is only partially overwritten, an extra blend
    // with the original vector is required.
    SmallBitVector InMask = isUndefVector(
        FirstInsert->getOperand(0),
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ... (resize the inserted subvector to the full vector width first)
      }
      for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
        Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
      for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; I <= End;
           ++I)
        if (Mask[I] != PoisonMaskElem)
          Mask[I] = I + VecSz;
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
      // ... (cost of the blend shuffle)
    }
    return Cost;
  }
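  // Worked example for the bookkeeping above: NumElts = 16, VecScalarsSz = 8,
  // inserts at indices 4..7 give OffsetBeg = 4, OffsetEnd = 7, hence
  // Offset = 0, VecSz = 8, InsertVecSz = min(4, 8) = 4 and IsWholeSubvector
  // is false (OffsetBeg != Offset), so a blend is needed; inserts at 0..7
  // give InsertVecSz = 8 with IsWholeSubvector true and no extra blend.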
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind, VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost if minimum bitwidth is in effect and the cast
      // degenerated to a bitcast (it is just a noop then).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ... (cast context hint from the operand)
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
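  // Demotion example for the logic above: for `zext i8 %x to i32` where
  // MinBWs proves the result only needs 8 bits, BWSz == SrcBWSz == 8 and
  // VecOpcode becomes BitCast, priced as free; if the result needs 16 bits
  // the zext survives as a narrower 8->16 extension, and a demoted source
  // that is still wider than the destination turns the cast into a Trunc.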
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // If the predicates do not match the common one, fall back to the
      // worst-case (unknown) predicate.
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
           CurrentPred != SwappedVecPred))
        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      // ... (a matching min/max intrinsic may be cheaper)
      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // The condition is narrower than the selected vector, replicate it.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      FMF.set();
      // Intersect the fast-math flags of all fused operations.
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ... (poison lanes are free)
      Value *Op1 = E->getOperand(0)[Idx];
      Value *Op2 = nullptr;
      SmallVector<const Value *, 2> Operands(1, Op1);
      if (!isa<UnaryOperator>(VL0)) {
        Op2 = E->getOperand(1)[Idx];
        Operands.push_back(Op2);
      }
      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
      if (auto *I = dyn_cast_or_null<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ... (contractable fadd/fsub may fold into fmuladd)
      }
      return ScalarCost;
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // An `and` with constants covering all demanded bits is a no-op
        // after demotion.
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          if (all_of(E->getOperand(I), [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      // ... (operand value info for the whole bundle)
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
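  // The Instruction::And + MinBWs special case above in one line: when every
  // constant operand has at least It->second.first trailing ones, the `and`
  // only clears bits the minimum-bitwidth analysis already proved dead, so
  // the vector op folds away and only CommonCost is charged (e.g.
  // `and i32 %x, 255` on a node demoted to i8).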
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        // ... (collect the pointer operands)
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo());
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal node;
    // the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate the cost of the GEPs, since this node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
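  // The load shapes above trade off differently: a plain wide load is
  // cheapest, an interleaved load covers stride-N groups, a strided load
  // covers a constant/runtime stride, a compressed load over-reads and then
  // shuffles the wanted lanes together (e.g. lanes {0,2,4,6} via a masked
  // 8-wide load plus CompressMask {0,2,4,6}), and a gather pays per-element
  // addressing cost.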
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo =
          TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ... (use the intrinsic cost when the call maps to one)
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // Compare the cost of the vector intrinsic against the vector library
      // call and take the cheaper one.
      auto VecCallCosts = getVectorCallCosts(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    // ... (alternate-opcode nodes: both opcodes must be binary ops, casts or
    // compares)
    assert(E->isAltShuffle() && "Invalid Shuffle Vector Operand");
    // Try to find a previous shuffle node with the same operands and the same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      // VecCost = cost of creating both vectors + cost of the blend shuffle.
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // Reuse the same main/alternate vector ops, only shuffle differently.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None,
                                          CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None,
                                          CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      // ... (add the cost of the two-source blend by Mask)
      // Patterns like [fadd,fsub] can be combined into a single instruction;
      // if the target supports it, take the cheaper alternative cost.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If a group uses the mask in order, the shufflevector is
            // eliminated by instcombine and costs nothing.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
                    assert(isa<ShuffleVectorInst>(V) &&
                           "Not supported shufflevector usage.");
                    auto *SV = cast<ShuffleVectorInst>(V);
                    int Index;
                    [[maybe_unused]] bool IsExtractSubvectorMask =
                        SV->isExtractSubvectorMask(Index);
                    assert(IsExtractSubvectorMask &&
                           "Not supported shufflevector usage.");
                    if (NextIndex != Index)
                      return false;
                    NextIndex += SV->getShuffleMask().size();
                    return true;
                  }))
                return ::getShuffleCost(
                    *TTI, TTI::SK_PermuteSingleSrc,
                    cast<VectorType>(VL.front()->getType()), {});
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
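  // Example of the alternate-op pattern priced here: for
  // VL = {a0+b0, a1-b1, a2+b2, a3-b3} the vectorizer emits both <4 x fadd>
  // and <4 x fsub> and blends them with mask <0,5,2,7>; on targets with a
  // native addsub (checked via isLegalAltInstr) the blend folds away and
  // getAltInstrCost is used instead.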
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE,
                                         unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it has fewer scalar operands than the
  // root, or consists of extractelements forming a shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
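// Concretely, the profitable two-node shape accepted above is e.g. a root of
// vectorizable stores whose operand node is a vectorizable load or a cheap
// gather (splat, constants, or extractelements forming one shuffle); a tree
// whose root is itself a gather never passes, since the buildvector cost
// would dominate any saving.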
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or' and peek through optional
  // shift-left-by-multiple-of-8 instructions.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type. For
  // example, <8 x i8> --> i64 is legal on a 64-bit target, but
  // <16 x i8> --> i128 is not, so it should not be vectorized with SLP.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume the whole sequence folds via load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  if (VectorizableTree.size() == 2 &&
      // ... (root is a buildvector-like node)
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization: vectorized PHIs cost ~0, so only the
  // gather/buildvector cost remains.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // A small tree of inserts/phis fed only by gathered values is also
  // unprofitable.
  if (VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement)) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) ||
                                  MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // Check if the tree consists mostly of store/load nodes; other nodes must
  // not blow up its size.
  SmallVector<const TreeEntry *> StoreLoadNodes;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         // ... (mostly-copyable nodes)
                         TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() ||
                          TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;

  // A split root whose operands never feed the root directly is also a bad
  // sign.
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      none_of(ArrayRef(VectorizableTree).drop_front(),
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                       TE->UserTreeIndex.UserTE->Idx == 0;
              }))
    return true;

  // insertelement root + PHI node fed only by gathers.
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;

  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          // ... (all lanes demanded, insert-only)
          ) > -SLPCostThreshold)
    return false;

  // A small non-power-of-2 tree with exactly one cross-block gathered load
  // node is not worth it either.
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      VectorizableTree.size() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
    return true;

  // Finally, inspect the remaining (load) entries.
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      continue;
    if (!E.isGather())
      continue;
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        // ... (or the gathered loads are otherwise unprofitable)
        false)
      return false;
    // ...
  }
  // ...
}
InstructionCost BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call that is not part of our tree, query TTI for the
  // cost of keeping the live values across it (spills and fills).
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }

  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    // Treat the intrinsic as free only when it is cheaper than a real call.
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };

  // Maps the last instruction of an entry to the last checked instruction in
  // its block and the result: 1 - no calls in between, 0 - a call was found.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(First))
        return It->second.getInt() != 0;
      Last = Checked;
    }
    SmallVector<const Instruction *> LastInstsInRange;
    BasicBlock::const_reverse_iterator
        InstIt = ++First->getIterator().getReverse(),
        PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Vectorized calls (vector intrinsics) do not impact the spill cost.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    // ... (revectorized "scalars" that are themselves vectors)
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  // Memoize whether there is a non-vectorized call between two blocks.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    if (Pred)
      Worklist.push_back(Pred);
    else
      Worklist.append(pred_begin(Root), pred_end(Root));
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    // ... (on exit, cache every visited (block, OpParent) pair)
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      ParentOpParentToPreds.try_emplace(KeyPair, Res);
    }
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        return Res;
      }
      ParentsPairsToAdd.insert(Pair);
      // ... (account block size against the budget)
      if (Budget > BudgetLimit)
        return Res;
      // ... (scan BB for non-vectorized calls, then continue with its
      // predecessors)
    }
    Res = true;
    return Res;
  };
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          // ... (constant gathers are free)
          false)
        continue;
      Budget = 0;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(V);
          if (!Inst)
            continue;
          if (isVectorized(V)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the calls within the same basic block.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for calls between blocks:
      // 1. Entry's block, from its head to LastInst.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. Operand's block, from OpLastInst to its terminator.
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. All blocks in between.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }
  return Cost;
}
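// Net effect of getSpillCost: for every vectorized operand whose value must
// survive a genuine call (CheckForNonVecCallsInSameBlock walks one block,
// CheckPredecessors the CFG in between), AddCosts charges keeping the vector
// live across the call and credits the scalars that no longer need to be,
// approximating the extra spill/fill traffic vectorization would introduce.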
/// Checks if the insertelement instructions \p IE1 and \p IE2 belong to the
/// same buildvector sequence.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *IE1, InsertElementInst *IE2,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // ... (both must insert into the same vector type at constant indices)
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  // Walk the vector-operand chains of both inserts, looking for one chain
  // reaching the other insert.
  do {
    PrevI1 = I1;
    PrevI2 = I2;
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) /* && index checks ... */)
      I1 = dyn_cast<InsertElementInst>(GetBaseOperand(I1));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) /* && index checks ... */)
      I2 = dyn_cast<InsertElementInst>(GetBaseOperand(I2));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  return false;
}
namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *>
  get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace
/// Combines a base vector and a list of (vector, mask) shuffle pairs into a
/// final shuffle sequence, invoking ResizeAction/Action callbacks as needed.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask,
    Value *Base, function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef: combine it with the first subvector.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only one vector is shuffled: act only when the mask
    // is not the identity.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef with at least two shuffled inputs: combine pairwise.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes: resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first,
                                               Mask, /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
        } else if (SecMask[I] != PoisonMaskElem) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
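// Mask-merge example for the equal-VF step above: with VF = 4,
// Mask = {0,poison,1,poison} (lanes from vector 1) and
// SecMask = {poison,2,poison,3} (lanes from vector 2), the combined
// two-source mask becomes {0, 2+VF, 1, 3+VF} = {0,6,1,7}.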
namespace {
/// Data for the insertelements that were converted into shuffles.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
                                     InstructionCost ReductionCost) {
  InstructionCost Cost = ReductionCost;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries: they are combined and
    // just skipped.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    // A gather node may match an already-counted vectorized node after
    // reordering; it costs nothing extra then.
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle.\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    // Exclude the cost of unused gather-load nodes.
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  if (Cost >= -SLPCostThreshold &&
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;

  InstructionCost ExtractCost = 0;
  SmallPtrSet<Value *, 4> ExtractCostCalculated;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of {Scalar, User, Lane} tuples: on some targets this lets the
  // extract be folded into its user and become free.
  SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses)
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << "  User: " << *EU.User << "\n";
               else dbgs() << "  User: nullptr\n");
    LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand()
                      << "\n");

    // Uses by ephemeral values are free: the ephemeral value is removed
    // before codegen, and so is the extract.
    if (EphValues.count(EU.User))
      continue;

    // Skip if this scalar/user pair (or the scalar for all users) is already
    // accounted for.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User))
             .second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;

    // Skip users in unreachable blocks, EH pads, or blocks ending in
    // unreachable; also skip already-counted scalars.
    if (UserParent &&
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         isa_and_present<UnreachableInst>(UserParent->getTerminator())))
      continue;
    if (isa<FixedVectorType>(EU.Scalar->getType()) ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Extra costs for the insertelement users: try converting them into
    // shuffles instead of rebuilding vectors lane by lane.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (!UsedInserts.insert(VU).second)
        continue;
      std::optional<unsigned> InsertIdx = getElementIndex(VU);
      if (InsertIdx) {
        const TreeEntry *ScalarTE = &EU.E;
        auto *It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // Checks if 2 insertelements are from the same buildvector.
              InsertElementInst *VecInsert = Data.InsertElements.front();
              return areTwoInsertFromSameBuildVector(
                  VU, VecInsert, [](InsertElementInst *II) -> Value * {
                    Value *Op0 = II->getOperand(0);
                    // ...
                    return Op0;
                  });
            });
        int VecId = -1;
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            InstructionCost C = TTI->getCastInstrCost(
                VecOpcode, FTy,
                getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                               FTy->getNumElements()),
                TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            Cost += C;
          }
        } else {
          if (isFirstInsertElement(VU, It->InsertElements.front()))
            It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        if (Mask.empty())
          Mask.assign(FTy->getNumElements(), PoisonMaskElem);
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    // Compute the plain extract cost, extending through MinBWs if the entry
    // was demoted.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
                            ? Instruction::ZExt
                            : Instruction::SExt;
      // ... (extract + extend pair cost)
      LLVM_DEBUG(dbgs() << "  ExtractExtend cost: " << ExtraCost << "\n");
    } else {
      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind, EU.Lane, EU.Scalar,
                                          ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << "  ExtractElement cost for " << *ScalarTy
                        << " from " << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instruction as is if it is cheaper than the extract.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // The original instruction can be reused if no operands are vectorized,
      // or they are already marked as externally used.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements are not vectorized but transformed into
          // shuffles and removed from the function; account for that here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          // ... (the cast operand may stay scalar too)
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Check whether keeping the whole entry on the scalar side pays off
        // for PHI roots whose users are not vectorized anyway.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !isVectorized(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2; it may allow
          // extra vectorization later.
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid a compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  ExternalUses[It->second].User = nullptr;
                }
              }
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externally used scalars that fed kept-scalar casts.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
    }
  }
  // Add the reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode =
              BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        // ... (widen both types when the root produces vectors)
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
      }
    }
  }

  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                             getWidenedType(TE->getMainOp()->getType(), VecVF),
                             OrigMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
        return std::make_pair(TE, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
          C = ::getShuffleCost(
              *TTI, TTI::SK_PermuteSingleSrc,
              getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost of resizing the reduced value to the final vector width, if
  // needed.
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as a free extend/truncate.
        else
          Opcode =
              It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        InstructionCost CastCost = TTI->getCastInstrCost(
            Opcode, DstVecTy, SrcVecTy, CCH, TTI::TCK_RecipThroughput);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from "
                          << SrcVecTy << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

  std::optional<InstructionCost> SpillCost;
  // ... (spill cost is computed lazily, only when it can still change the
  // final decision)
  if (SpillCost)
    Cost += *SpillCost;

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    if (SpillCost)
      OS << *SpillCost;
    else
      OS << "<skipped>";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
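// Summing up getTreeCost: the result is the per-entry diffs, plus
// ExtractCost for scalars still used outside the tree, minus InsertCost for
// insertelements folded into shuffles, plus the resize and spill terms; the
// caller compares it against -slp-threshold, so "more negative" means more
// profitable.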
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    // ... (skip scalable vectors and non-constant indices; out-of-range or
    // undef indices count as undef extracts)
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Decide between shuffling two vectors or a single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a shuffle
  // of one or two vectors.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some extractelements were not
  // selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ...
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the slice of gathered scalars for extractelements representable
    // as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of a PHI entry keeps the correct order of operands and
  // incoming blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Returns true if the insertion point of this entry's vector code
    // dominates the point where the other (scalar-sharing) entry's code will
    // be emitted, so the latter can reuse the former.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Build a set of tree nodes for each gathered value and try to intersect
  // these sets: one common node per value means a permutation of a single
  // vector, two distinct sets mean a permutation of two input vectors.
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // The node must match this entry exactly to be reused wholesale.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    Entries.push_back(TEPtr);
    for (Value *V : VL) {
      if (isConstant(V))
        continue;
      UsedValuesEntry.try_emplace(V, 0);
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      PtrToIdx.try_emplace(Ptr2, EdgeIdx);
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    if (isConstant(V) || !VisitedValue.insert(V).second)
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;

      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If the schedulable insertion point is used by several entries, the
        // relative order is unknown until real scheduling happens.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            !TEUseEI.UserTE->doesNotNeedToSchedule()) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              // ...
              (TEUseEI.UserTE != UseEI.UserTE ||
               TEUseEI.EdgeIdx < UseEI.EdgeIdx))
            continue;
        }
        // If both users are PHI nodes with different incoming blocks - skip.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
          continue;
        // Check the node users: same user, ordered edges.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        if (TEUseEI.UserTE != UseEI.UserTE &&
            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
             HasGatherUser(TEUseEI.UserTE)))
          continue;
        if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
          continue;
        if (!TEUseEI.UserTE->isGather() && !UserPHI &&
            TEUseEI.UserTE->doesNotNeedToSchedule() !=
                UseEI.UserTE->doesNotNeedToSchedule())
          continue;
      }
      // Check if the user node of TE comes after the user node of TEPtr;
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx ||
           TEUseEI.UserTE != UseEI.UserTE) &&
          (!CheckOrdering(InsertPt) ||
           (UseEI.UserTE->hasCopyableElements() && /* ... */ false)))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        break;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt ||
              !CheckOrdering(&LastBundleInst))
            continue;
        }
        if (CheckAndUseSameNode(VTE))
          break;
        VToTEs.insert(VTE);
      }
      // For ordering queries, prefer a vectorized entry over gathered loads.
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      if (CheckAndUseSameNode(VTE))
        break;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // First iteration: just record the list of nodes.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Check if previously used tree nodes also use V; if not, V comes from
      // another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Non-empty intersection with the set for this input vector?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes: narrow the set and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No intersection found - a second set of source vectors is needed.
      if (Idx == UsedTEs.size()) {
        // More than two input vectors is not a permutation: fall back to a
        // regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }
17016 return std::nullopt;
17020 if (UsedTEs.
size() == 1) {
17023 UsedTEs.front().
end());
17024 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17025 return TE1->Idx < TE2->Idx;
17028 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
17029 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
17031 if (It != FirstEntries.end() &&
17032 ((*It)->getVectorFactor() == VL.size() ||
17033 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
17034 TE->ReuseShuffleIndices.size() == VL.size() &&
17035 (*It)->isSame(
TE->Scalars)))) {
17037 if ((*It)->getVectorFactor() == VL.size()) {
17038 std::iota(std::next(
Mask.begin(), Part * VL.size()),
17039 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
17041 SmallVector<int> CommonMask =
TE->getCommonMask();
17052 Entries.
push_back(FirstEntries.front());
17054 for (
auto &
P : UsedValuesEntry)
17056 VF = FirstEntries.front()->getVectorFactor();
17059 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
17061 DenseMap<int, const TreeEntry *> VFToTE;
17062 for (
const TreeEntry *TE : UsedTEs.front()) {
17063 unsigned VF =
TE->getVectorFactor();
17064 auto It = VFToTE.
find(VF);
17065 if (It != VFToTE.
end()) {
17066 if (It->second->Idx >
TE->Idx)
17067 It->getSecond() =
TE;
17074 UsedTEs.back().
end());
17075 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17076 return TE1->Idx < TE2->Idx;
17078 for (
const TreeEntry *TE : SecondEntries) {
17079 auto It = VFToTE.
find(
TE->getVectorFactor());
17080 if (It != VFToTE.
end()) {
17089 if (Entries.
empty()) {
17091 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17092 return TE1->Idx < TE2->Idx;
17094 Entries.
push_back(SecondEntries.front());
17095 VF = std::max(Entries.
front()->getVectorFactor(),
17096 Entries.
back()->getVectorFactor());
17098 VF = Entries.
front()->getVectorFactor();
17101 for (
const TreeEntry *
E : Entries)
17105 for (
auto &
P : UsedValuesEntry) {
17107 if (ValuesToEntries[Idx].
contains(
P.first)) {
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI))
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if a value can be ignored during analysis for shuffled gathers: it
  // is better to ignore instructions that do not form splats, are not
  // vectorized or extractelements, or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !isVectorized(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI) &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not shuffle scalars that are constants, or instructions that may be
    // vectorized later as part of a full node.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Select only the entries actually used for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar (at most 2 entries).
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // As many lanes as entries is not profitable, and VL differing from
    // TE->Scalars means shuffles already exist before this one; cut it off.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask and check for the identity shuffle.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset into the selected vectors, Pair.second the lane.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Estimate whether the shuffle beats a plain buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                                 (MaxElement % VF) -
                                                     (MinElement % VF) + 1));
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
    } else {
      NewVF = VF;
    }

    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                 : TTI::SK_PermuteSingleSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem) {
            Idx -= NewVF;
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    for (auto [I, Idx] : enumerate(SubMask))
      if (Idx == PoisonMaskElem)
        DemandedElts.clearBit(I);
    InstructionCost BuildVectorCost = getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      // The first entry alone is cheaper: drop lanes of the second one.
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      // The second entry alone is cheaper: drop lanes of the first one.
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
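// The decision above reduces to: gather-as-shuffle is taken only when
// reusing one or two already-vectorized nodes (plus any single-entry
// inserts) is no more expensive than building the vector from scratch; e.g.
// if all four scalars of VL already live in one VF=4 node, the whole gather
// collapses to a single SK_PermuteSingleSrc mask.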
SmallVector<std::optional<TTI::ShuffleKind>> BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: gathering for non-power-of-2 (non whole-register) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  // Find the cost of inserting/extracting values from the vector; duplicated
  // elements count as shuffle candidates, not inserts.
  APInt DemandedElements = APInt::getZero(VF);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    DemandedElements.setBit(I);
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
                                    V->getType(), TTI::CastContextHint::None,
                                    CostKind);
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
      continue;
    ConstantShuffleMask[I] = I + VF;
    EstimateInsertCost(I, V);
  }
  bool IsAnyNonUndefConst =
      any_of(VL, [](Value *V) { return isConstant(V) && !isa<UndefValue>(V); });
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    // Constants are materialized as a separate vector and blended in.
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc,
                             VecTy, ConstantShuffleMask);
    // ...
  }
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true, /*Extract=*/false,
                                     CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  return Cost;
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;

  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions
  // with constant indices, gathered loads, and copyable elements).
  Instruction *Front = nullptr;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
  }
  BasicBlock *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize ||
          E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };
17609 if (
E->State == TreeEntry::SplitVectorize) {
17610 Res = FindLastInst();
17612 for (
auto *
E : Entries) {
17615 I = &getLastInstructionInBundle(
E);
17620 EntryToLastInstruction.try_emplace(
E, Res);
17625 if (GatheredLoadsEntriesFirst.has_value() &&
17626 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
17627 Opcode == Instruction::Load) {
17628 Res = FindFirstInst();
17629 EntryToLastInstruction.try_emplace(
E, Res);
17635 auto FindScheduleBundle = [&](
const TreeEntry *
E) ->
const ScheduleBundle * {
17639 const auto *It = BlocksSchedules.find(BB);
17640 if (It == BlocksSchedules.end())
17642 for (
Value *V :
E->Scalars) {
17648 if (Bundles.
empty())
17651 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() ==
E; });
17652 if (It != Bundles.
end())
17657 const ScheduleBundle *Bundle = FindScheduleBundle(
E);
17658 if (!
E->isGather() && !Bundle) {
17659 if ((Opcode == Instruction::GetElementPtr &&
17662 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17666 return isa<PoisonValue>(V) ||
17667 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17668 E->isCopyableElement(V) ||
17669 (!isVectorLikeInstWithConstOps(V) &&
17670 isUsedOutsideBlock(V));
17672 (!
E->doesNotNeedToSchedule() ||
17675 if (!isa<Instruction>(V) ||
17676 (E->hasCopyableElements() && E->isCopyableElement(V)))
17678 return !areAllOperandsNonInsts(V);
17681 if (!isa<Instruction>(V) ||
17682 (E->hasCopyableElements() && E->isCopyableElement(V)))
17684 return MustGather.contains(V);
17686 Res = FindLastInst();
17688 Res = FindFirstInst();
17689 EntryToLastInstruction.try_emplace(
E, Res);
17698 assert(!
E->isGather() &&
"Gathered instructions should not be scheduled");
17699 Res = Bundle->getBundle().back()->getInst();
17700 EntryToLastInstruction.try_emplace(
E, Res);
17723 Res = FindLastInst();
17724 assert(Res &&
"Failed to find last instruction in bundle");
17725 EntryToLastInstruction.try_emplace(
E, Res);
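// setInsertPointAfterBundle positions the IRBuilder just past the bundle's
// last instruction (skipping PHIs and, in landing-pad blocks, the landing
// pad itself), so that newly created vector instructions are dominated by
// all of their scalar operands. This is a brief reading of the code below,
// not a normative statement of the pass's contract.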
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
    Builder.SetInsertPoint(
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
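// gather() materializes a vector from scalar values with a chain of
// insertelement instructions, postponing scalars defined after the current
// insertion point (e.g. values inside a loop when inserting at a
// loop-invariant root) and re-shuffling through a non-poison Root when one
// is provided. A minimal sketch of the emitted IR for four floats (value
// names are illustrative):
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0, float %b, i32 1
//   %v2 = insertelement <4 x float> %v1, float %c, i32 2
//   %v3 = insertelement <4 x float> %v2, float %d, i32 3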
Value *BoUpSLP::gather(
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  for (int I = 0, E = VL.size(); I < E; ++I) {
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
    if (Scalar->getType() != Ty) {
        Scalar = Builder.CreateIntCast(
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      GatherShuffleExtractSeq.insert(InsElt);
      User *UserOp = nullptr;
        if (V->getType()->isVectorTy()) {
            SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
          auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
              if (SV->getOperand(0) == V)
              if (SV->getOperand(1) == V)
          if (Instruction *User = FindOperand(SV->getOperand(0), V))
          else if (Instruction *User = FindOperand(SV->getOperand(1), V))
                 "Failed to find shufflevector, caused by resize.");
        unsigned FoundLane = Entries.front()->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
  SmallVector<int> NonConsts;
    std::iota(Mask.begin(), Mask.end(), 0);
    Value *OriginalRoot = Root;
        SV->getOperand(0)->getType() == VecTy) {
      Root = SV->getOperand(0);
      Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  for (int I = 0, E = VL.size(); I < E; ++I) {
      Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
      Vec = OriginalRoot;
      Vec = CreateShuffle(Root, Vec, Mask);
          OI && OI->use_empty() &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
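// ShuffleInstructionBuilder (members below) is the IR-emitting shuffle
// combiner: it accumulates up to two input vectors plus a common mask,
// folding in a further input by first squashing the current pair with
// createShuffle, and finalize() applies reuse/reorder masks and inserts
// subvectors. The nested ShuffleIRBuilder supplies the primitive
// shuffle/identity/poison/resize callbacks consumed by
// BaseShuffleAnalysis::createShuffle. This summary is inferred from the
// surviving fragments of the class.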
  bool IsFinalized = false;

  class ShuffleIRBuilder {
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
             "Expected integer vector types only.");
                  ->getIntegerBitWidth())
        V2 = Builder.CreateIntCast(
        V1 = Builder.CreateIntCast(
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      unsigned VF = Mask.size();
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
    void resizeToMatch(Value *&V1, Value *&V2) {
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, ShuffleBuilder, ScalarTy);
                             std::optional<bool> IsSigned = std::nullopt) {
    if (VecTy->getElementType() == ScalarTy->getScalarType())
    return Builder.CreateIntCast(
        V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    Value *VecBase = nullptr;
    if (!E->ReorderIndices.empty()) {
                             E->ReorderIndices.end());
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      VecBase = EI->getVectorOperand();
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
          is_contained(VL, EI);
      R.eraseInstruction(EI);
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    UseVecBaseAsInput = true;
    Value *Vec = nullptr;
      constexpr int MaxBases = 2;
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
      for (const auto [V, I] : VLMask) {
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      if (!Bases.front())
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
        SubVec = Bases.front();
        ArrayRef<int> SubMask =
            Mask.slice(P * SliceSize,
               return all_of(SubMask, [](int Idx) {
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
          unsigned SubVecVF =
          NewVF = std::max(NewVF, SubVecVF);
          for (int &Idx : SubMask)
          copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
          Vec = createShuffle(Vec, SubVec, VecMask);
          TransformToIdentity(VecMask);
  std::optional<Value *>
            TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
      return std::nullopt;
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
    IsFinalized = false;
    CommonMask.clear();
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    Value *V1 = getVectorizedValue(E1);
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
      InVectors.push_back(V1);
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
                       CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      InVectors.push_back(V1);
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                      return createShuffle(V1, V2, Mask);
                 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
    IsFinalized = true;
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      Vec = createShuffle(Vec, nullptr, CommonMask);
    transformMaskAfterShuffle(CommonMask, CommonMask);
           "Expected vector length for the final value before action.");
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
      Vec = createShuffle(Vec, nullptr, ResizeMask);
        return createShuffle(V1, V2, Mask);
    InVectors.front() = Vec;
    if (!SubVectors.empty()) {
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          Type *OrigScalarTy = ScalarTy;
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
            I1 = I2 + CommonMask.size();
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      InVectors.front() = Vec;
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          NewMask[I] = CommonMask[ExtMask[I]];
        CommonMask.swap(NewMask);
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
            .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
                            E->ReorderIndices.end());
  if (!ReorderMask.empty())
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
      SubVectorsMask.clear();
                              unsigned I, unsigned SliceSize,
                              bool IsNotPoisonous) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
    if (!IsNotPoisonous) {
      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                         [=](const std::unique_ptr<TreeEntry> &TE) {
                           return TE->UserTreeIndex.UserTE == UserTE &&
                                  TE->UserTreeIndex.EdgeIdx != EdgeIdx;
      if (It == VectorizableTree.end())
      if (!(*It)->ReorderIndices.empty()) {
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
    if ((Mask.size() < InputVF &&
        (Mask.size() == InputVF &&
             std::next(Mask.begin(), I * SliceSize),
             std::next(Mask.begin(),
             std::next(Mask.begin(), I * SliceSize),
             std::next(Mask.begin(),
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  bool Resized = false;
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    for (auto [Idx, I] : enumerate(ExtractMask)) {
        ExtractEntries.append(TEs.begin(), TEs.end());
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      PostponedGathers.insert(E);
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
        if (VF == VecBaseTy->getNumElements() &&
            GatheredScalars.size() != VF) {
          GatheredScalars.append(VF - GatheredScalars.size(),
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          return isa<LoadInst>(V) && isVectorized(V);
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        PostponedGathers.insert(E);
      if (GatherShuffles.size() == 1 &&
          Entries.front().front()->isSame(E->Scalars)) {
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
            Mask[I] = FrontTE->findLaneForValue(V);
        ShuffleBuilder.resetForSameNode();
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      if (GatheredScalars.size() != VF &&
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
        GatheredScalars.append(VF - GatheredScalars.size(),
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    int NumNonConsts = 0;
        Scalars.front() = OrigV;
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
    if (NumNonConsts == 1) {
      if (!UndefPos.empty() && UndefPos.front() == 0)
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
          (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
             return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                    is_contained(E->UserTreeIndex.UserTE->Scalars,
      if (It != Scalars.end()) {
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
      for (int I : UndefPos) {
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          Value *VecOp = EI->getVectorOperand();
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
        IsUsedInExpr = false;
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, true);
        IsNonPoisoned &= IsNotPoisonedVec;
        IsUsedInExpr = false;
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
      for (const auto [I, TEs] : enumerate(Entries)) {
               "No shuffles with empty entries list expected.");
               "Expected shuffle of 1 or 2 entries.");
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
                 [](const std::optional<TTI::ShuffleKind> &SK) {
          none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         (!GatherShuffles.empty() &&
                 [](const std::optional<TTI::ShuffleKind> &SK) {
          none_of(Mask, [&](int I) { return I >= MSz; }) &&
    bool EnoughConstsForShuffle =
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
        (IsSingleShuffle && ((IsIdentityShuffle &&
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, false);
            auto CheckIfSplatIsProfitable = [&]() {
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, 0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              return SplatCost <= BVCost;
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
                BV = CreateShuffle(BV, nullptr, SplatMask);
                Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
    for (auto [I, V] : enumerate(GatheredScalars)) {
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
    Res = ShuffleBuilder.createFreeze(Res);
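// createBuildVector appears to be the thin IR-level instantiation of
// processBuildVector with ShuffleInstructionBuilder; the cost model side
// seems to instantiate the same template with a cost-estimating builder so
// the two stay structurally in sync (an inference from the template
// parameters, not visible in the surviving fragments).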
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
  for (Value *V : VL)
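// vectorizeTree(TreeEntry *E) emits the vector code for a single tree entry:
// gathers go through createBuildVector, SplitVectorize entries are stitched
// from their two sub-entries with a shuffle, and every other entry is
// dispatched on its (possibly alternate) opcode in the switch below, with
// FinalShuffle applying reuse/reorder masks to the produced value.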
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
  if (E->VectorizedValue)
    return E->VectorizedValue;
  if (E->isGather()) {
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
        OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
          if (isa<PoisonValue>(V))
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
      Op1 = Builder.CreateIntCast(
          GetOperandSignedness(&OpTE1));
      Op2 = Builder.CreateIntCast(
          GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
      if (ScalarTyNumElements != 1) {
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
                       E->CombinedEntriesWithIndices.back().second *
                           ScalarTyNumElements);
      E->VectorizedValue = Vec;
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
  bool IsReverseOrder =
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
        E->CombinedEntriesWithIndices.size());
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
        if (isa<PoisonValue>(V))
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      if (!VisitedBBs.insert(IBB).second) {
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
  case Instruction::ExtractValue: {
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    Type *ScalarTy = Op.front()->getType();
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    const unsigned NumElts =
    const unsigned NumScalars = E->Scalars.size();
    assert(Offset < NumElts && "Failed to find vector index offset");
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    bool IsIdentity = true;
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      IsIdentity &= InsertIdx - Offset == I;
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
          InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
                    Ins->getUniqueUndroppableUser());
          SmallBitVector UseMask =
              buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
          SmallBitVector IsFirstPoison =
          SmallBitVector IsFirstUndef =
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
                  IsFirstUndef.test(I)) {
                if (IsVNonPoisonous) {
                  InsertMask[I] = I < NumScalars ? I : 0;
                  if (Idx >= NumScalars)
                    Idx = NumScalars - 1;
                  InsertMask[I] = NumScalars + Idx;
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
      for (unsigned I = 0; I < NumElts; I++) {
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          SmallBitVector IsFirstPoison =
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
                InsertMask[I] = I + NumElts;
          V = Builder.CreateShuffleVector(
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          SmallBitVector IsFirstPoison =
          for (unsigned I = 0; I < NumElts; I++) {
              InsertMask[I] += NumElts;
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
    ++NumVectorInstructions;
    E->VectorizedValue = V;
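  // For cast nodes the opcode may need rewriting when minimum-bitwidth
  // analysis (MinBWs) shrank this entry or its operand: equal bit widths
  // turn the cast into a bitcast, narrowing becomes a trunc, and widening
  // becomes sext/zext depending on the recorded signedness. For example
  // (illustrative), a scalar `zext i8 -> i32` whose result was shrunk to
  // i16 is emitted as `zext <N x i8> -> <N x i16>`.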
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);
    Value *InVec = vectorizeOperand(E, 0);
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);
    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
              ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
    Value *V = Builder.CreateCmp(P0, L, R);
      ICmp->setSameSign(false);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::Select: {
    setInsertPointAfterBundle(E);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      Cond = Builder.CreateShuffleVector(
           "Cannot vectorize Instruction::Select");
        Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    Value *V = Builder.CreateUnOp(
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    if (Op->getType() != VecTy) {
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
            return CI && CI->getValue().countr_one() >= It->second.first;
        V = FinalShuffle(I == 0 ? RHS : LHS, E);
        E->VectorizedValue = V;
        ++NumVectorInstructions;
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    Value *V = Builder.CreateBinOp(
    if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
        I->setHasNoUnsignedWrap(false);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
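  // Loads are emitted according to the entry state: Vectorize becomes a
  // plain wide load, CompressVectorize loads a wider (possibly masked)
  // vector and compresses it with a shuffle, StridedVectorize uses the
  // llvm.experimental.vp.strided.load intrinsic, and ScatterVectorize falls
  // back to a masked gather. An illustrative strided-load sketch (value
  // names and exact type mangling are assumptions):
  //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
  //            ptr %base, i64 %stride, <4 x i1> splat (i1 true), i32 4)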
  case Instruction::Load: {
    setInsertPointAfterBundle(E);
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
        for (int I : CompressMask)
          MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
    } else if (E->State == TreeEntry::StridedVectorize) {
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
      unsigned StridedLoadEC =
      Value *Stride = SPtrInfo.StrideVal;
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
          Builder.CreateIntCast(Stride, StrideTy, true);
      StrideVal = Builder.CreateMul(
          NewStride, ConstantInt::get(
                         StrideTy, (IsReverseOrder ? -1 : 1) *
                                       DL->getTypeAllocSize(ScalarTy))));
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
        unsigned ScalarTyNumElements =
        unsigned VecTyNumElements =
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
          return Builder.getInt64(I % ScalarTyNumElements);
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    Value *V = E->State == TreeEntry::CompressVectorize
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::Store: {
    setInsertPointAfterBundle(E);
    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        Ptr = SI->getPointerOperand();
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::GetElementPtr: {
    setInsertPointAfterBundle(E);
    Value *Op0 = vectorizeOperand(E, 0);
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      for (Value *V : E->Scalars) {
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
  case Instruction::Call: {
    setInsertPointAfterBundle(E);
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
        VecCallCosts.first <= VecCallCosts.second;
    Value *ScalarArg = nullptr;
        ScalarArg = CEI->getArgOperand(I);
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
          It == MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
    if (!UseIntrinsic) {
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
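  // Alternate-opcode nodes (ShuffleOrOp == ShuffleVector) are emitted as two
  // wide instructions followed by a blending shufflevector that selects, per
  // lane, either the main or the alternate result. For example, for scalars
  // {a0+b0, a1-b1, a2+b2, a3-b3} (names illustrative):
  //   %add = add <4 x i32> %a, %b
  //   %sub = sub <4 x i32> %a, %b
  //   %res = shufflevector <4 x i32> %add, <4 x i32> %sub,
  //                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>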
  case Instruction::ShuffleVector: {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
        SmallVector<int> NewMask(ThisMask.size());
          return SVSrc->getShuffleMask()[Mask];
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
        V = Builder.CreateShuffleVector(Src, ThisMask);
      V = FinalShuffle(V, E);
             "Invalid Shuffle Vector Operand");
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
                ->getIntegerBitWidth())
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      V0 = Builder.CreateBinOp(
      V1 = Builder.CreateBinOp(
      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
        unsigned SrcBWSz = DL->getTypeSizeInBits(
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
                 "Expected same type as operand.");
          E->VectorizedValue = LHS;
          ++NumVectorInstructions;
      V0 = Builder.CreateCast(
      V1 = Builder.CreateCast(
    for (Value *V : {V0, V1}) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
    SmallVector<int> Mask;
    E->buildAltOpShuffleMask(
        [E, this](Instruction *I) {
          assert(E->getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
        Mask, &OpScalars, &AltScalars);
    auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            if (isa<PoisonValue>(V))
            auto *IV = cast<Instruction>(V);
            return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
        I->setHasNoUnsignedWrap(false);
    DropNuwFlag(V0, E->getOpcode());
    DropNuwFlag(V1, E->getAltOpcode());
    V = Builder.CreateShuffleVector(V0, V1, Mask);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    E->VectorizedValue = V;
    ++NumVectorInstructions;
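// The top-level vectorizeTree below runs in phases: schedule all blocks,
// emit vectorized code for every entry, revisit postponed gather nodes whose
// operands only became available later (re-casting when minimum bit widths
// differ), extract external uses back to scalar form, and finally erase the
// now-dead scalar instructions.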
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  EntryToLastInstruction.clear();
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
    (void)getLastInstructionInBundle(TE.get());
    Builder.SetInsertPoint(ReductionRoot->getParent(),
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
          getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
  for (auto &Entry : GatherEntries) {
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    TE->VectorizedValue = nullptr;
        (TE->UserTreeIndex.UserTE->hasState() &&
         TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
        if (UI->comesBefore(InsertPt))
      Builder.SetInsertPoint(InsertPt);
      Builder.SetInsertPoint(PrevVec);
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
                 PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
        if (IsSigned.value_or(false))
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
        if (IsSigned.value_or(false))
            IsSigned.value_or(false) ||
        if (IsSigned.value_or(false))
      if (IsSigned.value_or(false)) {
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
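  // Each external use is handled by extracting the scalar's lane from the
  // vectorized value right after it (or at a PHI's incoming-edge
  // terminator), caching extracts per block in ScalarToEEs and widening with
  // an int cast when the lane was produced at a reduced bit width.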
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    if (E->getOpcode() == Instruction::GetElementPtr &&
    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");
    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
                IgnoredExtracts.insert(EE);
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
            Value *V = ES->getVectorOperand();
              V = ETEs.front()->VectorizedValue;
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
            unsigned VecTyNumElements = VecTy->getNumElements();
                                  ExternalUse.Lane * VecTyNumElements);
            Ex = Builder.CreateExtractElement(Vec, Lane);
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                                  : &F->getEntryBlock(),
                              std::make_pair(Ex, ExV));
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
             "In-tree scalar of vector type is not insertelement?");
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
             (ExternallyUsedValues.count(Scalar) ||
              ExternalUsesWithNonUsers.count(Scalar) ||
              ExternalUsesAsOriginalScalar.contains(Scalar) ||
                if (ExternalUsesAsOriginalScalar.contains(U))
                ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                return !UseEntries.empty() &&
                       (E->State == TreeEntry::Vectorize ||
                        E->State == TreeEntry::StridedVectorize ||
                        E->State == TreeEntry::CompressVectorize) &&
                       any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                         return (UseEntry->State == TreeEntry::Vectorize ||
                                     TreeEntry::StridedVectorize ||
                                     TreeEntry::CompressVectorize) &&
                                doesInTreeUserNeedToExtract(
                                    Scalar, getRootEntryInstruction(*UseEntry),
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
        if (PHI->getParent()->isLandingPad())
          Builder.SetInsertPoint(
                  PHI->getParent()->getLandingPadInst()->getIterator()));
          Builder.SetInsertPoint(PHI->getParent(),
                                 PHI->getParent()->getFirstNonPHIIt());
        Builder.SetInsertPoint(VecI->getParent(),
                               std::next(VecI->getIterator()));
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      if (Scalar != NewInst) {
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
        if (!UsedInserts.insert(VU).second)
        auto BWIt = MinBWs.find(E);
          auto *ScalarTy = FTy->getElementType();
          auto Key = std::make_pair(Vec, ScalarTy);
          auto VecIt = VectorCasts.find(Key);
          if (VecIt == VectorCasts.end()) {
            if (IVec->getParent()->isLandingPad())
              Builder.SetInsertPoint(IVec->getParent(),
                                     std::next(IVec->getParent()
                                                   ->getLandingPadInst()
              Builder.SetInsertPoint(
                  IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              Builder.SetInsertPoint(IVec->getNextNode());
            Vec = Builder.CreateIntCast(
                BWIt->second.second);
            Vec = VecIt->second;
            ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
        unsigned Idx = *InsertIdx;
        if (It == ShuffledInserts.end()) {
          It = std::next(ShuffledInserts.begin(),
                         ShuffledInserts.size() - 1);
        Mask[Idx] = ExternalUse.Lane;
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        if (PH->getIncomingValue(I) == Scalar) {
              PH->getIncomingBlock(I)->getTerminator();
            Builder.SetInsertPoint(VecI->getParent(),
                                   std::next(VecI->getIterator()));
            Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
          PH->setOperand(I, NewInst);
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        // ...
    ShuffleBuilder.add(V1, CombinedMask1);
    // ...
    ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    // ...
    if (any_of(Mask,
               [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      // ...
      for (unsigned I = 0; I < VF; ++I) {
        // ...
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    // ...
    return std::make_pair(Vec, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // ...
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        // ...
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      // ...
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // ...
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            // ...
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // ...
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      // ...
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      // ...
      if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
        II->moveAfter(NewI);
      // ...
    }
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            // ...
      IE->replaceUsesOfWith(IE->getOperand(1),
                            // ...
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // ...
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    assert(Entry->VectorizedValue && "Can't find vectorizable value");
    // ...
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      // ...
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          // ...
          EE && IgnoredExtracts.contains(EE))
        // ...
      for (User *U : Scalar->users()) {
        // ...
               (UserIgnoreList && UserIgnoreList->contains(U)) ||
               // ...
               "Deleting out-of-tree value");
      }
      // ...
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      // ...
    }
  }
  // ...
  V->mergeDIAssignID(RemovedInsts);
  // ...
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            // ...
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            // ...
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      // ...
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          // ...
        }
        return UserIgnoreList->contains(U.getUser());
      // ...
    }
  }
  // ...
  for (SelectInst *SI : LogicalOpSelects)
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        // ...
        It->second.second);
    // ...
  }
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
    Loop *L = LI->getLoopFor(I->getParent());
    // ...
    BasicBlock *PreHeader = L->getLoopPreheader();
    // ...
      auto *OpI = dyn_cast<Instruction>(V);
      return OpI && L->contains(OpI);
    // ...
    CSEBlocks.insert(PreHeader);
  }

  // Sort blocks by domination, so that a block is visited only after all
  // blocks dominating it have been visited.
  CSEWorkList.reserve(CSEBlocks.size());
  // ...
    assert(DT->isReachableFromEntry(N));
  // ...
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  // ...
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                // ...
    if (I1->getType() != I2->getType())
      // ...
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      // ...
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        // ...
    NewMask.assign(SI2->getShuffleMask().begin(),
                   SI2->getShuffleMask().end());
    // ...
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      // ...
          NewMask[I] != SM1[I])
        // ...
        NewMask[I] = SM1[I];
    }
    // ...
    return SM1.size() - LastUndefsCnt > 1 &&
           // ...
                                        SM1.size() - LastUndefsCnt));
  };
  // ...
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    // ...
          !GatherShuffleExtractSeq.contains(&In))
        // ...
      bool Replaced = false;
      // ...
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          // ...
          if (!NewMask.empty())
            SI->setShuffleMask(NewMask);
          // ...
        }
        // ...
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          // ...
          V->replaceAllUsesWith(&In);
          // ...
          if (!NewMask.empty())
            SI->setShuffleMask(NewMask);
          // ...
        }
      // ...
        Visited.push_back(&In);
    }
  }
  // ...
  GatherShuffleExtractSeq.clear();
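
// Illustrative aside, not from the original source: the gather-sequence CSE
// above walks blocks in dominator-tree pre-order, so a previously recorded
// identical instruction can survive and later duplicates forward to it. A
// minimal standalone model with assumed simplified types follows; unlike the
// real code, it does not re-verify dominance with DT->dominates.
#if 0
#include <cstdint>
#include <unordered_map>
#include <vector>

struct Inst {
  uint64_t Key;    // structural hash standing in for isIdenticalTo()
  unsigned DFSIn;  // dominator-tree pre-order number of the parent block
  const Inst *ReplacedBy = nullptr;
};

// Visit instructions in increasing DFSIn order; the first occurrence of each
// Key is kept and later duplicates are redirected to it.
inline void cseSweep(std::vector<Inst> &Worklist) {
  std::unordered_map<uint64_t, const Inst *> Seen;
  for (Inst &In : Worklist) {
    auto [It, Inserted] = Seen.try_emplace(In.Key, &In);
    if (!Inserted)
      In.ReplacedBy = It->second; // reuse the earlier (dominating) copy
  }
}
#endif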
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    // ...
  ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  // ...
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ...
    if (S.isCopyableElement(V)) {
      // ...
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        // ...
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
std::optional<BoUpSLP::ScheduleBundle *>
// ...
                                      const InstructionsState &S,
                                      // ...
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->doesNotNeedToSchedule() &&
      EI.UserTE->getOpcode() != Instruction::PHI &&
      // ...
        auto *I = dyn_cast<Instruction>(V);
        if (!I || I->hasOneUser())
          // ...
        for (User *U : I->users()) {
          auto *UI = cast<Instruction>(U);
          if (isa<BinaryOperator>(UI))
            // ...
        }
      // ...
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->hasCopyableElements() &&
      EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
      // ...
        if (S.isCopyableElement(V))
          // ...
    return std::nullopt;
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  // ...
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
    // ...
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      // ...
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned>
          UserOpToNumOps;
      for (const Use &U : I->operands()) {
        // ...
                             .first->getSecond();
        // ...
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
                // ...
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    // ...
  }

  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // ...
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned>
          UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        // ...
        if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
            BundleMember && BundleMember->hasValidDependencies()) {
          BundleMember->clearDirectDependencies();
          if (RegionHasStackSave ||
              // ...
                                    BundleMember->getInst()))
            ControlDependentMembers.push_back(BundleMember);
        }
        // ...
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          // ...
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          // ...
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         // ...
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  // ...
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // ...
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      // ...
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
                        << " in block "
                        // ...
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }
    // ...
      initialFillReadyList(ReadyInsts);
    // ...
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        // ...
    }
  };
  // Make sure that the scheduling region contains all bundle members.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ...
    if (!extendSchedulingRegion(V, S)) {
      // ...
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ...
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    // ...
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    // ...
      ReadyInsts.remove(BundleMember);
    // ...
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }
    // ...
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // ...
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // ...
      if (BD->isReady()) {
        // ...
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          // ...
        }
        for (ScheduleBundle *B : Bundles)
          // ...
          ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    // Scheduling failed: unregister the copyable data and restore the
    // dependencies of the affected schedule data.
    SmallVector<ScheduleData *> ControlDependentMembers;
    SmallPtrSet<Instruction *, 4> Visited;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      // ...
      if (S.isCopyableElement(I)) {
        // ...
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        // ...
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
          do {
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
            // ...
                !EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            // ...
            if (!Visited.insert(In).second) {
              // ...
            }
            ScheduleCopyableDataMapByInstUser
                [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                // ...
          } while (It != Op.end());
        // ...
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        // ...
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // ...
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              // ...
            ControlDependentMembers.push_back(OpSD);
        }
        // ...
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
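
// Illustrative aside, not from the original source: the "schedule until the
// new bundle becomes ready" loop above follows the classic ready-list
// discipline. A self-contained model with hypothetical types:
#if 0
#include <vector>

struct Node {
  int UnscheduledDeps;
  std::vector<Node *> Users; // nodes that depend on this one
  bool Scheduled = false;
};

// Keep extracting ready work; scheduling one node may make its users ready.
inline void scheduleUntilReady(std::vector<Node *> &Ready, Node *Wanted) {
  while (Wanted->UnscheduledDeps != 0 && !Ready.empty()) {
    Node *Picked = Ready.back();
    Ready.pop_back();
    Picked->Scheduled = true;
    for (Node *U : Picked->Users)
      if (--U->UnscheduledDeps == 0 && !U->Scheduled)
        Ready.push_back(U); // U just became ready
  }
}
#endif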
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new chunk when the current one is exhausted.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
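
// Illustrative aside, not from the original source: the chunked allocation
// pattern above, in isolation (chunk size assumed). Node addresses stay
// stable because existing chunks are never reallocated or moved.
#if 0
#include <cstddef>
#include <memory>
#include <vector>

template <typename T> class ChunkArena {
  static constexpr size_t ChunkSize = 256;
  std::vector<std::unique_ptr<T[]>> Chunks;
  size_t ChunkPos = ChunkSize; // forces allocation on first use

public:
  T *allocate() {
    if (ChunkPos >= ChunkSize) {
      Chunks.push_back(std::make_unique<T[]>(ChunkSize));
      ChunkPos = 0;
    }
    return &Chunks.back()[ChunkPos++];
  }
};
#endif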
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  // ...
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    // ...
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // ...
      ++ScheduleStart->getIterator().getReverse();
  // ...
    return II->isAssumeLikeIntrinsic();
  // ...
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         // ...
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    // ...
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    // ...
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         // ...
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   // ...
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                // ...
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    // ...
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    // ...
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);
    // ...
    if (I->mayReadOrWriteMemory() &&
        // ...
                                 Intrinsic::pseudoprobe))) {
      // Thread memory-touching instructions into the next-load/store chain.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
    // ...
      RegionHasStackSave = true;
  // ...
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
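
// Illustrative aside, not from the original source: the "next load/store"
// chaining above, reduced to its core. Threading memory operations into a
// singly linked list lets the alias checks later iterate memory ops only.
#if 0
#include <vector>

struct MemNode {
  bool TouchesMemory;
  MemNode *NextLoadStore = nullptr;
};

// Links all memory-touching nodes in Range after Prev (may be null) and
// returns the first memory op found in Range, or null if there is none.
inline MemNode *threadMemOps(std::vector<MemNode> &Range, MemNode *Prev) {
  MemNode *FirstInRange = nullptr;
  for (MemNode &N : Range) {
    if (!N.TouchesMemory)
      continue;
    if (Prev)
      Prev->NextLoadStore = &N;
    if (!FirstInRange)
      FirstInRange = &N;
    Prev = &N;
  }
  return FirstInRange;
}
#endif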
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    // ...
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    // ...
      if (CD->hasValidDependencies())
        return;
      // ...
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      // ...
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          // ...
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          // ...
          if (EI.UserTE->isCopyableElement(In)) {
            // ...
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                // ...
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                // ...
            }
          }
          // ...
        } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            // ...
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               // ...
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             // ...
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
        // ...
      }
      // ...
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      // ...
      if (ScheduleData *UseSD = getScheduleData(U)) {
        // ...
        if (areAllOperandsReplacedByCopyableData(
                // ...
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }
    // ...
    SmallPtrSet<const Instruction *, 4> Visited;
    // ...
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    // ...
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
        // ...
      }
    // ...
    if (RegionHasStackSave) {
      // ...
      if (match(BundleMember->getInst(),
                // ...
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      // ...
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      }
    }
    // Handle the memory dependencies.
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    // ...
           "NextLoadStore list for non memory effecting bundle?");
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
    // ...
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // ...
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           // ...
           SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        // ...
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      // ...
    }
  };

  assert(!Bundle.getBundle().empty() &&
         "expected at least one instruction to schedule");
  // ...
  WorkList.push_back(Bundle.getBundle().front());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    // ...
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    // ...
      Bundles = getScheduleBundles(SD->getInst());
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        // ...
      }
    }
  }
}
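
// Illustrative aside, not from the original source: the dependency counting
// above in skeleton form (hypothetical simplified graph). Each node counts
// its dependencies; nodes whose unscheduled counter is zero become ready.
#if 0
#include <vector>

struct DepNode {
  int Dependencies = -1; // -1 means "not yet computed"
  int UnscheduledDeps = 0;
  std::vector<DepNode *> Uses; // def-use, control, and memory successors
  bool Scheduled = false;
};

inline void computeDeps(DepNode *Root, std::vector<DepNode *> &Ready) {
  std::vector<DepNode *> WorkList{Root};
  while (!WorkList.empty()) {
    DepNode *N = WorkList.back();
    WorkList.pop_back();
    if (N->Dependencies >= 0)
      continue; // dependencies already valid
    N->Dependencies = 0;
    N->UnscheduledDeps = 0;
    for (DepNode *U : N->Uses) {
      ++N->Dependencies;
      if (!U->Scheduled)
        ++N->UnscheduledDeps;
      if (U->Dependencies < 0)
        WorkList.push_back(U); // propagate the computation to users
    }
    if (N->UnscheduledDeps == 0)
      Ready.push_back(N);
  }
}
#endif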
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  // ...
  BS->resetSchedule();

  // The ready-list is sorted by the original instruction location, so the
  // scheduled order stays as close to the source order as possible.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  // ...
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    // ...
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      // ...
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    // ...
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      // ...
             SDTEs.front()->doesNotNeedToSchedule() ||
             // ...
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // ...
        ScheduleBundle Bundle;
        // ...
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());
    // ...
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // ...
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          // ...
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    // ...
      if (PickedInst->getNextNode() != LastScheduledInst)
        // ...
      LastScheduledInst = PickedInst;
    // ...
    auto Invalid = InstructionsState::invalid();
    // ...
  }

#ifdef EXPENSIVE_CHECKS
  // ...
#endif
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    // ...
           [](const ScheduleBundle *Bundle) {
             return Bundle->isScheduled();
           // ...
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
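
// Illustrative aside, not from the original source: the ready-list ordering
// above in miniature. Priorities are assigned in source order, and the
// comparator makes the std::set hand back the highest priority number first,
// which corresponds to the bottom-most ready entity (bottom-up layout).
#if 0
#include <set>

struct Entity { unsigned SchedulingPriority; };

struct ByPriority {
  bool operator()(const Entity *A, const Entity *B) const {
    // Mirrors the comparator above: larger priority sorts first.
    return B->SchedulingPriority < A->SchedulingPriority;
  }
};

using ReadyList = std::set<Entity *, ByPriority>;
// Usage: *Ready.begin() is the highest-priority ready entity; erase it,
// emit its instructions, then insert whatever became ready as a result.
#endif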
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  // ...
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;
  // ...
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    // ...
    auto *Ty = I->getType();
    // ...
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      // ...
    Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    // ...
    for (Use &U : I->operands()) {
      // ...
      if (Visited.insert(J).second &&
          // ...
        FirstNonBool = U.get();
      // ...
    }
  }
  // ...
  if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    // ...
  Width = DL->getTypeSizeInBits(V->getType());
  // ...
  InstrElementSize[I] = Width;
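
// Illustrative aside, not from the original source: the element-size walk
// above reduced to its core decision, with the per-value widths assumed as
// inputs. The widest non-boolean scalar type seen wins.
#if 0
#include <algorithm>
#include <vector>

inline unsigned maxElementWidth(const std::vector<unsigned> &SeenWidths) {
  unsigned Width = 0;
  for (unsigned W : SeenWidths)
    if (W > 1) // skip bool-like 1-bit values
      Width = std::max(Width, W);
  return Width ? Width : 1;
}
#endif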
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    // ...
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // ...
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...
  if (NodesToKeepBWs.contains(E.Idx))
    return false;
  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      // ...
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        // ...
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
      // ...
    }
    // ...
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    // ...
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // ...
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
            // ...
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
                                    BitWidth, ToDemote, Visited,
                                    NodesToKeepBWs, MaxDepthLevel,
                                    IsProfitableToDemote,
                                    // ...
      }
      // ...
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        // ...
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          // ...
    }
    // ...
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      // ...
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      // ...
    return FinalAnalysis();
  // ...
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        // ...
        if (!FinalAnalysis())
          return false;
        // ...
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    // ...
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // ...
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        // ...
          if (Checker(BitWidth, OrigBitWidth))
            // ...
          if (BestFailBitwidth == 0 && FinalAnalysis())
            // ...
        if (BestFailBitwidth == 0) {
          // ...
        }
        // ...
      };
  auto TryProcessInstruction =
      // ...
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // ...
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
          // ...
            return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
          // ...
        }
        bool NeedToExit = false;
        if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
          return false;
        // ...
        if (!ProcessOperands(Operands, NeedToExit))
          return false;
        // ...
        return IsProfitableToDemote;
      };
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        // ...
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
  if (E.isAltShuffle()) {
    // Demoting a shift- or division-like alternate opcode is not safe in
    // general, so bail to the final analysis.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      // ...
      }
      return false;
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
    // ...
  }
  switch (E.getOpcode()) {
  // ...
  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // A shl can be performed in a smaller type if the shift amount is known
    // to be less than the demoted bit width.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      // ...
        if (isa<PoisonValue>(V))
          return true;
        if (E.isCopyableElement(V))
          // ...
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      // ...
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // A lshr can be truncated to a smaller lshr iff the bits that would be
    // shifted in are known to already be zero.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      // ...
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      // ...
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // An ashr can be truncated iff the operand has more sign bits than the
    // number of bits being removed.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      // ...
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      // ...
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      // ...
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      // ...
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }
  // ...
  case Instruction::Select: {
    // For a select, only the true/false operands need to be demoted.
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    // ...
        [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    // ...
  }
  case Instruction::Call: {
    // ...
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      return FinalAnalysis();
    // ...
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      // ...
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        // ...
        unsigned Op0SignBits =
            // ...
        unsigned Op1SignBits =
            // ...
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 // ...
                                   SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 // ...
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      // ...
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      // ...
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    // ...
        std::numeric_limits<InstructionCost::CostType>::max();
    // ...
    unsigned VF = E.Scalars.size();
    // ...
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // ...
      if (Cost < BestCost) {
        // ...
      }
      // ...
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }
  // ...
  }
  return FinalAnalysis();
}
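
// Illustrative aside, not from the original source: the shift-amount legality
// test used by the Shl/LShr checkers above, as a standalone predicate with an
// assumed known-bits summary. Narrowing a shift to BitWidth bits is only safe
// when the amount is provably less than BitWidth.
#if 0
#include <cstdint>

struct KnownBitsS {
  uint64_t MaxValue; // conservative upper bound on the runtime value
};

inline bool shiftFitsInDemotedType(KnownBitsS AmtKnown, unsigned BitWidth) {
  return AmtKnown.MaxValue < BitWidth;
}
// With BitWidth = 8, "x << 3" qualifies (3 < 8), while a shift whose amount
// may reach 9 does not, since set bits would be lost after truncation.
#endif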
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;
  // ...
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");
  // ...
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  // ...
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    // ...
    IsProfitableToDemoteRoot = true;
    // ...
  }
  // ...
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
  // ...
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // ...
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        // ...
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             // ...
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return false;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        // ...
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    // ...
    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;
    // ...
    unsigned MaxBitWidth = 1u;
    // ...
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });
    // ...
    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      // ...
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));
    // ...
    for (Value *Root : E.Scalars) {
      // ...
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // ...
      if (!IsKnownPositive)
        ++BitWidth1;
      // ...
      MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      // ...
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      // ...
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }
    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;
    // ...
    if (NumParts > 1 &&
        // ...
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // ...
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;
    // ...
    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs,
                               MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // ...
    MaxBitWidth = bit_ceil(MaxBitWidth);
    // ...
    return MaxBitWidth;
  };
  // ...
  if (UserIgnoreList &&
      // ...
    if (all_of(*UserIgnoreList,
               // ...
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        // ...
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        // ...
        unsigned BitWidth2 = BitWidth1;
        // ...
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        // ...
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      // ...
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    // ...
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
      // ...
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    // ...
    unsigned Limit = 2;
    // ...
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }
    // ...
    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              // ...
            }
            // ...
          }))
        // ...
    }
    RootDemotes.clear();
    // ...
    IsProfitableToDemoteRoot = true;
    // ...
    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }
    // ...
    if (MaxBitWidth == 0 ||
        // ...
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      // ...
    }
    // ...
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
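
// Illustrative aside, not from the original source: the bit-width rounding
// used above, in isolation. Subtract the redundant sign bits from the
// original width, clamp to at least 8, then round up to the next power of
// two (std::bit_ceil needs C++20).
#if 0
#include <algorithm>
#include <bit>

inline unsigned roundDemotedWidth(unsigned OrigBitWidth,
                                  unsigned NumSignBits) {
  unsigned Width = std::max(OrigBitWidth - NumSignBits, 1u); // bits needed
  if (Width < 8 && Width > 1)
    Width = 8; // sub-byte integer lanes are rarely profitable
  return std::bit_ceil(Width);
}
// roundDemotedWidth(32, 25) == 8: a 32-bit value with 25 known sign bits
// carries at most 7 significant bits, which rounds up to an i8 lane.
#endif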
  DL = &F.getDataLayout();
  // ...
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
  // ...
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  // ...
  DT->updateDFSNumbers();
  // ...
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);
    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      // ...
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }
    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);
    // ...
    if (!GEPs.empty()) {
      // ...
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }
  // ...
  R.optimizeGatherSequence();
                                            unsigned Idx, unsigned MinVF,
                                            // ...
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();
  // ...
      VF < 2 || VF < MinVF) {
    // ...
  }
  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    // ...
  for (Value *V : Chain)
    // ...
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      // ...
  bool IsAllowedSize =
      // ...
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
        // ...
          return !isa<ExtractElementInst>(V) &&
                 (V->getNumUses() > Chain.size() ||
                  any_of(V->users(), [&](User *U) {
                    return !Stores.contains(U);
                  }));
        // ...
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
    return false;
  }
  // ...
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // ...
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        // ...
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
    // ...
  }
  R.transformNodes();
  R.buildExternalUses();
  // ...
  R.computeMinimumValueSizes();
  // ...
  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    // ...
  // ...
    using namespace ore;
    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        // ...
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));
  // ...
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        // ...
      });
  // ...
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned P = First ? Val.first : Val.second;
        // ...
        return V + (P - Mean) * (P - Mean);
      });
  // ...
  return Dev * 96 / (Mean * Mean) == 0;
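
// Illustrative aside, not from the original source: the uniformity test above
// in plain form. The candidate sizes count as "flat" when the variance is
// tiny relative to the squared mean, i.e. the integer expression
// Dev * 96 / (Mean * Mean) evaluates to 0.
#if 0
#include <cstdint>
#include <vector>

inline bool sizesAreUniform(const std::vector<uint64_t> &Sizes) {
  if (Sizes.empty())
    return true;
  uint64_t Sum = 0;
  for (uint64_t S : Sizes)
    Sum += S;
  uint64_t Mean = Sum / Sizes.size();
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (uint64_t S : Sizes) {
    int64_t D = int64_t(S) - int64_t(Mean); // avoid unsigned underflow
    Dev += uint64_t(D * D);
  }
  Dev /= Sizes.size();
  // Example: sizes {4, 4, 4, 5} give Mean = 4 and Dev = 0 in integer math,
  // so the set is considered uniform.
  return Dev * 96 / (Mean * Mean) == 0;
}
#endif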
class RelatedStoreInsts {
public:
  // ...
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    // ...
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with pointer distance
  /// \p PtrDist. Returns the previously inserted index if a store at that
  /// distance already exists.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to the base store, returns its distance from it.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    // ...
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        // ...
  }

  /// Recompute the pointer distances relative to \p NewBaseInstIdx; stores
  /// with an index below \p MinSafeIdx are dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);
    // ...
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  // ...
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });
    // ...
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  // ...

private:
  /// The index of the base store.
  unsigned BaseInstrIdx;
  // ...
  /// The stores related to the base store, keyed by pointer distance.
  DistToInstMap Instrs;
  // ...
};
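
// Illustrative aside, not from the original source: the core of
// RelatedStoreInsts in a hypothetical simplified form. Stores are keyed by
// their distance from a base store, so any key-sorted run of consecutive
// distances is a candidate for one vector store.
#if 0
#include <cstdint>
#include <map>
#include <optional>

class StoreGroup {
  std::map<int64_t, unsigned> DistToIdx; // distance -> store index

public:
  // Returns the previously recorded store at this distance, if any.
  std::optional<unsigned> insertOrLookup(unsigned Idx, int64_t Dist) {
    auto [It, Inserted] = DistToIdx.emplace(Dist, Idx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }
};
// Distances {0, 1, 2, 3} (in store-size units) form one contiguous chain;
// a duplicate distance means two stores write the same address.
#endif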
bool SLPVectorizerPass::vectorizeStores(
    // ...
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
      // ...
  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    // ...
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        // ...
        if (Idx != StoreSeq.size() - 1)
          // ...
      }
      // ...
      if (Operands.size() <= 1 ||
          // ...
              .insert({Operands.front(),
                       cast<StoreInst>(Operands.front())->getValueOperand(),
                       // ...
                       cast<StoreInst>(Operands.back())->getValueOperand(),
                       // ...
      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      // ...
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      // ...
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      // ...
        ValueTy = Trunc->getSrcTy();
      // ...
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          // ...
      MinVF = std::max<unsigned>(2, MinVF);
      // ...
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          // ...
                          << "MinVF (" << MinVF << ")\n");
        // ...
      }
      // ...
      unsigned NonPowerOf2VF = 0;
      // ...
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        // ...
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        // ...
      unsigned MaxRegVF = MaxVF;
      // ...
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          // ...
                          << "MinVF (" << MinVF << ")\n");
        // ...
      }
      // ...
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           // ...
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        // ...
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      // ...
      bool RepeatChanged = false;
      bool AnyProfitableGraph = false;
23210 for (
unsigned VF : CandidateVFs) {
23211 AnyProfitableGraph =
false;
23212 unsigned FirstUnvecStore =
23213 std::distance(RangeSizes.begin(),
23214 find_if(RangeSizes, std::bind(IsNotVectorized,
23215 VF >= MaxRegVF, _1)));
23219 while (FirstUnvecStore < End) {
23220 unsigned FirstVecStore = std::distance(
23221 RangeSizes.begin(),
23222 find_if(RangeSizes.drop_front(FirstUnvecStore),
23223 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23224 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23225 for (
unsigned SliceStartIdx = FirstUnvecStore;
23226 SliceStartIdx + VF <= MaxSliceEnd;) {
23237 ->getValueOperand()
23240 ->getValueOperand()
23243 "Expected all operands of same type.");
23244 if (!NonSchedulable.
empty()) {
23245 auto [NonSchedSizeMax, NonSchedSizeMin] =
23247 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23250 SliceStartIdx += NonSchedSizeMax;
23255 std::optional<bool> Res =
23256 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23262 .first->getSecond()
23270 AnyProfitableGraph = RepeatChanged =
Changed =
true;
23273 for (std::pair<unsigned, unsigned> &
P :
23274 RangeSizes.slice(SliceStartIdx, VF))
23275 P.first =
P.second = 0;
23276 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23277 for (std::pair<unsigned, unsigned> &
P : RangeSizes.slice(
23278 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23279 P.first =
P.second = 0;
23280 FirstUnvecStore = SliceStartIdx + VF;
23282 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23283 for (std::pair<unsigned, unsigned> &
P :
23284 RangeSizes.slice(SliceStartIdx + VF,
23285 MaxSliceEnd - (SliceStartIdx + VF)))
23286 P.first =
P.second = 0;
23287 if (MaxSliceEnd == End)
23288 End = SliceStartIdx;
23289 MaxSliceEnd = SliceStartIdx;
23291 SliceStartIdx += VF;
23294 if (VF > 2 && Res &&
23295 !
all_of(RangeSizes.slice(SliceStartIdx, VF),
23296 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23298 SliceStartIdx += VF;
23303 if (VF > MaxRegVF && TreeSize > 1 &&
23304 all_of(RangeSizes.slice(SliceStartIdx, VF),
23305 std::bind(FirstSizeSame, TreeSize, _1))) {
23306 SliceStartIdx += VF;
23307 while (SliceStartIdx != MaxSliceEnd &&
23308 RangeSizes[SliceStartIdx].first == TreeSize)
23312 if (TreeSize > 1) {
23313 for (std::pair<unsigned, unsigned> &
P :
23314 RangeSizes.slice(SliceStartIdx, VF)) {
23315 if (VF >= MaxRegVF)
23316 P.second = std::max(
P.second, TreeSize);
23318 P.first = std::max(
P.first, TreeSize);
23322 AnyProfitableGraph =
true;
23324 if (FirstUnvecStore >= End)
23326 if (MaxSliceEnd - FirstUnvecStore < VF &&
23327 MaxSliceEnd - FirstUnvecStore >= MinVF)
23328 AnyProfitableGraph =
true;
23329 FirstUnvecStore = std::distance(
23330 RangeSizes.begin(),
23331 find_if(RangeSizes.drop_front(MaxSliceEnd),
23332 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23334 if (!AnyProfitableGraph && VF >= MaxRegVF &&
has_single_bit(VF))
23338 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
23339 return P.first == 0 &&
P.second == 0;
23343 if (Repeat >= MaxAttempts ||
23344 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23346 constexpr unsigned StoresLimit = 64;
23347 const unsigned MaxTotalNum = std::min<unsigned>(
23349 static_cast<unsigned>(
23352 RangeSizes.begin(),
23353 find_if(RangeSizes, std::bind(IsNotVectorized,
true, _1))) +
23355 unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
23358 CandidateVFs.clear();
23360 CandidateVFs.push_back(Limit);
23361 if (VF > MaxTotalNum || VF >= StoresLimit)
23363 for (std::pair<unsigned, unsigned> &
P : RangeSizes) {
23365 P.first = std::max(
P.second,
P.first);
23369 CandidateVFs.push_back(VF);
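      // Illustrative walk of the candidate-VF schedule (numbers are examples,
      // not from the source): if the first candidate VF is 6, the next round
      // tries bit_ceil(6) * 2 = 16, and the search stops once VF exceeds the
      // number of still-unvectorized stores or the StoresLimit of 64.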
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });
    if (RelatedStores == SortedStores.end()) {
      // ... (start a new sequence for this store)
      return;
    }
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(*PrevInst + 1, /* ... */);
    }
  };

  Type *PrevValTy = nullptr;
  for (/* ... each candidate store SI ... */) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }
  // ...
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // ...
  for (Instruction &I : *BB) {
    // ... (store seeds)
    if (!SI->isSimple())
      continue;
    // ... (GEP seeds)
    if (GEP->getNumIndices() != 1)
      continue;
    Value *Idx = GEP->idx_begin()->get();
    // ...
    if (GEP->getType()->isVectorTy())
      continue;
    // ...
  }
}

bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (/* ... the type is unsupported ... */) {
      // ...
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        OS << *Ty;
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(/* ... */);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool CandidateFound = false;
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       /* VF shrinks each round */) {
    // ...
    if (TTI->getNumberOfParts(VecTy) == VF)
      // ...
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;
      // ... (collect the operations, skipping deleted instructions:
      //      !Inst || !R.isDeleted(Inst))
      for (Value *V : VL.drop_front(I)) {
        // ...
        if (Idx == ActualVF)
          break;
      }
      if (Idx != ActualVF)
        break;
      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        /* ... */);
      // ...
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        // ...
      }
      R.transformNodes();
      R.buildExternalUses();
      // ...
      R.computeMinimumValueSizes();
      // ...
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);
      LLVM_DEBUG(dbgs() << /* ... */ << " for VF=" << ActualVF << "\n");
      if (/* ... cost is below the threshold ... */) {
        // ...
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            /* ... */)
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
        // ...
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << /* ... threshold ... */;
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  // ...
}
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  // ...
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  // ...
  bool IsSupportedHorRdxIdentityOp = false;
  static bool isCmpSelMinMax(Instruction *I) {
    // ...
  }

  static bool isBoolLogicOp(Instruction *I) {
    // ...
  }

  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;
    // ...
    if (TwoElementReduction)
      return true;
    // ...
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // ...
      return I->getFastMathFlags().noNaNs();
    }
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;
    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    if (/* ... select-based bool-logic op ... */)
      return I->getOperand(2);
    return I->getOperand(Index);
  }
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      // ... (select form when UseSelect, plain 'or' otherwise)
    }
    case RecurKind::And: {
      // ... (select form when UseSelect, plain 'and' otherwise)
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // ... (plain binary operator)
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      // ... (integer min/max)
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      // ... (FP min/max intrinsics)
    }
    // ...
    }
  }

  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // ...
                     (ReductionOps.size() == 1 && /* ... */);
    assert((!UseSelect || ReductionOps.size() != 2 || /* ... */) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    // ... (propagate IR flags and return Op)
  }
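  // The select forms used for bool-logic reductions rely on these standard
  // IR equivalences, which keep poison from leaking through the reduction:
  //   or  i1 %a, %b   <=>  select i1 %a, i1 true, i1 %b
  //   and i1 %a, %b   <=>  select i1 %a, i1 %b,   i1 false
  // When %a already decides the result, poison in %b cannot propagate
  // through the select, unlike through the plain binary operator.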
  static RecurKind getRdxKind(Instruction *I) {
    if (/* ... no recognizable pattern ... */)
      return RecurKind::None;
    if (/* ... integer add ... */)
      return RecurKind::Add;
    if (/* ... integer mul ... */)
      return RecurKind::Mul;
    if (/* ... and ... */)
      return RecurKind::And;
    if (/* ... or ... */)
      return RecurKind::Or;
    if (/* ... xor ... */)
      return RecurKind::Xor;
    if (/* ... fadd ... */)
      return RecurKind::FAdd;
    if (/* ... fmul ... */)
      return RecurKind::FMul;
    if (/* ... maxnum ... */)
      return RecurKind::FMax;
    if (/* ... minnum ... */)
      return RecurKind::FMin;
    if (/* ... maximum ... */)
      return RecurKind::FMaximum;
    if (/* ... minimum ... */)
      return RecurKind::FMinimum;
    // ...
    if (/* ... smax intrinsic ... */)
      return RecurKind::SMax;
    if (/* ... smin intrinsic ... */)
      return RecurKind::SMin;
    if (/* ... umax intrinsic ... */)
      return RecurKind::UMax;
    if (/* ... umin intrinsic ... */)
      return RecurKind::UMin;
    // Select-form min/max matching, with early-outs when the compare does
    // not feed the select in a recognizable way:
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... */)
      return RecurKind::None;
    if (/* ... signed greater-than select ... */)
      return RecurKind::SMax;
    if (/* ... signed less-than select ... */)
      return RecurKind::SMin;
    if (/* ... unsigned greater-than select ... */)
      return RecurKind::UMax;
    if (/* ... unsigned less-than select ... */)
      return RecurKind::UMin;
    return RecurKind::None;
  }
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      // ...
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // ...
      return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
    }
    return I->hasNUses(2);
  }

  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      // ... (also record the select's compare in ReductionOps[0])
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }
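  // Layout reminder for the helpers above: min/max reductions expressed as
  // cmp+select keep two lists -- ReductionOps[0] for the compares and
  // ReductionOps[1] for the selects -- while plain binop reductions use a
  // single list, matching what initReductionOps set up.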
  // ...
  int Sz = Data.size();
  // ...

  HorizontalReduction(Instruction *I /* ... */)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    // ...
    ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // ...
    assert(ReductionRoot && "Reduction root is not set!");
    // ...
    return Ops.size() == 2;
  }
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    if (!Sel->getCondition()->hasOneUse())
      return false;
    // ...
    ReductionRoot = Root;
    // ...
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        // ...
        if (/* ... */ ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) && /* ... */)) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // ...
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    // ...
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      // ...
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        for (LoadInst *RLI : LIt->second) {
          // ...
        }
        if (LIt->second.size() > 2) {
          // ...
          hash_value(LIt->second.back()->getPointerOperand());
          // ...
        }
      }
      // ...
          .first->second.push_back(LI);
      // ...
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      // ...
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // ...
      for (Value *V : PossibleRedVals) {
        // ...
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // ...
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      // ...
      for (auto &Slice : PossibleRedVals) {
        // ...
        auto RedValsVect = Slice.second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      for (/* ... each sorted group Data ... */) {
        if (/* ... */) {
          // ...
        } else if (!isGoodForReduction(Data)) {
          // ...
          if (!LI || !LastLI || /* ... */)
            // ...
        }
        ReducedVals.back().append(Data.rbegin(), Data.rend());
      }
    }
    stable_sort(ReducedVals, [](/* ... */ P1, /* ... */ P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
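  // Shape of the traversal above (illustrative IR):
  //   %s0 = add i32 %a, %b
  //   %s1 = add i32 %s0, %c
  //   %s2 = add i32 %s1, %d     ; reduction root
  // The worklist walks %s2 -> %s1 -> %s0, records each add as a reduction
  // op, and collects {%a, %b, %c, %d} as reduced values; the nested maps
  // then group repeated or similarly-shaped leaves so the largest groups
  // are tried first.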
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI,
                     AssumptionCache *AC, DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // ...
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](/* ... */) {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit && /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          // ...
      return nullptr;
    }
    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(), /* ... */);
    // ...
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // ...
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(/* ... */ &&
             "Expected min/max reduction to have select root instruction");
      assert(/* ... */ &&
             "Expected min/max reduction to have compare condition");
      // ...
    };
    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // ...
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() &&
               It1 == ReducedValsToOps.end()) ||
              // ...
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               })) ||
              // ...
              (It1 != ReducedValsToOps.end() &&
               any_of(It1->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
               })))
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
        }
        // ...
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // ...
    };
    // ...
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        // ...
        IgnoreList.insert(RdxOp);
      }
    // ...
    FastMathFlags RdxFMF;
    // ...
    for (Value *U : IgnoreList)
      // ...
      RdxFMF &= FPMO->getFastMathFlags();
    // ...
    for (Value *V : Candidates)
      TrackedVals.try_emplace(V, V);
    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };
    // ...
    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // ...
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // ...
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      InstructionsState S = States[I];
      // ...
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // ...
        if (/* ... */ ||
            (/* ... */ && (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            /* ... */) {
          // ...
        }
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // ...
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        // ...
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // ...
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        // ...
        Candidates.swap(CommonCandidates);
        ShuffledExtracts = true;
        // ...
      }
      // ... (all-constant candidates are folded directly)
      {
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
      }
      for (/* ... each remaining constant VC ... */) {
        Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
        Value *OrigV = TrackedToOrig.at(VC);
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
      }
      if (auto *ResI = dyn_cast<Instruction>(Res))
        V.analyzedReductionRoot(ResI);
      VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
      // ...
      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // ...
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // ...
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(/* ... */,
                   [&SameValuesCounter](
                       const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // ...
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }
      // ...
      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          /* ... */, RegMaxNumber * RedValsMaxNumber);
      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        // ...
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          // ...
        }
        // ...
        if (NumParts > NumRegs / 2)
          // ...
        return ReduxWidth;
      };
      // ...
      ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);
      // ...
      unsigned Start = 0;
      unsigned Pos = Start;
      // ...
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal &&
            PrevReduxWidth == ReduxWidth) {
          // ...
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        // ...
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        // ...
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
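      // Window-management sketch (illustrative numbers): with seven candidate
      // values and ReduxWidth 4 the windows at positions 0..3 are tried
      // first; on failure AdjustReducedVals advances the window and, once the
      // row is exhausted, GetVectorFactor shrinks the width before another
      // pass, so the search degrades gracefully toward ReductionLimit.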
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // ...
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            /* ... */) {
          CheckForReusedReductionOps = true;
          // ...
        }
        PrevReduxWidth = ReduxWidth;
        // ...
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            // ...
            IgnoredCandidates.contains(
                std::make_pair(Pos, bit_floor(ReduxWidth))) ||
            IgnoredCandidates.contains(
                std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                               /* ... */)) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // ... (skip slices that contain deleted values:
        //      return RedValI && V.isDeleted(RedValI);)
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // ...
        //     VL.front()->getType()->isIntOrIntVectorTy() ||
        //     ReductionLimit > 2);
        // ... (collect externally used values)
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            // ...
            LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // ...
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        // ...
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // ...
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // ...
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);
        // ...
        V.computeMinimumValueSizes();
        // ...
        InstructionCost Cost = getReductionCost(TTI, VL, IsCmpSelMinMax,
                                                RdxFMF, V, DT, DL, TLI);
        LLVM_DEBUG(dbgs() << /* ... */ << " for reduction\n");
        // ...
        if (/* ... cost is not profitable ... */) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << /* ... */;
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // ... (mark smaller full-register widths as ignored)
              for (unsigned VF = /* ... */ (*TTI, VL.front()->getType(),
                                            ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = /* ... */ (*TTI, VL.front()->getType(), VF - 1)) {
                // ...
                if (V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                // ...
                IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        // ...
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
        // ...
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // ...
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }
        // ...
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
        // ...
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }
        // ...
        Type *ScalarTy = VL.front()->getType();
        // ... (record the vector value together with its scale and
        //      signedness:
        //        OptReusedScalars && SameScaleFactor
        //            ? SameValuesCounter.front().second
        //            : 1,
        //        ... ? V.isSignedMinBitwidthRootNode() : ...)
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        // ...
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (!VectorizedTree) {
      if (!CheckForReusedReductionOps) {
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            // ...
      }
      // ...
      return nullptr;
    }
    // ...
    auto FixBoolLogicalOps = [&](Value *&LHS, Value *&RHS,
                                 Instruction *RedOp1, Instruction *RedOp2,
                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    /* ... */))
        // ...
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    /* ... */))
        // ...
      if (LHS != VectorizedTree)
        // ...
    };
    // ...
    auto FinalGen = [&](/* ... list of (instruction, value) pairs ... */,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      // ...
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        // ...
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // ...
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          /* ... */);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    // ...
    SmallPtrSet<Value *, 8> Visited;
    for (Value *RdxVal : Candidates) {
      if (!Visited.insert(RdxVal).second)
        continue;
      unsigned NumOps = VectorizedVals.lookup(RdxVal);
      for (Instruction *RedOp :
           /* ... */)
        // ...
    }
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      // ...
      NewReds = FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;
    // ...
    ReductionRoot->replaceAllUsesWith(VectorizedTree);
    // ...
    SmallPtrSet<Value *, 4> IgnoreSet;
    // ...
    for (auto *U : Ignore->users()) {
      assert(/* ... */ &&
             "All users must be either in the reduction ops list.");
    }
    if (!Ignore->use_empty()) {
      // ...
      Ignore->replaceAllUsesWith(P);
    }
    // ...
    V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    // ...
    return VectorizedTree;
  }
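  // Tail-combining sketch: values that stayed scalar are folded back in
  // pairs, tournament style. For example, leftovers {v0, v1, v2, v3, v4}
  // become {op(v0,v1), op(v2,v3), v4} after one FinalGen round and a single
  // "op.rdx" chain after the next, and that final value replaces
  // ReductionRoot.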
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    Value *Rdx;
    if (/* ... the vector must be reduced lane group by lane group ... */) {
      // ...
      Rdx = createOp(/* ... */, Rdx,
                     emitReduction(Lane, Builder, &TTI, DestTy),
                     /* ... */ I);
    } else {
      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    }
    if (Rdx->getType() != DestTy)
      // ... (cast to DestTy, using IsSigned)
    // ...
    Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    // ...
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    // ...
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      // ...
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        // ...
        if (/* ... */) {
          Cost += GenCostFn();
          continue;
        }
        // ...
        for (User *U : RdxVal->users()) {
          // ...
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // ... (an fadd fed by an fmul may fold into an FMA)
              FMACost -= FMulCost;
              // ...
              ScalarCost += FMACost;
            }
            // ...
          } else {
            ScalarCost = InstructionCost::getInvalid();
          }
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // ...
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // ...
      if (DoesRequireReductionOp) {
        // ...
        unsigned ScalarTyNumElements = VecTy->getNumElements();
        // ... (built over ReducedVals.size() extracts plus the op)
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        if (RType == RedTy) {
          // ... (RdxOpcode, !IsSigned, RedTy, ...)
        }
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        // ...
        if (RdxKind == RecurKind::FAdd) {
          // ...
          for (Value *RdxVal : ReducedVals) {
            // ...
            FMF &= FPCI->getFastMathFlags();
          }
          // ...
          if (!Ops.empty()) {
            // ...
            IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                        {RVecTy, RVecTy, RVecTy}, FMF);
            // ...
            InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                Instruction::FMul, RVecTy, CostKind);
            LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost
                              << "\n");
            FMACost -= FMulCost;
            // ...
            if (FMACost.isValid())
              VectorCost += FMACost;
          }
        }
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (/* ... widening instead of truncating ... */)
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        // ... (cost of one scalar binary op)
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      if (DoesRequireReductionOp) {
        // ...
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        // ...
        IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
        // ...
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (/* ... */)
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        // ...
      });
      break;
    }
    // ...
    }
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << /* ... */
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
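  // Sign convention of the value returned above: VectorCost - ScalarCost,
  // so a negative result means the vector form wins. Illustrative numbers
  // (not measured): eight scalar adds costing 8 against a vector add plus a
  // reduce costing 3 gives -5, which the caller then weighs against the SLP
  // cost threshold.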
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // ...
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // ...
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      // ...
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // ... (i1 add: combine the bool lanes via shuffles)
          LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
          // ...
          std::iota(std::next(Mask.begin(), VF * I),
                    std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          // ...
          break;
        }
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        break;
      }
      case RecurKind::Xor: {
        // ... (an even repeat count cancels out)
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        break;
      }
      case RecurKind::FAdd: {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // ... (idempotent: no rescaling needed)
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        // ... (not supported here)
        break;
      }
      // ...
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      ++NumVectorInstructions;
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          /* ... */) {
        // ...
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
        if (VecResVF < VecVF) {
          // ...
        }
        if (VecResVF != VecVF) {
          // ...
          std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF),
                    0);
          // ...
        }
        // ...
      }
      // ...
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF) {
        // ... (widen the shorter operand first)
      }
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      if (VecResVF != VecVF) {
        // ... (shrink back afterwards)
      }
      VecRes = Op;
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
    return ReducedSubTree;
  }
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");
    // ...
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        /* ... */) {
      // An i1 add reduction is a popcount: bitcast <N x i1> to iN first.
      VectorizedValue = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      // ... (count the set bits)
    }
    ++NumVectorInstructions;
    // ... (emit the generic vector reduction)
  }
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      // ... (an even count XORs to zero)
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // ... (idempotent operations need no scaling)
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      // ... (unsupported for reuse scaling)
    }
    // ...
  }
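  // Identities relied on above, for a value x repeated n times under the
  // reduction operator:
  //   add:  x + x + ... + x  ==  x * n
  //   fadd: x + x + ... + x  ==  x * n.0
  //   xor:  n even -> 0, n odd -> x
  //   and/or/min/max: idempotent, the result stays x
  // Mul-like kinds would need x^n, which is why IsSupportedHorRdxIdentityOp
  // excludes RecurKind::Mul, FMul and FMulAdd.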
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    // ...
    if (VTy->getElementType() != VL.front()->getType()) {
      // ...
      //     R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt,
                                        /*IsSigned=*/false));
      }
      Value *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << /* ... */ "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // ... (no need for multiple and/or operations)
      LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // ... (no need for multiple min/max operations)
      LLVM_DEBUG(dbgs() << /* ... */ << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace lanes whose value repeats an even number of times with zero.
      SmallVector<int> Mask(/* ... */);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        // ...
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          // ...
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(/* ... */ dbgs()
                 << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      Value *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      // ... (unsupported for reuse scaling)
    }
    // ...
  }
25444 unsigned AggregateSize = 1;
25446 Type *CurrentType =
IV->getType();
25449 for (
auto *Elt : ST->elements())
25450 if (Elt != ST->getElementType(0))
25451 return std::nullopt;
25452 AggregateSize *= ST->getNumElements();
25453 CurrentType = ST->getElementType(0);
25455 AggregateSize *= AT->getNumElements();
25456 CurrentType = AT->getElementType();
25458 AggregateSize *= VT->getNumElements();
25459 return AggregateSize;
25461 return AggregateSize;
25463 return std::nullopt;
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    // ...
    std::optional<unsigned> OperandIndex = /* ... */;
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    // ...
    findBuildAggregate_rec(/* ... */, BuildVectorOpds, InsertElts,
                           *OperandIndex, R);
    // ...
    BuildVectorOpds[*OperandIndex] = InsertedOperand;
    InsertElts[*OperandIndex] = LastInsertInst;
    // ...
  } while (LastInsertInst != nullptr && /* ... */);
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert(/* ... */ &&
         "Expected insertelement or insertvalue instruction!");
  assert(/* ... */ && "Expected empty result vectors!");
  // ...
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);
  // ...
  if (BuildVectorOpds.size() >= 2)
    return true;
  // ...
}
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // ...
  auto DominatedReduxValue = [&](Value *R) {
    // ...
  };
  Value *Rdx = nullptr;
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = P->getIncomingValue(1);
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;
  // ...
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = P->getIncomingValue(1);
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;
  return nullptr;
}

// ... (helpers that pick apart a potential reduction root)
assert(/* ... */ &&
       "Expected binop, select, or intrinsic for reduction matching");
Value *LHS =
    Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
Value *RHS =
    Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
// ...
Value *Op0 = nullptr;
Value *Op1 = nullptr;
// ...
Value *B0 = nullptr, *B1 = nullptr;
// ...
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  // ...
  auto SelectRoot = [&]() {
    // ...
  };
  // ...
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  // ...
  auto TryToReduce = [&](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    // ...
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // ...
    }
    // ...
  };
  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // ...
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      // ...
      Stack.emplace(I, Level);
      continue;
    }
    if (R.isDeleted(Inst))
      continue;
    // ...
    if (!TryAppendToPostponedInsts(Inst)) {
      // ...
    }
    // ...
    for (auto *Op : Inst->operand_values())
      if (VisitedInstrs.insert(Op).second)
        if (auto *I = dyn_cast<Instruction>(Op);
            I && !R.isDeleted(I) && I->getParent() == BB)
          Stack.emplace(I, Level);
  }
  // ...
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  // ...
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      /* ... */) {
    // ...
  }
  // ...
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;
  // ...
  if (A && B && B->hasOneUse()) {
    // ...
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      // ...
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      // ...
  }
  if (B && A && A->hasOneUse()) {
    // ...
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      // ...
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      // ...
  }
  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    // ...
    Type *Ty = Inst->getType();
    // ...
    HorizontalReduction HorRdx(/* ... */);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // ...
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(/* ... */) +
        TTI.getInstructionCost(Inst, CostKind);
    // ...
    FastMathFlags FMF;
    // ...
    FMF = FPCI->getFastMathFlags();
    InstructionCost RedCost = TTI.getArithmeticReductionCost(
        Inst->getOpcode(), VecTy, FMF, CostKind);
    // ...
    if (RedCost >= ScalarCost)
      return false;
    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
  // ...
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  // ...
  if (!R.canMapToVector(IVI->getType()))
    return false;
  // ...
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  // ...
  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // ...
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  // ...
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  // ...
  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    /* ... incoming values, a comparator, a compatibility predicate and a */
    /* ... TryToVectorizeHelper callback ... */
    bool MaxVFOnly, BoUpSLP &R) {
  // ...
  for (/* ... */; IncIt != E;) {
    if (!I || R.isDeleted(I)) {
      // ...
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E &&
           (/* ... */ AreCompatible(VL, *SameTypeIt))) {
      // ...
      if (I && !R.isDeleted(I))
        // ...
      ++SameTypeIt;
    }
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // ...
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // ...
      VL.swap(Candidates);
      Candidates.clear();
      // ...
    } else {
      // ...
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        // ...
      }
    }
    // ...
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // ...
      } else if (MaxVFOnly) {
        // ...
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             /* ... */) {
          if (!I || R.isDeleted(I)) {
            // ...
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (/* ... */ AreCompatible(*SameTypeIt, *It))) {
            // ...
            if (I && !R.isDeleted(I))
              // ...
            ++SameTypeIt;
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            // ...
        }
      }
      Candidates.clear();
    }
    // ...
    IncIt = SameTypeIt;
  }
  return Changed;
}
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, /* ... */) {
  // ...
  assert(/* ... */ && "Expected valid element types only.");
  if (/* ... */)
    return IsCompatibility;
  // ...
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  // ...
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    // ...
    if (/* ... */)
      return !IsCompatibility;
    // ...
    if (IsCompatibility) {
      if (I1->getParent() != I2->getParent())
        return false;
    } else {
      // ...
      if (!NodeI1)
        return NodeI2 != nullptr;
      if (!NodeI2)
        return false;
      assert((NodeI1 == NodeI2) ==
                 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeI1 != NodeI2)
        return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
    }
    // ...
    if (S && (IsCompatibility || !S.isAltShuffle()))
      continue;
    if (IsCompatibility)
      return false;
    if (I1->getOpcode() != I2->getOpcode())
      return I1->getOpcode() < I2->getOpcode();
  }
  return IsCompatibility;
}
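// Why one template body serves two callers: with IsCompatibility = true the
// function answers "may these compares share a bundle?" (ties return true),
// and with IsCompatibility = false it is a strict weak ordering for sorting
// (ties return false). Deriving both from the same comparisons keeps the
// sort order and the compatibility test consistent by construction.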
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    // ...
    Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
    if (R.isDeleted(I))
      continue;
    // ...
  }
  // ...
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    // ...
  }
  // ...
  for (Instruction *V : CmpInsts)
    // ...
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // ...
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /* ... */);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(/* ... */ && "This function only accepts Insert instructions");
  bool OpsChanged = false;
  // ...
  for (auto *I : reverse(Instructions)) {
    // ...
    OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                           /*MaxVFOnly=*/true);
    // ...
    OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                             /*MaxVFOnly=*/true);
    // ...
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    // ...
    OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                           /*MaxVFOnly=*/false);
    // ...
    OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                             /*MaxVFOnly=*/false);
  }
  // ...
  OpsChanged |= tryToVectorize(PostponedInsts, R);
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // ...
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(/* ... */ && "Expected vectorizable types only.");
    // ...
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    // ...
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Instructions come first, ordered by their dominance.
      if (/* ... both entries are instructions I1, I2 ... */) {
        // ...
        if (!NodeI1)
          return NodeI2 != nullptr;
        if (!NodeI2)
          return false;
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        // ...
        if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
          // ...
        }
        // ...
      }
      if (/* ... */) {
        DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(V1->getParent());
        DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(V2->getParent());
        if (!NodeI1)
          return NodeI2 != nullptr;
        if (!NodeI2)
          return false;
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return V1->comesBefore(V2);
      }
      // ...
      if (/* ... both are intrinsic calls with IDs Id1, Id2 ... */)
        return *Id1 < *Id2;
      // ...
      if (I1->getOpcode() == I2->getOpcode())
        continue;
      return I1->getOpcode() < I2->getOpcode();
      // ...
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
        continue;
      if (ValID1 < ValID2)
        return true;
      if (ValID1 > ValID2)
        return false;
      // ...
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&](Value *V1, ArrayRef<Value *> VL) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    // ...
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // ...
      if (R.isDeleted(I1) || R.isDeleted(I2))
        return false;
      if (I1->getParent() != I2->getParent())
        return false;
      // ...
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // ...
    for (Instruction &I : *BB) {
      // ...
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          /* ... */)
        // ...
    }
    if (Incoming.size() <= 1)
      break;
    // ...
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      // ...
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        // ...
        for (Value *V : PHI->incoming_values()) {
          // ... (recurse into PHIs, record opcodes of other inputs)
          Nodes.push_back(PHI1);
          // ...
        }
      }
    }
    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /* ... */);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          // ...
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    // ...
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // ...
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      // ...
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // ...
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // ...
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           /* ... */;
  };
  for (auto It = BB->begin(), E = BB->end(); It != E; ++It) {
    // ...
    if (R.isDeleted(&*It))
      continue;
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // ...
      }
      continue;
    }
    // ...
    if (auto *P = dyn_cast<PHINode>(&*It)) {
      if (P->getNumIncomingValues() == 2) {
        // ...
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          // ...
        }
      }
      // ...
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // ...
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;
        // ...
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          // ...
          if (Res && R.isDeleted(P)) {
            // ...
            break;
          }
        }
      }
      continue;
    }
    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      // ...
      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                            SI->getValueOperand()->hasOneUse();
      // ...
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // ...
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // ...
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // ...
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      // ...
    }
    // ...
    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    // ...
  }
  // ...
  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  for (auto &Entry : GEPs) {
    // ...
    if (Entry.second.size() < 2)
      continue;
    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");
    // ...
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;
    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      // ...
      Candidates.remove_if([&R](Value *I) {
        // ...
      });
      // ...
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (/* ... the pointers differ by a constant ... */) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }
      // ...
      if (Candidates.size() < 2)
        continue;
      // ...
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        // ...
        auto *GEPIdx = GEP->idx_begin()->get();
        // ...
        Bundle[BundleIndex++] = GEPIdx;
      }
      // ...
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
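// Grouping effect of the sorter below (illustrative): stores are ordered by
// value-operand type, pointer type and scalar width before being grouped,
// so a run like "store i32 ..., store i64 ..., store i32 ..." splits into
// one i32 group and one i64 group before vectorizeStores ever sees them.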
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // ...
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // ...
    DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
        DT->getNode(I1->getParent());
    DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
        DT->getNode(I2->getParent());
    assert(NodeI1 && "Should only process reachable instructions");
    assert(NodeI2 && "Should only process reachable instructions");
    assert((NodeI1 == NodeI2) ==
               (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeI1 != NodeI2)
      return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
    return I1->getOpcode() < I2->getOpcode();
    // ...
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };
  // ...
  auto AreCompatibleStores = [this](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    bool SameParent = true;
    // ...
    StoreInst *V2 = VL.back();
    // ...
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    // ...
    for (auto [SI, V] : zip(VL, NewVL))
      V = SI->getValueOperand();
    NewVL.back() = V1->getValueOperand();
    InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
    InstructionsState S = Analysis.buildInstructionsState(/* ... */);
    // ...
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;
    LLVM_DEBUG(dbgs() << /* ... */ << Pair.second.size() << ".\n");
    // ...
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /* ... */);
  }
  return Changed;
}
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instructions.
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of the vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not identity.
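Since the page elides the body, here is a minimal, self-contained sketch of such a predicate; the name isRepeatedNonIdentityClusteredMaskSketch is illustrative, and undef (-1) lanes, which the real helper may tolerate, are not handled here:

#include "llvm/ADT/ArrayRef.h"

// Illustrative sketch only: Mask must decompose into identical clusters of
// size Sz, and the first cluster must not be the identity <0, 1, ..., Sz-1>.
static bool isRepeatedNonIdentityClusteredMaskSketch(llvm::ArrayRef<int> Mask,
                                                     unsigned Sz) {
  if (Sz == 0 || Mask.size() % Sz != 0)
    return false;
  llvm::ArrayRef<int> FirstCluster = Mask.take_front(Sz);
  bool IsIdentity = true;
  for (unsigned I = 0; I != Sz; ++I)
    IsIdentity &= FirstCluster[I] == static_cast<int>(I);
  if (IsIdentity)
    return false; // An identity first cluster is rejected by definition.
  // Every subsequent cluster must repeat the first one exactly.
  for (size_t I = Sz, E = Mask.size(); I != E; I += Sz)
    if (Mask.slice(I, Sz) != FirstCluster)
      return false;
  return true;
}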
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked) load + compress or a (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert a fadd/fsub sequence to FMA.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns the correct remaining number of elements, considering the total amount Size, the (power-of-2) number of elements in a single register PartNumElems, and the current register (part) Part.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instruction is followed by the IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
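A minimal sketch of the scatter semantics this implies, assuming -1 marks a poison lane that leaves the corresponding slot untouched; the Sketch suffix marks it as illustrative rather than the upstream body:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

// Illustrative: the old value Prev[I] is scattered to position Mask[I];
// poison (-1) entries in Mask contribute nothing.
static void reorderReusesSketch(llvm::SmallVectorImpl<int> &Reuses,
                                llvm::ArrayRef<int> Mask) {
  llvm::SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
    if (Mask[I] != -1)
      Reuses[Mask[I]] = Prev[I];
}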
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, that forms a type which splits evenly into whole vector registers for the target.
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instructions.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
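A minimal sketch of the splat test, assuming every non-undef lane must carry the same value; the real check may differ in how it treats undefs, so read this as illustrative:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"

// Illustrative: a bundle is a splat when one non-undef value occupies
// every non-undef lane; an all-undef bundle does not count.
static bool isSplatSketch(llvm::ArrayRef<llvm::Value *> VL) {
  llvm::Value *FirstNonUndef = nullptr;
  for (llvm::Value *V : VL) {
    if (llvm::isa<llvm::UndefValue>(V))
      continue;
    if (!FirstNonUndef)
      FirstNonUndef = V;
    else if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}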
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
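Under the stated contract, one plausible formula is the following; the upstream expression may differ in how it caps the result, so this is a hedged sketch with a hypothetical name:

#include <algorithm>
#include "llvm/Support/MathExtras.h"

// Illustrative: split Size elements into NumParts parts, round the
// per-part count up to a power of two, and never exceed the total.
static unsigned getPartNumElemsSketch(unsigned Size, unsigned NumParts) {
  return std::min<unsigned>(
      Size, llvm::PowerOf2Ceil(llvm::divideCeil(Size, NumParts)));
}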
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
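The statistic reads as: accept when the RMS deviation of the chosen size component stays below 90% of its mean. A self-contained sketch under that reading (checkTreeSizesSketch is a hypothetical name; the upstream code may use integer arithmetic):

#include <cmath>
#include <utility>
#include "llvm/ADT/ArrayRef.h"

// Illustrative: pick the first or second component of each pair, then
// compare the quadratic (RMS) deviation against 90% of the mean.
static bool
checkTreeSizesSketch(llvm::ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                     bool First) {
  if (Sizes.empty())
    return true;
  double Sum = 0.0;
  for (const auto &P : Sizes)
    Sum += First ? P.first : P.second;
  const double Mean = Sum / Sizes.size();
  double Dev = 0.0;
  for (const auto &P : Sizes) {
    const double V = First ? P.first : P.second;
    Dev += (V - Mean) * (V - Mean);
  }
  return std::sqrt(Dev / Sizes.size()) < 0.9 * Mean;
}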
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
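The brief is ambiguous about whether the whole list or each element is repeated; the sketch below assumes whole-list concatenation and should be read as illustrative only:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"

// Illustrative: concatenate VF copies of the constant list Val.
static llvm::SmallVector<llvm::Constant *>
replicateMaskSketch(llvm::ArrayRef<llvm::Constant *> Val, unsigned VF) {
  llvm::SmallVector<llvm::Constant *> Replicated;
  Replicated.reserve(Val.size() * VF);
  for (unsigned I = 0; I != VF; ++I)
    Replicated.append(Val.begin(), Val.end());
  return Replicated;
}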
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, that forms a type which splits evenly into whole vector registers for the target.
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
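A minimal sketch of that scan (hypothetical name; the real helper may assert on mismatches rather than returning null):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instruction.h"

// Illustrative: return the first instruction in the bundle with the
// requested opcode, or nullptr when the bundle contains none.
static llvm::Instruction *
findInstructionWithOpcodeSketch(llvm::ArrayRef<llvm::Value *> VL,
                                unsigned Opcode) {
  for (llvm::Value *V : VL)
    if (auto *I = llvm::dyn_cast<llvm::Instruction>(V))
      if (I->getOpcode() == Opcode)
        return I;
  return nullptr;
}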
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector containing only loads, if that can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and, if not, packs them, building the ReuseShuffleIndices mask.
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
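For a bundle of loads this amounts to taking the minimum alignment over all members; a sketch under that assumption (store bundles would be analogous, and the name is illustrative):

#include <algorithm>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"

// Illustrative: the common alignment of a load bundle is the minimum
// alignment of its members.
static llvm::Align
computeCommonAlignmentSketch(llvm::ArrayRef<llvm::Value *> VL) {
  llvm::Align Common = llvm::cast<llvm::LoadInst>(VL.front())->getAlign();
  for (llvm::Value *V : VL.drop_front())
    Common = std::min(Common, llvm::cast<llvm::LoadInst>(V)->getAlign());
  return Common;
}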
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
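In other words, entries equal to Order.size() are placeholders that must be re-assigned so the result is a proper permutation; one straightforward fixup strategy, sketched with a hypothetical name:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"

// Illustrative: collect the indices not yet used by Order, then hand them
// out to the out-of-bounds placeholder slots in order.
static void fixupOrderingIndicesSketch(llvm::MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  llvm::SmallBitVector Used(Sz);
  for (unsigned Idx : Order)
    if (Idx < Sz)
      Used.set(Idx);
  llvm::SmallVector<unsigned> Unused;
  for (unsigned I = 0; I != Sz; ++I)
    if (!Used.test(I))
      Unused.push_back(I);
  unsigned Next = 0;
  for (unsigned &Idx : Order)
    if (Idx >= Sz)
      Idx = Unused[Next++];
}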
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using the default shuffle sequence.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
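A sketch of the reverse-order test, assuming entries equal to Order.size() act as don't-care placeholders (illustrative name, not the upstream body):

#include "llvm/ADT/ArrayRef.h"

// Illustrative: every defined entry I must map to Size - 1 - I.
static bool isReverseOrderSketch(llvm::ArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I != Sz; ++I)
    if (Order[I] != Sz && Order[I] != Sz - 1 - I)
      return false;
  return !Order.empty();
}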
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle a perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle a perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads by increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is known to be not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen, and returns its signedness if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
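For reference, the identity test is the natural one; a hedged sketch in hypothetical free-function form of the member above:

#include "llvm/ADT/ArrayRef.h"

// Illustrative: a non-empty order is an identity order when every
// position maps to itself.
static bool isIdentityOrderSketch(llvm::ArrayRef<unsigned> Order) {
  for (unsigned I = 0, E = Order.size(); I != E; ++I)
    if (Order[I] != I)
      return false;
  return !Order.empty();
}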
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the root of a profitable vectorizable tree.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
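The m_* entries above form the llvm::PatternMatch DSL: matchers compose into a pattern tree that match() walks against an IR value. A minimal sketch (the helper name is hypothetical):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// True iff V is (X & Y) | (X ^ Y), capturing X and Y as it matches.
static bool isAndOrXorPair(Value *V) {
  Value *X = nullptr, *Y = nullptr;
  return match(V, m_Or(m_And(m_Value(X), m_Value(Y)),
                       m_Xor(m_Specific(X), m_Specific(Y))));
}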
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
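A hedged usage sketch (names invented); zip stops at the end of the shorter range:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

static int dotProduct(ArrayRef<int> A, ArrayRef<int> B) {
  int Sum = 0;
  for (auto [X, Y] : zip(A, B)) // visits (A[0],B[0]), (A[1],B[1]), ...
    Sum += X * Y;
  return Sum;
}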
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
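For example (helper name hypothetical), checking a whole bundle at once:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// True iff every value in VL is a LoadInst.
static bool allLoads(ArrayRef<Value *> VL) {
  // Equivalent using the IsaPred function object from this index:
  //   return all_of(VL, IsaPred<LoadInst>);
  return all_of(VL, [](Value *V) { return isa<LoadInst>(V); });
}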
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
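A small sketch: enumerate pairs each element with its index, so no manual counter is needed (PoisonMaskElem, listed later in this index, is -1):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Returns the first poison lane of a shuffle mask, or -1 if none.
static int firstPoisonLane(ArrayRef<int> Mask) {
  for (auto [Idx, M] : enumerate(Mask))
    if (M == -1) // PoisonMaskElem
      return static_cast<int>(Idx);
  return -1;
}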
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>'s and is nicely complemented with set_subtract.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
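This is the standard erase-while-iterating idiom; a sketch pairing it with isInstructionTriviallyDead from this index:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void eraseTriviallyDeadInsts(BasicBlock &BB) {
  // The iterator advances before the body runs, so erasing I is safe.
  for (Instruction &I : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}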
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
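Both this and bit_ceil round up to a power of two, while bit_floor rounds down; all are constexpr, so the arithmetic can be checked at compile time:

#include "llvm/ADT/bit.h"

static_assert(llvm::bit_ceil(5u) == 8u, "smallest power of two >= 5");
static_assert(llvm::bit_floor(5u) == 4u, "largest power of two <= 5");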
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction has no side effects.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
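A hedged sketch of the mask shape: createStrideMask(Start, Stride, VF) selects Start, Start + Stride, ... for VF lanes (the wrapper function is invented):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static SmallVector<int, 16> evenLaneMask() {
  // {0, 2, 4, 6}: selects the even lanes of an interleaved 8-wide vector.
  return createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
}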
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
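inversePermutation carries no brief here; a minimal sketch of the relation it computes (Mask[Indices[I]] == I), not necessarily the exact in-tree code:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// E.g. Indices = {2, 0, 1} produces Mask = {1, 2, 0}.
static void inversePermutationSketch(ArrayRef<unsigned> Indices,
                                     SmallVectorImpl<int> &Mask) {
  Mask.assign(Indices.size(), /*Elt=*/-1);
  for (unsigned I = 0, E = Indices.size(); I != E; ++I)
    Mask[Indices[I]] = I;
}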
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
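For example, ReplicationFactor = 3 with VF = 2 repeats each source lane three times (the wrapper function is invented):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static SmallVector<int, 16> replicatedMask() {
  // Yields {0, 0, 0, 1, 1, 1}.
  return createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
}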
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type, i.e. adding an extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
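Both this and alignDown (above) are constexpr, so the arithmetic can be checked at compile time:

#include "llvm/Support/MathExtras.h"

static_assert(llvm::divideCeil(10u, 4u) == 3u, "three 4-wide parts cover 10");
static_assert(llvm::alignDown(10u, 4u) == 8u, "largest multiple of 4 <= 10");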
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range Range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
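For example (function name hypothetical):

#include "llvm/ADT/Sequence.h"
using namespace llvm;

// Visits I = 0, 1, ..., End - 1; End itself is excluded.
static unsigned sumBelow(unsigned End) {
  unsigned Sum = 0;
  for (unsigned I : seq<unsigned>(0, End))
    Sum += I;
  return Sum;
}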
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
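A typical use is a DenseMapInfo::getHashValue implementation; the struct here is hypothetical:

#include "llvm/ADT/Hashing.h"
using namespace llvm;

struct Key { unsigned A; unsigned B; };

// Hash both fields into a single hash_code.
static hash_code hashKey(const Key &K) {
  return hash_combine(K.A, K.B);
}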
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx, or on the return type if OpdIdx is -1.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type T.
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair, std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair, std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const